JWS-112 Bumping version of T-Coffee to version 11.00.8cbe486.
[jabaws.git] / binaries / src / tcoffee / t_coffee_source / io_lib / pb_util_read_sequence.c
diff --git a/binaries/src/tcoffee/t_coffee_source/io_lib/pb_util_read_sequence.c b/binaries/src/tcoffee/t_coffee_source/io_lib/pb_util_read_sequence.c
new file mode 100644 (file)
index 0000000..9ac6681
--- /dev/null
@@ -0,0 +1,559 @@
+/******************************COPYRIGHT NOTICE*******************************/
+/*  (c) Centro de Regulacio Genomica                                                        */
+/*  and                                                                                     */
+/*  Cedric Notredame                                                                        */
+/*  12 Aug 2014 - 22:07.                                                                    */
+/*All rights reserved.                                                                      */
+/*This file is part of T-COFFEE.                                                            */
+/*                                                                                          */
+/*    T-COFFEE is free software; you can redistribute it and/or modify                      */
+/*    it under the terms of the GNU General Public License as published by                  */
+/*    the Free Software Foundation; either version 2 of the License, or                     */
+/*    (at your option) any later version.                                                   */
+/*                                                                                          */
+/*    T-COFFEE is distributed in the hope that it will be useful,                           */
+/*    but WITHOUT ANY WARRANTY; without even the implied warranty of                        */
+/*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                         */
+/*    GNU General Public License for more details.                                          */
+/*                                                                                          */
+/*    You should have received a copy of the GNU General Public License                     */
+/*    along with Foobar; if not, write to the Free Software                                 */
+/*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA             */
+/*...............................................                                           */
+/*  If you need some more information                                                       */
+/*  cedric.notredame@europe.com                                                             */
+/*...............................................                                           */
+/******************************COPYRIGHT NOTICE*******************************/
+/********* Sequence input routines for CLUSTAL W *******************/
+/* DES was here.  FEB. 1994 */
+/* Now reads PILEUP/MSF and CLUSTAL alignment files */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "define_header.h"
+
+/*
+*      Prototypes
+*/
+
+extern Boolean linetype(char *,char *);
+extern Boolean blankline(char *);
+extern void warning(char *,...);
+extern void error(char *,...);
+extern char *  rtrim(char *);
+extern char *  blank_to_(char *);
+extern void    getstr(char *,char *);
+
+void fill_chartab(void);
+
+
+static void         get_seq(char *,char *,int *,char *);
+static void get_clustal_seq(char *,char *,int *,char *,int);
+static void     get_msf_seq(char *,char *,int *,char *,int);
+static void check_infile(int *);
+
+
+
+
+static int count_clustal_seqs(void);
+static int count_msf_seqs(void);
+
+/*
+ *     Global variables
+ */
+
+static FILE *fin;
+
+
+char *amino_acid_codes   =    "ABCDEFGHIKLMNPQRSTUVWXYZ-";  /* DES */
+char *nucleic_acid_order =       "ACGTUN";
+static int seqFormat;
+static char chartab[128];
+
+void fill_chartab(void)        /* Create translation and check table */
+{
+       register int i;
+       register int c;
+
+
+       for(i=0;i<128;chartab[i++]=0);
+       for(i=0,c=0;c<=amino_acid_codes[i];i++)
+               chartab[c]=chartab[tolower(c)]=c;
+
+}
+
+static void get_msf_seq(char *sname,char *seq,int *len,char *tit,int seqno)
+/* read the seqno_th. sequence from a PILEUP multiple alignment file */
+{
+       static char *line;
+       int i,j,k;
+       unsigned char c;
+       if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+       fseek(fin,0,0);                 /* start at the beginning */
+
+       *len=0;                         /* initialise length to zero */
+        for(i=0;;i++) {
+               if(fgets(line,MAXLINE+1,fin)==NULL) return; /* read the title*/
+               if(linetype(line,"/") ) break;              /* lines...ignore*/
+       }
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(!blankline(line)) {
+
+                       for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
+                        for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
+                       for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
+                       strncpy(sname,line+j,MIN(MAXNAMES,k-j)); 
+                       sname[MIN(MAXNAMES,k-j)]=EOS;
+                       rtrim(sname);
+                               blank_to_(sname);
+
+                       for(i=k;*len < SEQ_MAX_LEN;i++) {
+                               c=line[i];
+                               if(c == '.') c = '-';
+                               if(c == '*') c = 'X';
+                               if(c == '\n' || c == EOS) break; /* EOL */
+                               if( (c=chartab[c])) seq[++(*len)]=c;
+                       }
+                       if(*len == SEQ_MAX_LEN) return;
+
+                       for(i=0;;i++) {
+                               if(fgets(line,MAXLINE+1,fin)==NULL) return;
+                               if(blankline(line)) break;
+                       }
+               }
+       }
+}
+
+
+static void get_clustal_seq(char *sname,char *seq,int *len,char *tit,int seqno)
+/* read the seqno_th. sequence from a clustal multiple alignment file */
+{
+       static char *line;
+       int i,j;
+       unsigned char c;
+       if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+       fseek(fin,0,0);                 /* start at the beginning */
+
+       *len=0;                         /* initialise length to zero */
+       fgets(line,MAXLINE+1,fin);      /* read the title line...ignore it */
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(!blankline(line)) {
+
+                       for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
+                        for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
+                       strncpy(sname,line+j,MAXNAMES-j); /* remember entryname */
+                       sname[MAXNAMES]=EOS;
+                       rtrim(sname);
+                               blank_to_(sname);
+
+                       for(i=14;*len < SEQ_MAX_LEN;i++) {
+                               c=line[i];
+                               if(c == '\n' || c == EOS) break; /* EOL */
+                               if( (c=chartab[c])) seq[++(*len)]=c;
+                       }
+                       if(*len == SEQ_MAX_LEN) return;
+
+                       for(i=0;;i++) {
+                               if(fgets(line,MAXLINE+1,fin)==NULL) return;
+                               if(blankline(line)) break;
+                       }
+               }
+       }
+}
+
+
+static void get_seq(char *sname,char *seq,int *len,char *tit)
+{
+       static char *line;
+       int i, offset, c=0;
+
+       if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+       switch(seqFormat) {
+
+/************************************/
+               case EMBLSWISS:
+                       while( !linetype(line,"ID") )
+                               fgets(line,MAXLINE+1,fin);
+                       
+                        for(i=5;i<=strlen(line);i++)  /* DES */
+                               if(line[i] != ' ') break;
+                       strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+                       sname[MAXNAMES]=EOS;
+                       rtrim(sname);
+                        blank_to_(sname);
+                       fprintf ( stderr, "\n%s", sname);
+                       while( !linetype(line,"SQ") )
+                               fgets(line,MAXLINE+1,fin);
+                       
+                       *len=0;
+                       while(fgets(line,MAXLINE+1,fin)) {
+                               for(i=0;*len < SEQ_MAX_LEN;i++) {
+                                       c=line[i];
+                               if(c == '\n' || c == EOS || c == '/')
+                                       break;                  /* EOL */
+                               if( (c=chartab[c]))
+                                       seq[++(*len)]=c;
+                               }
+                       if(*len == SEQ_MAX_LEN || c == '/') break;
+                       }
+               break;
+               
+/************************************/
+               case PIR:
+                       while(*line != '>')
+                               {
+                               
+                               fgets(line,MAXLINE+1,fin);
+                                }                      
+                        for(i=4;i<=strlen(line);i++)  /* DES */
+                               if(line[i] != ' ') break;
+                       strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+                       sname[MAXNAMES]=EOS;
+                       rtrim(sname);
+                        blank_to_(sname);
+
+                       fgets(line,MAXLINE+1,fin);
+                       strncpy(tit,line,MAXTITLES);
+                       tit[MAXTITLES]=EOS;
+                       i=strlen(tit);
+                       if(tit[i-1]=='\n') tit[i-1]=EOS;
+                       
+                       *len=0;
+                       while(fgets(line,MAXLINE+1,fin)) {
+                               
+                               for(i=0;*len < SEQ_MAX_LEN;i++) 
+                                  {
+                                  c=line[i];
+                                  if(c == '\n' || c == EOS || c == '*')
+                                       break;                  /* EOL */
+                       
+                                  if( (c=chartab[c]))
+                                       seq[++(*len)]=c;
+
+                                  }
+                       if(*len == SEQ_MAX_LEN || c == '*') break;
+                       }
+               break;
+/***********************************************/
+       case (PEARSON):
+                       
+                       while(*line != '>')
+                               {
+                               fgets(line,MAXLINE+1,fin);
+                               }
+                        for(i=1;i<=strlen(line);i++)  /* DES */
+                               if(line[i] != ' ') break;
+                       strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+                       sname[MAXNAMES]=EOS;
+                       rtrim(sname);
+                        blank_to_(sname);
+
+                       *tit=EOS;
+                       
+                       *len=0;
+                       while(fgets(line,MAXLINE+1,fin)) 
+                               {
+                               for(i=0;*len < SEQ_MAX_LEN;i++) 
+                                       {
+                                       c=line[i];
+                                       if(c == '\n' || c == EOS || c == '>')
+                                               break;                  /* EOL */
+                                               
+                                       if( (c=chartab[c]))
+                                               {seq[++(*len)]=c;
+                                                }
+                                       }
+                               if(*len == SEQ_MAX_LEN || c == '>') break;
+                               }
+                       break;
+/**********************************************/
+               case GDE:
+               
+                       while(*line != '#' ||*line != '%' )
+                                       fgets(line,MAXLINE+1,fin);
+                       
+                       
+                       
+                       
+                       for (i=1;i<=MAXNAMES;i++) {
+                               if (line[i] == '(' || line[i] == '\n')
+                                  {
+                                    i--;
+                                    break;
+                                  }
+                               sname[i-1] = line[i];
+                       }
+                       sname[i]=EOS;
+                       offset=0;
+                       if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset);
+                       else offset = 0;
+                       for(i=MAXNAMES-1;i > 0;i--) 
+                               if(isspace(sname[i])) {
+                                       sname[i]=EOS;   
+                                       break;
+                               }               
+                        blank_to_(sname);
+
+
+                       *tit=EOS;
+                       
+                       *len=0;
+                       for (i=0;i<offset;i++) seq[++(*len)] = '-';
+                       while(fgets(line,MAXLINE+1,fin)) {
+                       if(*line == '%' || *line == '#' || *line == '"') break;
+                               for(i=0;*len < SEQ_MAX_LEN;i++) {
+                                       c=line[i];
+                               if(c == '\n' || c == EOS) 
+                                       break;                  /* EOL */
+                       
+                               if( (c=chartab[c]))
+                                       seq[++(*len)]=c;
+                               }
+                       if(*len == SEQ_MAX_LEN) break;
+                       }
+               break;
+/***********************************************/
+       }
+       
+       if(*len == SEQ_MAX_LEN)
+               warning("Sequence %s truncated to %d residues",
+                               sname,(pint)SEQ_MAX_LEN);
+                               
+       seq[*len+1]=EOS;
+}
+
+
+int readseqs(char *saga_file,char ***SAGA_SEQ, char*** SAGA_NAMES, int ***SAGA_LEN) /*first_seq is the #no. of the first seq. to read */
+{
+       
+       static char *line;
+       static char *seq1;
+       static char *sname1;
+       static char *title;
+       int i,j,no_seqs;
+       static int l1;
+       int a;
+       int b, l;
+       int first_seq=0;
+       
+       if ( !line) line=(char*)vcalloc   ( (FILENAMELEN+1), sizeof (char));
+       if ( !seq1) seq1=(char*)vcalloc   ( (SEQ_MAX_LEN+2), sizeof (char));
+       if ( !sname1)sname1=(char*)vcalloc ( (MAXNAMES+1), sizeof (char));
+       if ( !title)title=(char*)vcalloc  ( (MAXTITLES+1), sizeof (char));
+       
+       fill_chartab();
+       
+       fin=vfopen(saga_file,"r");
+       
+       
+       no_seqs=0;
+       check_infile(&no_seqs);
+
+       if(no_seqs == 0)
+               return 0;       /* return the number of seqs. (zero here)*/
+       
+       SAGA_SEQ[0]=(char**) vcalloc ( no_seqs, sizeof ( char*));
+       SAGA_NAMES[0]=(char**) vcalloc ( no_seqs, sizeof ( char*));
+       SAGA_LEN[0]= declare_int ( no_seqs,3);
+       
+       
+       for(i=first_seq;i<=first_seq+no_seqs-1;i++) 
+               {    /* get the seqs now*/
+               if(seqFormat == CLUSTAL) 
+                       get_clustal_seq(sname1,seq1,&l1,title,i-first_seq+1);
+               if(seqFormat == MSF)
+                           get_msf_seq(sname1,seq1,&l1,title,i-first_seq+1);
+               else
+                       get_seq(sname1,seq1,&l1,title);
+               
+               if(l1 > SEQ_MAX_LEN) 
+                       {
+                       error("Sequence too long. Maximum is %d",(pint)SEQ_MAX_LEN);
+                       return 0;       /* also return zero if too many */
+                       }
+               
+               
+               
+               for ( a=0; a<l1; a++)
+                       seq1[a]=seq1[a+1];      
+               seq1[l1]='\0';
+               
+               (SAGA_SEQ[0])[i]=(char*)vcalloc ( strlen ( seq1)+1, sizeof ( char));
+               (SAGA_NAMES[0])[i]=(char*)vcalloc ( strlen ( sname1)+1, sizeof ( char));
+               
+               l1=strlen (seq1);
+               (SAGA_LEN[0])[i][0]=l1;
+               (SAGA_SEQ[0])[i][l1]='\0';
+               
+               sprintf ( (SAGA_SEQ[0])[i], "%s", seq1);
+               sprintf ( (SAGA_NAMES[0])[i], "%s", sname1);
+               
+               l=strlen (SAGA_NAMES[0][i]);
+               for ( b=0; b< l; b++)
+                 {
+                   if ( isspace(SAGA_NAMES[0][i][b])){SAGA_NAMES[0][i][b]='\0';break;}
+                 }
+                         
+               
+       }
+
+       fclose(fin);
+/*
+   JULIE
+   check sequence names are all different - otherwise phylip tree is 
+   confused.
+*/
+       for(i=first_seq;i<=first_seq+no_seqs-1;i++) {
+               for(j=i+1;j<=first_seq+no_seqs-1;j++) {
+                       if (strncmp((SAGA_NAMES[0])[i],(SAGA_NAMES[0])[j],MAXNAMES) == 0) {
+                               error("Multiple sequences found with same name (first %d chars are significant)", MAXNAMES);
+                               return -1;
+                       }
+               }
+       }
+                       
+       return no_seqs;    /* return the number of seqs. read in this call */
+}
+
+
+
+
+static void check_infile(int *nseqs)
+{
+       static char *line;
+       int i;  
+
+       if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+       
+       for ( i=0; i<=MAXLINE; i++)line[i]='a';
+       *nseqs=0;
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if ((*line != '\n') && (*line != ' ') && (*line != '\t'))
+                       break;
+       }
+        
+       for(i=0;i<=6;i++) line[i] = toupper(line[i]);
+
+       if( linetype(line,"ID") ) {                                     /* EMBL/Swiss-Prot format ? */
+               seqFormat=EMBLSWISS;
+               (*nseqs)++;
+       }
+        else if( linetype(line,"CLUSTAL") ) {
+               seqFormat=CLUSTAL;
+       }
+        else if( linetype(line,"PILEUP") ) {
+               seqFormat = MSF;
+       }
+       else if(*line == '>') {                                         /* no */
+               seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
+               (*nseqs)++;
+       }
+       else if((*line == '"') || (*line == '%') || (*line == '#')) {
+               seqFormat=GDE; /* GDE format */
+               if (*line == '%') {
+                        (*nseqs)++;
+                       
+               }
+               else if (*line == '#') {
+                       (*nseqs)++;
+                       
+               }
+       }
+       else {
+               seqFormat=UNKNOWN;
+               return;
+       }
+
+       while(fgets(line,MAXLINE+1,fin) != NULL) {
+               switch(seqFormat) {
+                       case EMBLSWISS:
+                               if( linetype(line,"ID") )
+                                       (*nseqs)++;
+                               break;
+                       case PIR:
+                       case PEARSON:
+                               if( *line == '>' )
+                                       (*nseqs)++;
+                               break;
+                       case GDE:
+                               if(( *line == '%' ) )
+                                       (*nseqs)++;
+                               else if (( *line == '#') )
+                                       (*nseqs)++;
+                               break;
+                       case CLUSTAL:
+                               *nseqs = count_clustal_seqs();
+/* DES */                      /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */
+                               fseek(fin,0,0);
+                               return;
+                               break;
+                       case MSF:
+                               *nseqs = count_msf_seqs();
+                               fseek(fin,0,0);
+                               return;
+                               break;
+                       case USER:
+                       default:
+                               break;
+               }
+       }
+       fseek(fin,0,0);
+}
+
+
+static int count_clustal_seqs(void)
+/* count the number of sequences in a clustal alignment file */
+{
+       static char *line;
+       int  nseqs;
+
+       if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(!blankline(line)) break;             /* Look for next non- */
+       }                                               /* blank line */
+       nseqs = 1;
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(blankline(line)) return nseqs;
+               nseqs++;
+       }
+
+       return 0;       /* if you got to here-funny format/no seqs.*/
+}
+
+static int count_msf_seqs(void)
+{
+/* count the number of sequences in a PILEUP alignment file */
+
+       static char *line;
+       int  nseqs;
+
+       if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(linetype(line,"/")) break;
+       }
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(!blankline(line)) break;             /* Look for next non- */
+       }                                               /* blank line */
+       nseqs = 1;
+
+       while (fgets(line,MAXLINE+1,fin) != NULL) {
+               if(blankline(line)) return nseqs;
+               nseqs++;
+       }
+
+       return 0;       /* if you got to here-funny format/no seqs.*/
+}
+
+
+