--- /dev/null
+/******************************COPYRIGHT NOTICE*******************************/
+/* (c) Centro de Regulacio Genomica */
+/* and */
+/* Cedric Notredame */
+/* 12 Aug 2014 - 22:07. */
+/*All rights reserved. */
+/*This file is part of T-COFFEE. */
+/* */
+/* T-COFFEE is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* T-COFFEE is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with Foobar; if not, write to the Free Software */
+/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+/*............................................... */
+/* If you need some more information */
+/* cedric.notredame@europe.com */
+/*............................................... */
+/******************************COPYRIGHT NOTICE*******************************/
+/********* Sequence input routines for CLUSTAL W *******************/
+/* DES was here. FEB. 1994 */
+/* Now reads PILEUP/MSF and CLUSTAL alignment files */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "define_header.h"
+
+/*
+* Prototypes
+*/
+
+extern Boolean linetype(char *,char *);
+extern Boolean blankline(char *);
+extern void warning(char *,...);
+extern void error(char *,...);
+extern char * rtrim(char *);
+extern char * blank_to_(char *);
+extern void getstr(char *,char *);
+
+void fill_chartab(void);
+
+
+static void get_seq(char *,char *,int *,char *);
+static void get_clustal_seq(char *,char *,int *,char *,int);
+static void get_msf_seq(char *,char *,int *,char *,int);
+static void check_infile(int *);
+
+
+
+
+static int count_clustal_seqs(void);
+static int count_msf_seqs(void);
+
+/*
+ * Global variables
+ */
+
+static FILE *fin;
+
+
+char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */
+char *nucleic_acid_order = "ACGTUN";
+static int seqFormat;
+static char chartab[128];
+
+void fill_chartab(void) /* Create translation and check table */
+{
+ register int i;
+ register int c;
+
+
+ for(i=0;i<128;chartab[i++]=0);
+ for(i=0,c=0;c<=amino_acid_codes[i];i++)
+ chartab[c]=chartab[tolower(c)]=c;
+
+}
+
+static void get_msf_seq(char *sname,char *seq,int *len,char *tit,int seqno)
+/* read the seqno_th. sequence from a PILEUP multiple alignment file */
+{
+ static char *line;
+ int i,j,k;
+ unsigned char c;
+ if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+ fseek(fin,0,0); /* start at the beginning */
+
+ *len=0; /* initialise length to zero */
+ for(i=0;;i++) {
+ if(fgets(line,MAXLINE+1,fin)==NULL) return; /* read the title*/
+ if(linetype(line,"/") ) break; /* lines...ignore*/
+ }
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(!blankline(line)) {
+
+ for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
+ for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
+ for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break;
+ strncpy(sname,line+j,MIN(MAXNAMES,k-j));
+ sname[MIN(MAXNAMES,k-j)]=EOS;
+ rtrim(sname);
+ blank_to_(sname);
+
+ for(i=k;*len < SEQ_MAX_LEN;i++) {
+ c=line[i];
+ if(c == '.') c = '-';
+ if(c == '*') c = 'X';
+ if(c == '\n' || c == EOS) break; /* EOL */
+ if( (c=chartab[c])) seq[++(*len)]=c;
+ }
+ if(*len == SEQ_MAX_LEN) return;
+
+ for(i=0;;i++) {
+ if(fgets(line,MAXLINE+1,fin)==NULL) return;
+ if(blankline(line)) break;
+ }
+ }
+ }
+}
+
+
+static void get_clustal_seq(char *sname,char *seq,int *len,char *tit,int seqno)
+/* read the seqno_th. sequence from a clustal multiple alignment file */
+{
+ static char *line;
+ int i,j;
+ unsigned char c;
+ if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+ fseek(fin,0,0); /* start at the beginning */
+
+ *len=0; /* initialise length to zero */
+ fgets(line,MAXLINE+1,fin); /* read the title line...ignore it */
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(!blankline(line)) {
+
+ for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin);
+ for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break;
+ strncpy(sname,line+j,MAXNAMES-j); /* remember entryname */
+ sname[MAXNAMES]=EOS;
+ rtrim(sname);
+ blank_to_(sname);
+
+ for(i=14;*len < SEQ_MAX_LEN;i++) {
+ c=line[i];
+ if(c == '\n' || c == EOS) break; /* EOL */
+ if( (c=chartab[c])) seq[++(*len)]=c;
+ }
+ if(*len == SEQ_MAX_LEN) return;
+
+ for(i=0;;i++) {
+ if(fgets(line,MAXLINE+1,fin)==NULL) return;
+ if(blankline(line)) break;
+ }
+ }
+ }
+}
+
+
+static void get_seq(char *sname,char *seq,int *len,char *tit)
+{
+ static char *line;
+ int i, offset, c=0;
+
+ if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+ switch(seqFormat) {
+
+/************************************/
+ case EMBLSWISS:
+ while( !linetype(line,"ID") )
+ fgets(line,MAXLINE+1,fin);
+
+ for(i=5;i<=strlen(line);i++) /* DES */
+ if(line[i] != ' ') break;
+ strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+ sname[MAXNAMES]=EOS;
+ rtrim(sname);
+ blank_to_(sname);
+ fprintf ( stderr, "\n%s", sname);
+ while( !linetype(line,"SQ") )
+ fgets(line,MAXLINE+1,fin);
+
+ *len=0;
+ while(fgets(line,MAXLINE+1,fin)) {
+ for(i=0;*len < SEQ_MAX_LEN;i++) {
+ c=line[i];
+ if(c == '\n' || c == EOS || c == '/')
+ break; /* EOL */
+ if( (c=chartab[c]))
+ seq[++(*len)]=c;
+ }
+ if(*len == SEQ_MAX_LEN || c == '/') break;
+ }
+ break;
+
+/************************************/
+ case PIR:
+ while(*line != '>')
+ {
+
+ fgets(line,MAXLINE+1,fin);
+ }
+ for(i=4;i<=strlen(line);i++) /* DES */
+ if(line[i] != ' ') break;
+ strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+ sname[MAXNAMES]=EOS;
+ rtrim(sname);
+ blank_to_(sname);
+
+ fgets(line,MAXLINE+1,fin);
+ strncpy(tit,line,MAXTITLES);
+ tit[MAXTITLES]=EOS;
+ i=strlen(tit);
+ if(tit[i-1]=='\n') tit[i-1]=EOS;
+
+ *len=0;
+ while(fgets(line,MAXLINE+1,fin)) {
+
+ for(i=0;*len < SEQ_MAX_LEN;i++)
+ {
+ c=line[i];
+ if(c == '\n' || c == EOS || c == '*')
+ break; /* EOL */
+
+ if( (c=chartab[c]))
+ seq[++(*len)]=c;
+
+ }
+ if(*len == SEQ_MAX_LEN || c == '*') break;
+ }
+ break;
+/***********************************************/
+ case (PEARSON):
+
+ while(*line != '>')
+ {
+ fgets(line,MAXLINE+1,fin);
+ }
+ for(i=1;i<=strlen(line);i++) /* DES */
+ if(line[i] != ' ') break;
+ strncpy(sname,line+i,MAXNAMES); /* remember entryname */
+ sname[MAXNAMES]=EOS;
+ rtrim(sname);
+ blank_to_(sname);
+
+ *tit=EOS;
+
+ *len=0;
+ while(fgets(line,MAXLINE+1,fin))
+ {
+ for(i=0;*len < SEQ_MAX_LEN;i++)
+ {
+ c=line[i];
+ if(c == '\n' || c == EOS || c == '>')
+ break; /* EOL */
+
+ if( (c=chartab[c]))
+ {seq[++(*len)]=c;
+ }
+ }
+ if(*len == SEQ_MAX_LEN || c == '>') break;
+ }
+ break;
+/**********************************************/
+ case GDE:
+
+ while(*line != '#' ||*line != '%' )
+ fgets(line,MAXLINE+1,fin);
+
+
+
+
+ for (i=1;i<=MAXNAMES;i++) {
+ if (line[i] == '(' || line[i] == '\n')
+ {
+ i--;
+ break;
+ }
+ sname[i-1] = line[i];
+ }
+ sname[i]=EOS;
+ offset=0;
+ if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset);
+ else offset = 0;
+ for(i=MAXNAMES-1;i > 0;i--)
+ if(isspace(sname[i])) {
+ sname[i]=EOS;
+ break;
+ }
+ blank_to_(sname);
+
+
+ *tit=EOS;
+
+ *len=0;
+ for (i=0;i<offset;i++) seq[++(*len)] = '-';
+ while(fgets(line,MAXLINE+1,fin)) {
+ if(*line == '%' || *line == '#' || *line == '"') break;
+ for(i=0;*len < SEQ_MAX_LEN;i++) {
+ c=line[i];
+ if(c == '\n' || c == EOS)
+ break; /* EOL */
+
+ if( (c=chartab[c]))
+ seq[++(*len)]=c;
+ }
+ if(*len == SEQ_MAX_LEN) break;
+ }
+ break;
+/***********************************************/
+ }
+
+ if(*len == SEQ_MAX_LEN)
+ warning("Sequence %s truncated to %d residues",
+ sname,(pint)SEQ_MAX_LEN);
+
+ seq[*len+1]=EOS;
+}
+
+
+int readseqs(char *saga_file,char ***SAGA_SEQ, char*** SAGA_NAMES, int ***SAGA_LEN) /*first_seq is the #no. of the first seq. to read */
+{
+
+ static char *line;
+ static char *seq1;
+ static char *sname1;
+ static char *title;
+ int i,j,no_seqs;
+ static int l1;
+ int a;
+ int b, l;
+ int first_seq=0;
+
+ if ( !line) line=(char*)vcalloc ( (FILENAMELEN+1), sizeof (char));
+ if ( !seq1) seq1=(char*)vcalloc ( (SEQ_MAX_LEN+2), sizeof (char));
+ if ( !sname1)sname1=(char*)vcalloc ( (MAXNAMES+1), sizeof (char));
+ if ( !title)title=(char*)vcalloc ( (MAXTITLES+1), sizeof (char));
+
+ fill_chartab();
+
+ fin=vfopen(saga_file,"r");
+
+
+ no_seqs=0;
+ check_infile(&no_seqs);
+
+ if(no_seqs == 0)
+ return 0; /* return the number of seqs. (zero here)*/
+
+ SAGA_SEQ[0]=(char**) vcalloc ( no_seqs, sizeof ( char*));
+ SAGA_NAMES[0]=(char**) vcalloc ( no_seqs, sizeof ( char*));
+ SAGA_LEN[0]= declare_int ( no_seqs,3);
+
+
+ for(i=first_seq;i<=first_seq+no_seqs-1;i++)
+ { /* get the seqs now*/
+ if(seqFormat == CLUSTAL)
+ get_clustal_seq(sname1,seq1,&l1,title,i-first_seq+1);
+ if(seqFormat == MSF)
+ get_msf_seq(sname1,seq1,&l1,title,i-first_seq+1);
+ else
+ get_seq(sname1,seq1,&l1,title);
+
+ if(l1 > SEQ_MAX_LEN)
+ {
+ error("Sequence too long. Maximum is %d",(pint)SEQ_MAX_LEN);
+ return 0; /* also return zero if too many */
+ }
+
+
+
+ for ( a=0; a<l1; a++)
+ seq1[a]=seq1[a+1];
+ seq1[l1]='\0';
+
+ (SAGA_SEQ[0])[i]=(char*)vcalloc ( strlen ( seq1)+1, sizeof ( char));
+ (SAGA_NAMES[0])[i]=(char*)vcalloc ( strlen ( sname1)+1, sizeof ( char));
+
+ l1=strlen (seq1);
+ (SAGA_LEN[0])[i][0]=l1;
+ (SAGA_SEQ[0])[i][l1]='\0';
+
+ sprintf ( (SAGA_SEQ[0])[i], "%s", seq1);
+ sprintf ( (SAGA_NAMES[0])[i], "%s", sname1);
+
+ l=strlen (SAGA_NAMES[0][i]);
+ for ( b=0; b< l; b++)
+ {
+ if ( isspace(SAGA_NAMES[0][i][b])){SAGA_NAMES[0][i][b]='\0';break;}
+ }
+
+
+ }
+
+ fclose(fin);
+/*
+ JULIE
+ check sequence names are all different - otherwise phylip tree is
+ confused.
+*/
+ for(i=first_seq;i<=first_seq+no_seqs-1;i++) {
+ for(j=i+1;j<=first_seq+no_seqs-1;j++) {
+ if (strncmp((SAGA_NAMES[0])[i],(SAGA_NAMES[0])[j],MAXNAMES) == 0) {
+ error("Multiple sequences found with same name (first %d chars are significant)", MAXNAMES);
+ return -1;
+ }
+ }
+ }
+
+ return no_seqs; /* return the number of seqs. read in this call */
+}
+
+
+
+
+static void check_infile(int *nseqs)
+{
+ static char *line;
+ int i;
+
+ if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+ for ( i=0; i<=MAXLINE; i++)line[i]='a';
+ *nseqs=0;
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if ((*line != '\n') && (*line != ' ') && (*line != '\t'))
+ break;
+ }
+
+ for(i=0;i<=6;i++) line[i] = toupper(line[i]);
+
+ if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */
+ seqFormat=EMBLSWISS;
+ (*nseqs)++;
+ }
+ else if( linetype(line,"CLUSTAL") ) {
+ seqFormat=CLUSTAL;
+ }
+ else if( linetype(line,"PILEUP") ) {
+ seqFormat = MSF;
+ }
+ else if(*line == '>') { /* no */
+ seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
+ (*nseqs)++;
+ }
+ else if((*line == '"') || (*line == '%') || (*line == '#')) {
+ seqFormat=GDE; /* GDE format */
+ if (*line == '%') {
+ (*nseqs)++;
+
+ }
+ else if (*line == '#') {
+ (*nseqs)++;
+
+ }
+ }
+ else {
+ seqFormat=UNKNOWN;
+ return;
+ }
+
+ while(fgets(line,MAXLINE+1,fin) != NULL) {
+ switch(seqFormat) {
+ case EMBLSWISS:
+ if( linetype(line,"ID") )
+ (*nseqs)++;
+ break;
+ case PIR:
+ case PEARSON:
+ if( *line == '>' )
+ (*nseqs)++;
+ break;
+ case GDE:
+ if(( *line == '%' ) )
+ (*nseqs)++;
+ else if (( *line == '#') )
+ (*nseqs)++;
+ break;
+ case CLUSTAL:
+ *nseqs = count_clustal_seqs();
+/* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */
+ fseek(fin,0,0);
+ return;
+ break;
+ case MSF:
+ *nseqs = count_msf_seqs();
+ fseek(fin,0,0);
+ return;
+ break;
+ case USER:
+ default:
+ break;
+ }
+ }
+ fseek(fin,0,0);
+}
+
+
+static int count_clustal_seqs(void)
+/* count the number of sequences in a clustal alignment file */
+{
+ static char *line;
+ int nseqs;
+
+ if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(!blankline(line)) break; /* Look for next non- */
+ } /* blank line */
+ nseqs = 1;
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(blankline(line)) return nseqs;
+ nseqs++;
+ }
+
+ return 0; /* if you got to here-funny format/no seqs.*/
+}
+
+static int count_msf_seqs(void)
+{
+/* count the number of sequences in a PILEUP alignment file */
+
+ static char *line;
+ int nseqs;
+
+ if ( !line)line=(char*)vcalloc ( (MAXLINE+1), sizeof (char));
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(linetype(line,"/")) break;
+ }
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(!blankline(line)) break; /* Look for next non- */
+ } /* blank line */
+ nseqs = 1;
+
+ while (fgets(line,MAXLINE+1,fin) != NULL) {
+ if(blankline(line)) return nseqs;
+ nseqs++;
+ }
+
+ return 0; /* if you got to here-funny format/no seqs.*/
+}
+
+
+