--- /dev/null
+/***************************************************************************
+
+fast2blc: A program to convert a FASTA format alignment file to an
+ AMPS blockfile.
+
+ Copyright: Geoffrey J. Barton (1992,1997)
+
+ email: geoff@ebi.ac.uk
+ Please see the README file for details of conditions for use of this program.
+
+ $Id: fast2blc.c,v 1.3 1999/07/09 10:35:29 geoff Exp $
+ $Log: fast2blc.c,v $
+ Revision 1.3 1999/07/09 10:35:29 geoff
+ Change version and copyright statement to reflect 1997 status
+
+ Revision 1.2 1998/09/17 16:55:01 geoff
+ Check consistency with archive
+
+
+****************************************************************************
+
+Notes: This program can be run as a pipe: type fast2blc -q < input > output
+Only error messages will be output to std_err
+
+Default mode is interactive and prompts for filenames.
+
+The storage for the sequences is allocated dynamically, so the MAX_SEQ_LEN
+defines in the header file "defaults.h" have no effect. If a system memory
+limit is reached, then a "malloc error" message will be written and the
+program will stop. Most computers should happily cope with large numbers of
+long sequences. If yours doesn't, some possible solutions are outlined in
+the user manual - alscript.doc.
+
+24 Oct 1994 - modified to replace dots and dashes from the input file with
+spaces when writing the output file (if the -n option is the second argument).
+
+19 April 1995 - fast2blc derived from clus2blc to read a FASTA style alignment
+file. This assumes the title line is after the id code rather than on the
+next line.
+
+****************************************************************************/
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "gjutil.h"
+#include "array.h"
+#include "defaults.h"
+
+#define TOKENS " \t\n"
+
+
+/*
+ * read_sequence: collect the residues of one sequence from fp.
+ *
+ * Consumes characters up to and including the terminating '*', or up to
+ * end of file if no '*' is present.  Letters, '-' and '.' are stored;
+ * every other character (white space, digits, ...) is skipped.
+ *
+ *   fp      - stream positioned just after a '>' header line
+ *   len_out - receives the number of characters stored
+ *
+ * Returns a buffer obtained from GJmalloc/GJrealloc which the caller must
+ * release with GJfree.  The buffer is NOT NUL-terminated: use *len_out.
+ */
+static char *read_sequence(FILE *fp, int *len_out)
+{
+    enum { FIRST_CHUNK = 256 };
+    char *seq = (char *) GJmalloc(sizeof(char));
+    size_t cap = 1;
+    int len = 0;
+    int c;  /* int, not char: EOF must stay distinct from every byte value */
+
+    while ((c = fgetc(fp)) != EOF && c != '*') {
+        if (isalpha(c) || c == '-' || c == '.') {
+            if ((size_t) len == cap) {
+                /* grow geometrically instead of once per character */
+                cap = (cap < FIRST_CHUNK) ? FIRST_CHUNK : cap * 2;
+                seq = (char *) GJrealloc(seq, sizeof(char) * cap);
+            }
+            seq[len++] = (char) c;
+        }
+    }
+
+    *len_out = len;
+    return seq;
+}
+
+/*
+ * fast2blc entry point: convert a FASTA-style alignment to an AMPS blockfile.
+ *
+ * Usage:
+ *   fast2blc             interactive; prompts for input and output filenames
+ *   fast2blc -q [-n]     filter; reads stdin, writes stdout, only errors are
+ *                        reported (on std_err).  -n, as the second argument,
+ *                        replaces '.' and '-' in the output with spaces.
+ *
+ * Input format expected by the reader: each entry starts with a line
+ * ">id title"; the id ends at the first space or '-'.  The residues follow
+ * and run up to a '*' (or end of file).  An entry without a title is a
+ * fatal error.
+ *
+ * Returns 0 on success; exits with status 1 on a usage, input or output
+ * error.  Allocation failures are left to the GJ* helpers (presumably they
+ * report and stop, as the file header describes - confirm in gjutil).
+ */
+int main(int argc, char *argv[])
+{
+    extern FILE *std_err, *std_in, *std_out;
+    struct seqdat *seqs;
+    FILE *fp = NULL;
+    FILE *fout = NULL;
+    char *line;
+    char *msffile;
+    char *blocfile;
+    char *token;
+    char namefmt[32];   /* holds "%<width>s"; ample for any int width */
+    int nseq = 0;
+    int quiet = 0;
+    int nodot = 0;
+    int allen = 0;      /* total alignment length (longest sequence) */
+    int seqlen;
+    int i, j;
+    char c;
+
+    std_err = stderr;
+    std_in = stdin;
+    std_out = stdout;
+
+    line = GJstrcreate(MAX_INLEN, " ");
+    msffile = GJstrcreate(MAX_INLEN, NULL);
+    blocfile = GJstrcreate(MAX_INLEN, NULL);
+
+    if (argc > 1) {
+        /* Anything other than -q used to leave fp/fout uninitialised. */
+        if (strcmp(argv[1], "-q") != 0) {
+            fprintf(std_err, "Error: unrecognised option %s\n", argv[1]);
+            fprintf(std_err, "Usage: fast2blc [-q [-n]]\n");
+            exit(1);
+        }
+        /* Quiet mode - read FASTA file from stdin, write block file to stdout */
+        quiet = 1;
+        fp = std_in;
+        fout = std_out;
+        if (argc > 2 && strcmp(argv[2], "-n") == 0) {
+            /* set flag to remove dots */
+            nodot = 1;
+        }
+    } else {
+        /* Verbose mode - prompt for all filenames */
+        fprintf(std_out,"\n\n");
+        fprintf(std_out,"FASTA format to AMPS Blockfile conversion\n");
+        fprintf(std_out,"Copyright: G. J. Barton (1992)\n");
+        fprintf(std_out,"Author: G. J. Barton (1992)\n\n");
+        fprintf(std_out,"Max number/length of alignment - Defined by System\n");
+        fprintf(std_out,"If you get a malloc error message - see manual\n\n");
+        fprintf(std_out,"Enter FASTA format alignment filename: ");
+
+        /* Bound the %s conversion so a long name cannot overrun the buffers. */
+        sprintf(namefmt, "%%%ds", (int) (MAX_INLEN - 1));
+
+        if (fscanf(std_in, namefmt, msffile) != 1) {
+            fprintf(std_err, "Error: no input filename given\n");
+            exit(1);
+        }
+        fprintf(std_out,"Opening: %s\n",msffile);
+        fp = GJfopen(msffile,"r",1);
+
+        fprintf(std_out,"Enter Block filename: ");
+        if (fscanf(std_in, namefmt, blocfile) != 1) {
+            fprintf(std_err, "Error: no output filename given\n");
+            exit(1);
+        }
+        fprintf(std_out,"Opening: %s\n",blocfile);
+        fout = GJfopen(blocfile,"w",1);
+
+        /* Defensive: GJfopen may already stop on failure - not visible here. */
+        if (fp == NULL || fout == NULL) {
+            fprintf(std_err, "Error: cannot open input or output file\n");
+            exit(1);
+        }
+    }
+
+    fprintf(fout,"\n");
+    fprintf(fout,"Conversion of FASTA file to AMPS BLOCKFILE format\n");
+    fprintf(fout,"fast2blc: Geoffrey J. Barton (1992)\n\n");
+
+    seqs = (struct seqdat *) GJmalloc(sizeof(struct seqdat));
+
+    if (!quiet) fprintf(std_out,"Reading .fas file\n");
+
+    while (fgets(line, MAX_INLEN, fp) != NULL) {
+        if (line[0] != '>') {
+            continue;           /* only header lines start an entry */
+        }
+        /* found an identifier: it ends at the first space, '-' or newline */
+        token = strtok(&line[1], " -\n");
+        if (token == NULL) {
+            continue;           /* bare '>' line - nothing to record */
+        }
+
+        seqs = (struct seqdat *) GJrealloc(seqs, sizeof(struct seqdat) * (nseq + 1));
+        seqs[nseq].id = GJstrdup(token);
+
+        /* the remainder of the header line is the title */
+        token = strtok(NULL, "\n");
+        if (token == NULL) {
+            fprintf(std_err,
+                    "Error: No title in sequence %s - is this FASTA format?\n",
+                    seqs[nseq].id);
+            exit(1);
+        }
+        seqs[nseq].title = GJstrdup(token);
+
+        /* read characters until * (or end of file) */
+        seqs[nseq].seq = read_sequence(fp, &seqlen);
+        seqs[nseq].slen = seqlen;
+        if (seqlen > allen) allen = seqlen;
+        ++nseq;
+    }
+
+    if (!quiet) fprintf(std_out,"All %d sequences read in\n",nseq);
+    if (!quiet) fprintf(std_out,"Writing .blc file\n");
+
+    for (i = 0; i < nseq; ++i) {
+        fprintf(fout,">%s %s\n",seqs[i].id,seqs[i].title);
+    }
+    fprintf(fout,"* iteration 1\n");
+
+    /* One output row per alignment column, one character per sequence. */
+    for (i = 0; i < allen; ++i) {
+        for (j = 0; j < nseq; ++j) {
+            if (seqs[j].slen <= i) {
+                c = ' ';        /* pad sequences shorter than the alignment */
+            } else {
+                c = seqs[j].seq[i];
+                /* edit out dots and dashes if required */
+                if (nodot == 1 && (c == '.' || c == '-')) {
+                    c = ' ';
+                }
+            }
+            fputc(c, fout);
+        }
+        fputc('\n', fout);
+    }
+    fprintf(fout,"*\n");
+
+    if (!quiet) {
+        /* We opened these files, so close them; a failed close of the
+           output means buffered data could not be written. */
+        fclose(fp);
+        if (fclose(fout) != 0) {
+            fprintf(std_err, "Error: could not finish writing %s\n", blocfile);
+            exit(1);
+        }
+        fprintf(std_out,"All done\n");
+    }
+
+    for (i = 0; i < nseq; ++i) {
+        GJfree(seqs[i].seq);
+        GJfree(seqs[i].id);
+        GJfree(seqs[i].title);
+    }
+    GJfree(seqs);
+    GJfree(line);
+    GJfree(blocfile);
+    GJfree(msffile);
+
+    return 0;
+}