--- /dev/null
+/***************************************************************************
+
+msf2blc: A program to convert a GCG .MSF file into an AMPS blockfile.
+
+ Copyright: Geoffrey J. Barton (1992,1997)
+
+ email: geoff@ebi.ac.uk
+ Please see the README file for details of conditions for use of this program.
+
+ $Id: msf2blc.c,v 1.3 1999/07/09 10:35:29 geoff Exp $
+ $Log: msf2blc.c,v $
+ Revision 1.3 1999/07/09 10:35:29 geoff
+ Change version and copyright statement to reflect 1997 status
+
+ Revision 1.2 1998/09/17 16:55:06 geoff
+ Check consistency with archive
+
+
+****************************************************************************
+
+Notes: This program can be run as a pipe: type msf2blc -q < input > output
+Only error messages will be output to std_err
+
+Default mode is interactive and prompts for filenames.
+
+The storage for the sequences is allocated dynamically, so the MAX_SEQ_LEN
+defines in the header file "defaults.h" have no effect. If a system memory
+limit is reached, then a "malloc error" message will be written and the
+program will stop. Most computers should happily cope with large numbers of
+long sequences. Some possible solutions to this problem are outlined in
+the user manual - alscript.doc
+
+24 October 1994: Add -n option (e.g. msf2blc -q -n) to replace any '.' found
+in the alignment with a space in the output blockfile.
+
+****************************************************************************/
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "gjutil.h"
+#include "array.h"
+#include "defaults.h"
+
+#define TOKENS " \t\n"
+
+
+/*
+ * main: convert a GCG .MSF multiple alignment into an AMPS blockfile.
+ *
+ * Command line arguments (any order):
+ *   -q   quiet mode: read the .MSF file from std_in and write the blockfile
+ *        to std_out; no progress messages are printed.
+ *   -n   write a space in place of every '.' in the alignment.
+ * Without -q the program prompts on std_out and reads the MSF and block
+ * filenames from std_in.  Unrecognised arguments are reported on std_err
+ * and ignored.
+ *
+ * Returns 0 on success; calls exit(1) on a malformed input line or when a
+ * filename cannot be read or a file cannot be opened/closed.
+ */
+int main(int argc,char *argv[])
+{
+    struct seqdat *seqs;
+    FILE *fp = NULL;
+    FILE *fout = NULL;
+    int nseq = 0;       /* number of "Name:" records seen so far          */
+    int found = 0;      /* set once the "//" separator has been read      */
+    int quiet = 0;      /* -q given                                       */
+    int nodot = 0;      /* -n given                                       */
+    int maxlen = 0;     /* longest sequence, i.e. number of output rows   */
+    int ragged = 0;     /* set if the sequences are not all the same length */
+    int i,j;
+    char c;
+    char gap;
+    char *token;
+    char *line;
+    extern FILE *std_err,*std_in,*std_out;
+    char *msffile;
+    char *blocfile;
+    char namefmt[32];   /* holds a bounded "%<width>s" scanf format       */
+
+    /* NOTE(review): GJinitfile() presumably sets up std_in/std_out/std_err -
+       confirm in gjutil.  It must run before any of them is used. */
+    GJinitfile();
+
+    line = GJstrcreate(MAX_INLEN," ");
+    msffile = GJstrcreate(MAX_INLEN,NULL);
+    blocfile = GJstrcreate(MAX_INLEN,NULL);
+
+    /* Scan every argument so that fp/fout can never be left unset */
+    for(i=1;i<argc;++i){
+        if(strcmp(argv[i],"-q")==0){
+            quiet = 1;
+        }else if(strcmp(argv[i],"-n")==0){
+            nodot = 1;
+        }else{
+            fprintf(std_err,"msf2blc: ignoring unrecognised argument: %s\n",argv[i]);
+        }
+    }
+
+    if(quiet){
+        /* Quiet mode - read .MSF file from stdin and output block file to stdout */
+        fp = std_in;
+        fout = std_out;
+    }else{
+        /* Verbose mode - prompt for all filenames */
+        fprintf(std_out,"\n\n");
+        fprintf(std_out,"GCG .MSF to AMPS Blockfile conversion\n");
+        fprintf(std_out,"Copyright: G. J. Barton (1992)\n");
+        fprintf(std_out,"Author: G. J. Barton (1992)\n\n");
+        fprintf(std_out,"Max number/length of alignment - Defined by System\n");
+        fprintf(std_out,"If you get a malloc error message - see manual\n\n");
+
+        /* Limit the filename reads to the buffer size.  Assumes that
+           GJstrcreate(MAX_INLEN,...) provides at least MAX_INLEN bytes, which
+           the fgets() call on "line" below relies on as well. */
+        sprintf(namefmt,"%%%ds",(int)(MAX_INLEN) - 1);
+
+        fprintf(std_out,"Enter MSF filename: ");
+        if(fscanf(std_in,namefmt,msffile) != 1){
+            GJerror("Cannot read MSF filename");
+            exit(1);
+        }
+        fprintf(std_out,"Opening: %s\n",msffile);
+        fp = GJfopen(msffile,"r",1);
+
+        fprintf(std_out,"Enter Block filename: ");
+        if(fscanf(std_in,namefmt,blocfile) != 1){
+            GJerror("Cannot read Block filename");
+            exit(1);
+        }
+        fprintf(std_out,"Opening: %s\n",blocfile);
+        fout = GJfopen(blocfile,"w",1);
+
+        /* NOTE(review): GJfopen may already stop the program on failure;
+           this check is a safeguard in case it returns NULL instead. */
+        if(fp == NULL || fout == NULL){
+            GJerror("Cannot open file");
+            exit(1);
+        }
+    }
+
+    fprintf(fout,"\n");
+    fprintf(fout,"Conversion of GCG .MSF file to AMPS BLOCKFILE format\n");
+    fprintf(fout,"msf2blc: Geoffrey J. Barton (1992)\n\n");
+
+    seqs = (struct seqdat *) GJmalloc(sizeof(struct seqdat));
+
+    if(!quiet)fprintf(std_out,"Reading .msf file\n");
+    while(fgets(line,MAX_INLEN,fp) != NULL){
+        if(line[0] == '\n'){
+            continue;
+        }
+        token = strtok(line,TOKENS);
+        if(token == NULL){
+            /* line held only white space */
+            continue;
+        }
+
+        if(strcmp(token,"Name:") == 0){
+            /* This is a seq id name */
+            token = strtok(NULL,TOKENS);
+            if(token == NULL){
+                GJerror("Cannot find sequence identifier after Name:");
+                exit(1);
+            }
+            seqs = (struct seqdat *) GJrealloc(seqs,sizeof(struct seqdat) * (nseq +1));
+            seqs[nseq].id = GJstrdup(token);
+            /* NOTE(review): strtok has already terminated "line" after its
+               first token, so the title stored here is just that leading
+               part of the line.  Kept as is to preserve the output. */
+            seqs[nseq].title = GJstrdup(line);
+            seqs[nseq].slen = 0;
+            seqs[nseq].seq = (char *) GJmalloc(sizeof(char));
+            ++nseq;
+            if(!quiet)fprintf(std_out,"%s\n",seqs[nseq-1].id);
+        }else if(found || strcmp(token,"//") == 0){
+            /* "//" signals the end of identifiers; from here on every line
+               is examined as a possible sequence line */
+            found = 1;
+
+            /* find out which seq this is */
+            for(i=0;i<nseq;++i){
+                if(strcmp(token,seqs[i].id) == 0){
+                    break;
+                }
+            }
+            if(i == nseq){
+                /* first token is not a known identifier - skip the line */
+                continue;
+            }
+
+            /* read in the sequence: everything after the identifier */
+            token = strtok(NULL,"\n");
+            if(token == NULL){
+                GJerror("Cannot find sequence in line");
+                fprintf(std_err,"%s",line);
+                exit(1);
+            }
+
+            /* Grow the buffer once per line; strlen(token) is an upper bound
+               on the number of residues this line can add. */
+            seqs[i].seq = (char *) GJrealloc(seqs[i].seq,
+                              sizeof(char) * (seqs[i].slen + strlen(token) + 1));
+            for(j=0;token[j] != '\0';++j){
+                /* the cast keeps isalpha() defined for chars outside 0..127 */
+                if(isalpha((unsigned char) token[j]) || token[j] == '.'){
+                    seqs[i].seq[seqs[i].slen] = token[j];
+                    ++seqs[i].slen;
+                }
+            }
+        }else{
+            /* this is a comment line - just echo.
+               NOTE(review): only the part of the line up to the end of its
+               first token is echoed, because strtok has cut "line" there. */
+            fprintf(fout,"%s\n",line);
+        }
+    }
+    if(!quiet)fprintf(std_out,"All %d sequences read in\n",nseq);
+    if(!quiet)fprintf(std_out,"Writing .blc file\n");
+
+    for(i=0;i<nseq;++i){
+        fprintf(fout,">%s %s\n",seqs[i].id,seqs[i].title);
+        if(seqs[i].slen > maxlen){
+            maxlen = seqs[i].slen;
+        }
+        if(seqs[i].slen != seqs[0].slen){
+            ragged = 1;
+        }
+    }
+    if(ragged){
+        fprintf(std_err,"msf2blc: warning: sequences differ in length - short ones are padded\n");
+    }
+
+    fprintf(fout,"* iteration 1\n");
+    /* One output row per alignment position, one character per sequence.
+       maxlen stays 0 when no sequences were read, so nothing is indexed. */
+    gap = nodot ? ' ' : '.';
+    for(i=0;i<maxlen;++i){
+        for(j=0;j<nseq;++j){
+            /* never index past the end of a short sequence */
+            c = (i < seqs[j].slen) ? seqs[j].seq[i] : gap;
+            /* edit out dots if required */
+            if(nodot && c == '.'){
+                c = ' ';
+            }
+            fputc(c,fout);
+        }
+        fputc('\n',fout);
+    }
+    fprintf(fout,"*\n");
+
+    /* Only close the files this function opened; in quiet mode fp and fout
+       are the std_in/std_out streams. */
+    if(quiet){
+        fflush(fout);
+    }else{
+        fclose(fp);
+        if(fclose(fout) != 0){
+            GJerror("Error closing block file");
+            exit(1);
+        }
+    }
+    if(!quiet)fprintf(std_out,"All done\n");
+
+    for(i=0;i<nseq;++i){
+        GJfree(seqs[i].seq);
+        GJfree(seqs[i].id);
+        GJfree(seqs[i].title);
+    }
+    GJfree(seqs);
+    GJfree(line);
+    GJfree(blocfile);
+    GJfree(msffile);
+
+    return 0;
+}