JPRED-2 Add alscript to the Git repository
[jpred.git] / sources / alscript / src / msf2blc.c
diff --git a/sources/alscript/src/msf2blc.c b/sources/alscript/src/msf2blc.c
new file mode 100644 (file)
index 0000000..f3aec8d
--- /dev/null
@@ -0,0 +1,204 @@
+/***************************************************************************
+
+msf2blc:  A program to convert a GCG .MSF file into an AMPS blockfile.
+
+   Copyright:  Geoffrey J. Barton (1992,1997)
+
+   email: geoff@ebi.ac.uk
+   Please see the README file for details of conditions for use of this program.
+
+   $Id: msf2blc.c,v 1.3 1999/07/09 10:35:29 geoff Exp $
+   $Log: msf2blc.c,v $
+   Revision 1.3  1999/07/09 10:35:29  geoff
+   Change version and copyright statement to reflect 1997 status
+
+   Revision 1.2  1998/09/17 16:55:06  geoff
+   Check consistency with archive
+
+
+****************************************************************************
+
+Notes:  This program can be run as a pipe:  type msf2blc -q < input > output
+Only error messages will be output to std_err
+
+Default mode is interactive and prompts for filenames.
+
+The storage for the sequences is allocated dynamically, so the MAX_SEQ_LEN
+defines in the header file "defaults.h" have no effect.  If a system memory
+limit is reached, then a "malloc error" message will be written and the
+program will stop.  Most computers should happily cope with large numbers of
+long sequences.  Some possible solutions to this problem are outlined in
+the user manual - alscript.doc
+
+24 October 1994: Add -n option to delete . from alignment if found.
+
+****************************************************************************/
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "gjutil.h"
+#include "array.h"
+#include "defaults.h"
+
+#define TOKENS " \t\n"
+
+
+/*
+ * read_word: read one whitespace-delimited word from fp into buf.
+ *
+ * Behaves like fscanf(fp,"%s",buf) but stores at most size-1 characters
+ * plus the terminating '\0'; the excess of an over-long word is read and
+ * discarded.  size must be at least 1.
+ * Returns 1 if a word was read, 0 if end of file was reached first.
+ */
+static int read_word(FILE *fp,char *buf,size_t size)
+{
+    size_t n = 0;
+    int c;
+
+    do{
+        c = getc(fp);
+    }while(c != EOF && isspace(c));
+    if(c == EOF) return 0;
+    while(c != EOF && !isspace(c)){
+        if(n + 1 < size) buf[n++] = (char) c;
+        c = getc(fp);
+    }
+    if(c != EOF) ungetc(c,fp);   /* leave the delimiter unread, as %s does */
+    buf[n] = '\0';
+    return 1;
+}
+
+/*
+ * msf2blc entry point: convert a GCG .MSF alignment to an AMPS blockfile.
+ *
+ * Options (accepted in any order):
+ *   -q  quiet mode: read the .MSF file from std_in, write the block file
+ *       to std_out and print no progress messages.
+ *   -n  write ' ' in place of '.' in the alignment.
+ * Without -q the input and output filenames are prompted for.
+ * Unrecognised arguments are reported on std_err and otherwise ignored.
+ *
+ * Returns 0 on success; calls exit(1) if the input cannot be parsed.
+ */
+int main(int argc,char *argv[])
+{
+    extern FILE *std_err,*std_in,*std_out;
+    struct seqdat *seqs;
+    FILE *fp,*fout;
+    char *line,*msffile,*blocfile,*token;
+    int nseq = 0;    /* number of Name: lines seen so far */
+    int found = 0;   /* set once the "//" line has been seen */
+    int quiet = 0;   /* -q given */
+    int nodot = 0;   /* -n given */
+    int alen = 0;    /* length of the longest sequence read */
+    int i,j;
+    size_t k;
+
+    GJinitfile();
+
+    line = GJstrcreate(MAX_INLEN," ");
+    msffile = GJstrcreate(MAX_INLEN,NULL);
+    blocfile = GJstrcreate(MAX_INLEN,NULL);
+
+    /* Scan every argument so that fp/fout are always set below,
+       whatever the order or combination of options */
+    for(i=1;i<argc;++i){
+        if(strcmp(argv[i],"-q")==0){
+            quiet = 1;
+        }else if(strcmp(argv[i],"-n")==0){
+            nodot = 1;
+        }else{
+            fprintf(std_err,"msf2blc: ignoring unrecognised argument %s\n",argv[i]);
+        }
+    }
+
+    if(quiet){
+        /* Quiet mode - read .MSF file from stdin and output block file to stdout */
+        fp = std_in;
+        fout = std_out;
+    }else{
+        /* Verbose mode - prompt for all filenames */
+        fprintf(std_out,"\n\n");
+        fprintf(std_out,"GCG .MSF to AMPS Blockfile conversion\n");
+        fprintf(std_out,"Copyright: G. J. Barton (1992)\n");
+        fprintf(std_out,"Author: G. J. Barton (1992)\n\n");
+        fprintf(std_out,"Max number/length of alignment - Defined by System\n");
+        fprintf(std_out,"If you get a malloc error message - see manual\n\n");
+        fprintf(std_out,"Enter MSF filename: ");
+
+        /* bounded read: these buffers are treated as MAX_INLEN bytes long,
+           in the same way as line is by the fgets call below */
+        if(!read_word(std_in,msffile,MAX_INLEN)){
+            GJerror("Cannot read MSF filename");
+            exit(1);
+        }
+        fprintf(std_out,"Opening: %s\n",msffile);
+        fp = GJfopen(msffile,"r",1);
+
+        fprintf(std_out,"Enter Block filename: ");
+        if(!read_word(std_in,blocfile,MAX_INLEN)){
+            GJerror("Cannot read Block filename");
+            exit(1);
+        }
+        fprintf(std_out,"Opening: %s\n",blocfile);
+        fout = GJfopen(blocfile,"w",1);
+    }
+
+    fprintf(fout,"\n");
+    fprintf(fout,"Conversion of GCG .MSF file to AMPS BLOCKFILE format\n");
+    fprintf(fout,"msf2blc:  Geoffrey J. Barton (1992)\n\n");
+
+    seqs = (struct seqdat *) GJmalloc(sizeof(struct seqdat));
+
+    if(!quiet)fprintf(std_out,"Reading .msf file\n");
+    while(fgets(line,MAX_INLEN,fp) != NULL){
+        token = strtok(line,TOKENS);
+        if(token == NULL) continue;   /* blank line */
+        if(strcmp(token,"Name:") == 0){
+            /* This is a seq id name */
+            token = strtok(NULL,TOKENS);
+            if(token == NULL){
+                GJerror("Cannot find sequence identifier after Name:");
+                exit(1);
+            }
+            seqs = (struct seqdat *) GJrealloc(seqs,sizeof(struct seqdat) * (nseq +1));
+            seqs[nseq].id = GJstrdup(token);
+            /* NOTE: strtok has ended line right after "Name:", so that is
+               all the title holds; kept so that the output is unchanged */
+            seqs[nseq].title = GJstrdup(line);
+            seqs[nseq].slen = 0;
+            seqs[nseq].seq = (char *) GJmalloc(sizeof(char));
+            if(!quiet)fprintf(std_out,"%s\n",seqs[nseq].id);
+            ++nseq;
+        }else if((strcmp(token,"//") == 0) || found){
+            /* "//" signals the end of identifiers so process sequences */
+            found = 1;
+            /* find out which seq this is */
+            for(i=0;i<nseq;++i){
+                if(strcmp(token,seqs[i].id) == 0) break;
+            }
+            if(i == nseq) continue;   /* first token is not a known id */
+            /* read in the sequence */
+            token = strtok(NULL,"\n");
+            if(token == NULL){
+                GJerror("Cannot find sequence in line");
+                fprintf(std_err,"%s",line);
+                exit(1);
+            }
+            /* grow once per line: at most strlen(token) residues are appended */
+            seqs[i].seq = (char *) GJrealloc(seqs[i].seq,sizeof(char) * (seqs[i].slen + strlen(token) + 1));
+            for(k=0;token[k] != '\0';++k){
+                /* cast: passing a negative char to isalpha is undefined */
+                if(isalpha((unsigned char) token[k]) || token[k] == '.'){
+                    seqs[i].seq[seqs[i].slen++] = token[k];
+                }
+            }
+        }else{
+            /* text before "//" - echo it; strtok has already ended line
+               after its first token, so only that much is written */
+            fprintf(fout,"%s\n",line);
+        }
+    }
+    if(!quiet)fprintf(std_out,"All %d sequences read in\n",nseq);
+    if(!quiet)fprintf(std_out,"Writing .blc file\n");
+
+    for(i=0;i<nseq;++i){
+        fprintf(fout,">%s %s\n",seqs[i].id,seqs[i].title);
+        if(seqs[i].slen > alen) alen = seqs[i].slen;
+    }
+    for(i=0;i<nseq;++i){
+        if(seqs[i].slen < alen){
+            fprintf(std_err,"msf2blc: warning - %s is shorter than the alignment, padding with spaces\n",seqs[i].id);
+        }
+    }
+    fprintf(fout,"* iteration 1\n");
+    /* one output line per alignment column; alen stays 0 when no
+       sequences were read, so seqs[0] is never touched in that case */
+    for(i=0;i<alen;++i){
+        for(j=0;j<nseq;++j){
+            /* never index past the end of a short sequence */
+            char c = (i < seqs[j].slen) ? seqs[j].seq[i] : ' ';
+            /* edit out dots if required */
+            if(nodot && c == '.') c = ' ';
+            fprintf(fout,"%c",c);
+        }
+        fprintf(fout,"\n");
+    }
+    fprintf(fout,"*\n");
+    if(!quiet)fprintf(std_out,"All done\n");
+
+    for(i=0;i<nseq;++i){
+        GJfree(seqs[i].seq);
+        GJfree(seqs[i].id);
+        GJfree(seqs[i].title);
+    }
+    GJfree(seqs);
+    GJfree(line);
+    GJfree(blocfile);
+    GJfree(msffile);
+
+    if(!quiet){
+        /* only the files opened here are closed; std_in/std_out are left alone */
+        fclose(fp);
+        fclose(fout);
+    }
+    return 0;
+}