JPRED-2 Add alscript to the Git repository
[jpred.git] / sources / alscript / src / clus2blc.c
diff --git a/sources/alscript/src/clus2blc.c b/sources/alscript/src/clus2blc.c
new file mode 100644 (file)
index 0000000..6863aab
--- /dev/null
@@ -0,0 +1,194 @@
+/***************************************************************************
+
+clus2blc:  A program to convert a CLUSTAL PIR format alignment file to an 
+                                                        AMPS blockfile.
+
+   Copyright:  Geoffrey J. Barton (1992,1997)
+
+   email: geoff@ebi.ac.uk
+   Please see the README file for details of conditions 
+   for use of this program.
+
+ $Id: clus2blc.c,v 1.3 1999/07/09 10:35:28 geoff Exp $
+ $Log: clus2blc.c,v $
+ Revision 1.3  1999/07/09 10:35:28  geoff
+ Change version and copyright statement to reflect 1997 status
+
+ Revision 1.2  1998/09/17 16:55:00  geoff
+ Check consistency with archive
+
+
+
+****************************************************************************
+
+Notes:  This program can be run as a filter by typing:  clus2blc -q < input > output
+In this mode only error messages are written, and they go to std_err.
+
+Default mode is interactive and prompts for filenames.
+
+The storage for the sequences is allocated dynamically, so the MAX_SEQ_LEN
+defines in the header file "defaults.h" have no effect.  If a system memory
+limit is reached, then a "malloc error" message will be written and the
+program will stop.  Most computers should happily cope with large numbers of
+long sequences.  If yours doesn't, some  possible solutions are outlined in
+the user manual - alscript.doc.
+
+24 Oct 1994 - modified so that dots and dashes in the input are written as
+spaces in the output file (if -n is given as the second argument, after -q).
+
+****************************************************************************/
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "gjutil.h"
+#include "array.h"
+#include "defaults.h"
+
+#define TOKENS " \t\n"
+
+
+/*
+ * clus2blc entry point.
+ *
+ * Usage:
+ *   clus2blc            interactive: prompts for the input and output filenames
+ *   clus2blc -q         filter: reads the alignment from std_in, writes to std_out
+ *   clus2blc -q -n      as -q, but '.' and '-' are written as spaces
+ *
+ * Input is scanned for records of the form
+ *   >identifier
+ *   title line
+ *   residues ... *
+ * Only letters, '-' and '.' are kept from the residue section.  The output
+ * lists one ">id title" line per sequence, then one line per alignment
+ * column, padding sequences shorter than the longest one with spaces.
+ *
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE on a usage, input or write error.
+ */
+int main(int argc,char *argv[])
+{
+        struct seqdat *seqs;
+        FILE *fp = NULL;
+        FILE *fout = NULL;
+        int nseq;
+        int i,j;
+        char *token;
+        char *line;
+        extern FILE *std_err,*std_in,*std_out;
+        char *msffile;
+        char *blocfile;
+        char fmt[32];   /* bounded "%<width>s" format for the filename prompts */
+        int quiet;
+        int ch;         /* int, not char, so that EOF can be told apart from data */
+        int allen;      /* total alignment length */
+        int nodot;
+        int len;        /* residues stored so far for the current sequence */
+        size_t cap;     /* allocated size of the current sequence buffer */
+        char out;
+        int status;
+
+        std_err = stderr;
+        std_in = stdin;
+        std_out = stdout;
+
+        nseq = 0;
+        quiet = 0;
+        allen = 0;
+        nodot = 0;
+        status = EXIT_SUCCESS;
+
+        /* Any argument list must start with -q; anything else used to leave
+           the input and output streams uninitialised. */
+        if(argc > 1 && strcmp(argv[1],"-q") != 0){
+            fprintf(std_err,"Usage: %s [-q [-n]]\n",argv[0]);
+            return EXIT_FAILURE;
+        }
+
+        line = GJstrcreate(MAX_INLEN," ");
+        msffile = GJstrcreate(MAX_INLEN,NULL);
+        blocfile = GJstrcreate(MAX_INLEN,NULL);
+
+        if(argc > 1){
+            /* Quiet mode - read alignment from stdin and output block file to stdout */
+            quiet = 1;
+            fp = std_in;
+            fout = std_out;
+            if(argc > 2 && strcmp(argv[2],"-n")==0){
+                /* set flag to remove dots */
+                nodot = 1;
+            }
+        }else{
+            /* Verbose mode - prompt for all filenames */
+            fprintf(std_out,"\n\n");
+            fprintf(std_out,"CLUSTAL NBRF-PIR format to AMPS Blockfile conversion\n");
+            fprintf(std_out,"Copyright: G. J. Barton (1992)\n");
+            fprintf(std_out,"Author: G. J. Barton (1992)\n\n");
+            fprintf(std_out,"Max number/length of alignment - Defined by System\n");
+            fprintf(std_out,"If you get a malloc error message - see manual\n\n");
+            fprintf(std_out,"Enter CLUSTAL NBRF-PIR alignment filename: ");
+
+            /* Limit the read to the buffer size.  Assumes GJstrcreate(MAX_INLEN,..)
+               provides at least MAX_INLEN bytes, as the fgets calls below already do. */
+            sprintf(fmt,"%%%ds",(int) (MAX_INLEN) - 1);
+
+            if(fscanf(std_in,fmt,msffile) != 1){
+                fprintf(std_err,"clus2blc: could not read alignment filename\n");
+                return EXIT_FAILURE;
+            }
+            fprintf(std_out,"Opening: %s\n",msffile);
+            fp = GJfopen(msffile,"r",1);
+
+            fprintf(std_out,"Enter Block filename: ");
+            if(fscanf(std_in,fmt,blocfile) != 1){
+                fprintf(std_err,"clus2blc: could not read block filename\n");
+                return EXIT_FAILURE;
+            }
+            fprintf(std_out,"Opening: %s\n",blocfile);
+            fout = GJfopen(blocfile,"w",1);
+        }
+
+        fprintf(fout,"\n");
+        fprintf(fout,"Conversion of CLUSTAL NBRF-PIR file to AMPS BLOCKFILE format\n");
+        fprintf(fout,"clus2blc:  Geoffrey J. Barton (1992)\n\n");
+
+        seqs = (struct seqdat *) GJmalloc(sizeof(struct seqdat));
+
+        if(!quiet)fprintf(std_out,"Reading .pir file\n");
+        while(fgets(line,MAX_INLEN,fp) != NULL){
+            if(line[0] != '>') continue;
+
+            /* found an identifier */
+            token = strtok(&line[1]," \n");
+            if(token == NULL) continue;
+
+            seqs = (struct seqdat *) GJrealloc(seqs,sizeof(struct seqdat) * (nseq + 1));
+            seqs[nseq].id = GJstrdup(token);
+
+            /* read the title line; a record that ends here is incomplete, so
+               drop it rather than counting an entry with no title or sequence */
+            if(fgets(line,MAX_INLEN,fp) == NULL){
+                fprintf(std_err,"clus2blc: unexpected end of file after identifier %s\n",
+                        seqs[nseq].id);
+                GJfree(seqs[nseq].id);
+                break;
+            }
+            seqs[nseq].title = GJstrdup(line);
+
+            /* read characters until * (or end of file), growing the buffer
+               geometrically instead of once per residue */
+            cap = 256;
+            len = 0;
+            seqs[nseq].seq = (char *) GJmalloc(sizeof(char) * cap);
+            while((ch = fgetc(fp)) != EOF && ch != '*'){
+                if(isalpha(ch) || ch == '-' || ch == '.'){
+                    if((size_t) len == cap){
+                        cap *= 2;
+                        seqs[nseq].seq = (char *) GJrealloc(seqs[nseq].seq,sizeof(char) * cap);
+                    }
+                    seqs[nseq].seq[len] = (char) ch;
+                    ++len;
+                }
+            }
+            seqs[nseq].slen = len;
+            if(len > allen) allen = len;
+            ++nseq;
+        }
+
+        if(!quiet)fprintf(std_out,"All %d sequences read in\n",nseq);
+        if(!quiet)fprintf(std_out,"Writing .blc file\n");
+
+        for(i=0;i<nseq;++i){
+            /* the stored title still carries its line terminator */
+            fprintf(fout,">%s %s",seqs[i].id,seqs[i].title);
+        }
+        fprintf(fout,"* iteration 1\n");
+        for(i=0;i<allen;++i){
+            for(j=0;j<nseq;++j){
+                if(seqs[j].slen <= i){
+                    /* pad sequences shorter than the alignment */
+                    out = ' ';
+                }else{
+                    out = seqs[j].seq[i];
+                    /* edit out dots and dashes if required */
+                    if(nodot == 1 && (out == '.' || out == '-')){
+                        out = ' ';
+                    }
+                }
+                fprintf(fout,"%c",out);
+            }
+            fprintf(fout,"\n");
+        }
+        fprintf(fout,"*\n");
+
+        /* report output failures (e.g. a full disk) instead of ignoring them */
+        if(fflush(fout) != 0 || ferror(fout)){
+            fprintf(std_err,"clus2blc: error writing block file\n");
+            status = EXIT_FAILURE;
+        }
+        if(!quiet){
+            /* only the files opened here are closed; the standard streams are left alone */
+            fclose(fp);
+            if(fclose(fout) != 0 && status == EXIT_SUCCESS){
+                fprintf(std_err,"clus2blc: error closing block file\n");
+                status = EXIT_FAILURE;
+            }
+            fprintf(std_out,"All done\n");
+        }
+
+        for(i=0;i<nseq;++i){
+            GJfree(seqs[i].seq);
+            GJfree(seqs[i].id);
+            GJfree(seqs[i].title);
+        }
+        GJfree(seqs);
+        GJfree(line);
+        GJfree(blocfile);
+        GJfree(msffile);
+
+        return status;
+}