--- /dev/null
+/* Bonnie Berger, David Wilson and Theodore Tonchev 1992 */
+/* Modified by Ethan Wolf 1995 */
+/* C Code File */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sc.h"
+#include "scio.h"
+#include "scsystem.h"
+#include "scconst.h"
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <unistd.h>
+
+#include <pwd.h> /* For finding location of a ~usr directory. */
+
+/* Amino Acid IDs */
+/* char aaname[] = "LIVMFYGAKRHEDQNSTCWPX"; */
+char aaname[]= "LIVMFYGAKRHEDQNSTCWPXBZ.";
+
+/* Position IDs */
+char posname[] = "abcdefg";
+
+/* Array Sizes */
+#define SIZEOFGTS 1
+#define SIZEOFGFS AANUM
+#define SIZEOFGTP POSNUM
+#define SIZEOFGFP (AANUM*AANUM*POSNUM)
+
+#define SIZEOFPTS POSNUM
+#define SIZEOFPFS (AANUM*POSNUM)
+#define SIZEOFPTP (POSNUM*POSNUM)
+#define SIZEOFPFP (AANUM*AANUM*POSNUM*POSNUM)
+
+/* Magic Numbers */
+#define GENFREQ_MAGIC "GenFreq 1 + 20 + 7 + 20x20x7 long\n"
+#define POSFREQ_MAGIC "PosFreq 7 + 20x7 + 7x7 + 20x20x7x7 long\n"
+
+#ifndef ERROR
+#define ERROR(STR) error(FOPEN_ERR, (STR))
+#endif
+#define NULLERR(X) if (!(X)) error(FOPEN_ERR, name)
+static fd_set _popened_files;
+static fd_set *popened_files=NULL;
+
+void open_file (FILE** fl, char* name, char* ops)
+{
+ int str_len;
+ char cmd[256];
+
+ /* Initialize set of popened files */
+ if (!popened_files) {
+ popened_files = &_popened_files;
+ FD_ZERO(popened_files);
+ }
+ /* If we aren't reading, do the usual thing */
+ if (strcmp(ops,"r")) {
+ NULLERR(*fl = fopen(name, ops));
+ return; }
+
+ /* If we're reading a .Z file, use zcat */
+ str_len = strlen(name);
+ if (str_len>256-8)
+ error(FOPEN_ERR,"file name too long for internal buffer");
+ strcpy(cmd,"zcat ");
+ strcat(cmd,name);
+ if (name[str_len-2]=='.' && name[str_len-1]=='Z') {
+ NULLERR(*fl = popen(cmd, "r"));
+ FD_SET(fileno(*fl),popened_files);
+ return; }
+
+ /* Otherwise, if the file exists, read it normally */
+ if (!access(name,F_OK)) {
+ NULLERR(*fl = fopen(name, ops));
+ return; }
+
+ /* And if the file doesn't exist, try adding .Z */
+ strcat(cmd,".Z");
+ if (!access(&(cmd[5]),F_OK)) {
+ NULLERR(*fl = popen(cmd, "r"));
+ FD_SET(fileno(*fl),popened_files);
+ return; }
+
+ /* The file does not exist. */
+ error(FOPEN_ERR,name);
+}
+void close_file (FILE *fl)
+{
+ if (FD_ISSET(fileno(fl),popened_files)) {
+ FD_CLR(fileno(fl),popened_files);
+ pclose(fl);
+ }
+ else
+ fclose(fl);
+}
+
+void htonla();
+void ntohla();
+
+
+
+void writegenfreq(FILE* fl,
+ long totals[], long freqs[AANUM],
+ long totalp[POSNUM], long freqp[AANUM][AANUM][POSNUM],
+ FILE* flog)
+{
+ int i;
+
+ fputs("Writing Gen Frequences.\n", flog);
+
+ fputs(GENFREQ_MAGIC, fl);
+
+ htonla(totals, SIZEOFGTS);
+ if (fwrite(totals, sizeof(long), SIZEOFGTS, fl) != SIZEOFGTS)
+ error(FWRITE_ERR, "<GenFreq> file");
+
+ htonla(freqs, SIZEOFGFS);
+ if (fwrite(freqs, sizeof(long), SIZEOFGFS, fl) != SIZEOFGFS)
+ error(FWRITE_ERR, "<GenFreq> file");
+
+ htonla(totalp, SIZEOFGTP);
+ if (fwrite(totalp, sizeof(long), SIZEOFGTP, fl) != SIZEOFGTP)
+ error(FWRITE_ERR, "<GenFreq> file");
+
+ htonla(freqp, SIZEOFGFP);
+ if (fwrite(freqp, sizeof(long), SIZEOFGFP, fl) != SIZEOFGFP)
+ error(FWRITE_ERR, "<GenFreq> file");
+}
+
+void readgenfreq(FILE* fl,
+ long totals[], long freqs[AANUM],
+ long totalp[POSNUM], long freqp[AANUM*AANUM*POSNUM],
+ FILE* flog)
+{
+ char line[MAXLINE];
+ int i;
+
+ fputs("Reading Gen Frequences.\n", flog);
+
+ if (strlen(GENFREQ_MAGIC) > 0) {
+ fgets(line, MAXLINE, fl);
+ if (strcmp(line, GENFREQ_MAGIC))
+ error(MAGIC_ERR, "<GenFreq> file");
+ }
+
+/*
+ if (fread(totals, sizeof(long), SIZEOFGTS, fl) != SIZEOFGTS)
+ error(FREAD_ERR, "<GenFreq> file");
+ ntohla(totals, SIZEOFGTS);
+
+ if (fread(freqs, sizeof(long), SIZEOFGFS, fl) != SIZEOFGFS)
+ error(FREAD_ERR, "<GenFreq> file");
+ ntohla(freqs, SIZEOFGFS);
+
+ if (fread(totalp, sizeof(long), SIZEOFGTP, fl) != SIZEOFGTP)
+ error(FREAD_ERR, "<GenFreq> file");
+ ntohla(totalp, SIZEOFGTP);
+
+ if (fread(freqp, sizeof(long), SIZEOFGFP, fl) != SIZEOFGFP)
+ error(FREAD_ERR, "<GenFreq> file");
+ ntohla(freqp, SIZEOFGFP);
+*/
+
+ for (i = 0; i < SIZEOFGTS; i++) {
+ fscanf(fl, "%ld", &totals[i]);
+ }
+ for (i = 0; i < SIZEOFGFS; i++) {
+ fscanf(fl, "%ld", &freqs[i]);
+ }
+ for (i = 0; i < SIZEOFGTP; i++) {
+ fscanf(fl, "%ld", &totalp[i]);
+ }
+ for (i = 0; i < SIZEOFGFP; i++) {
+ fscanf(fl, "%ld", &freqp[i]);
+ }
+
+/*
+ printf("gtotals\n");
+ for (i = 0; i < SIZEOFGTS; i++) {
+ printf("%ld\n", totals[i]);
+ }
+ printf("gfreqs\n");
+ for (i = 0; i < SIZEOFGFS; i++) {
+ printf("%ld\n", freqs[i]);
+ }
+ printf("gtotalp\n");
+ for (i = 0; i < SIZEOFGTP; i++) {
+ printf("%ld\n", totalp[i]);
+ }
+ printf("gfreqp\n");
+ for (i = 0; i < SIZEOFGFP; i++) {
+ printf("%ld\n", freqp[i]);
+ }
+*/
+}
+
+
+void writeposfreq(FILE* fl,
+ long totals[POSNUM], long freqs[AANUM][POSNUM],
+ long totalp[POSNUM][POSNUM],
+ long freqp[AANUM][AANUM][POSNUM][POSNUM],
+ FILE* flog)
+{
+ int i,j;
+
+ fputs("Writing Position Frequences.\n", flog);
+
+ fputs(POSFREQ_MAGIC, fl);
+
+ htonla(totals, SIZEOFPTS);
+ if (fwrite(totals, sizeof(long), SIZEOFPTS, fl) != SIZEOFPTS)
+ error(FWRITE_ERR, "<PosFreq> file");
+
+ htonla(freqs, SIZEOFPFS);
+ if (fwrite(freqs, sizeof(long), SIZEOFPFS, fl) != SIZEOFPFS)
+ error(FWRITE_ERR, "<PosFreq> file");
+
+ htonla(totalp, SIZEOFPTP);
+ if (fwrite(totalp, sizeof(long), SIZEOFPTP, fl) != SIZEOFPTP)
+ error(FWRITE_ERR, "<PosFreq> file");
+
+ htonla(freqp, SIZEOFPFP);
+ if (fwrite(freqp, sizeof(long), SIZEOFPFP, fl) != SIZEOFPFP)
+ error(FWRITE_ERR, "<PosFreq> file");
+}
+
+void readposfreq(FILE* fl,
+ long totals[POSNUM], long freqs[AANUM*POSNUM],
+ long totalp[POSNUM*POSNUM],
+ long freqp[AANUM*AANUM*POSNUM*POSNUM],
+ FILE* flog)
+{
+ char line[MAXLINE];
+ int i;
+
+ fprintf(flog, "Reading Position Frequences.\n");
+
+ if (strlen(POSFREQ_MAGIC) > 0) {
+ fgets(line, MAXLINE, fl);
+ if (strcmp(line, POSFREQ_MAGIC))
+ error(MAGIC_ERR, "<PosFreq> file");
+ }
+/*
+ if (fread(totals, sizeof(long), SIZEOFPTS, fl) != SIZEOFPTS)
+ error(FREAD_ERR, "<PosFreq> file");
+ ntohla(totals, SIZEOFPTS);
+
+ if (fread(freqs, sizeof(long), SIZEOFPFS, fl) != SIZEOFPFS)
+ error(FREAD_ERR, "<PosFreq> file");
+ ntohla(freqs, SIZEOFPFS);
+
+ if (fread(totalp, sizeof(long), SIZEOFPTP, fl) != SIZEOFPTP)
+ error(FREAD_ERR, "<PosFreq> file");
+ ntohla(totalp, SIZEOFPTP);
+
+ if (fread(freqp, sizeof(long), SIZEOFPFP, fl) != SIZEOFPFP)
+ error(FREAD_ERR, "<PosFreq> file");
+ ntohla(freqp, SIZEOFPFP);
+*/
+
+ for (i = 0; i < SIZEOFPTS; i++) {
+ fscanf(fl, "%ld", &totals[i]);
+ }
+ for (i = 0; i < SIZEOFPFS; i++) {
+ fscanf(fl, "%ld", &freqs[i]);
+ }
+ for (i = 0; i < SIZEOFPTP; i++) {
+ fscanf(fl, "%ld", &totalp[i]);
+ }
+ for (i = 0; i < SIZEOFPFP; i++) {
+ fscanf(fl, "%ld", &freqp[i]);
+ }
+
+
+/*
+ for (i = 0; i < SIZEOFPTS; i++) {
+ printf("%d\n", totals[i]);
+ }
+ for (i = 0; i < SIZEOFPFS; i++) {
+ printf("%d\n", freqs[i]);
+ }
+ for (i = 0; i < SIZEOFPTP; i++) {
+ printf("%d\n", totalp[i]);
+ }
+ for (i = 0; i < SIZEOFPFP; i++) {
+ printf("%d\n", freqp[i]);
+ }
+*/
+}
+
+void htonla(long arr[], int size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ arr[i] = htonl(arr[i]);
+}
+
+void ntohla(long arr[], int size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ arr[i] = ntohl(arr[i]);
+}
+
+
+int getseq(FILE* fl, char seq[MAXSEQLEN], int* offset, int* seqlen,
+ char *title)
+{
+ char line[MAXLINE];
+ char ch;
+ int i;
+
+ if (title) *title = 0;
+ *seqlen = 0;
+ *offset = POSNUM;
+ while (1) {
+ if (! (fgets (line, MAXLINE, fl)) )
+ return 0;
+ if (!strncmp("OFFSET", line, 6) || line[0] == '>' ||
+ !strncmp(" ..", line+strlen(line)-5, 4)) {
+ if (line[0] == '>')
+ if (title) {
+ fgets (title, MAXLINE, fl); /* skips label */
+ title[strlen(title)-1] = 0;
+ }
+ else
+ fgets (line, MAXLINE, fl); /* skips label */
+ if (!strncmp("OFFSET", line, 6))
+ if (line[6] && line[7] >= '0' && line[7] <= '6')
+ *offset = line[7] - '0';
+ else *offset = DEFOFFSET;
+ while (1) {
+ if (! (fgets (line, MAXLINE, fl)) )
+ return 1;
+ for (i=0; line[i]; i++)
+ if ((ch = aanum(line[i])) != -1)
+ seq[(*seqlen)++] = ch;
+ else if (line[i] == '*' ||
+ (line[i] == '/' && line[i+1] == '/'))
+ return 1;
+ }
+ }
+ }
+}
+
+
+int getseq2 (FILE* fl, Sequence *sequence)
+{
+ char line[MAXLINE];
+ char *res = sequence->seq, *tmp;
+ int i, j, cc;
+ int posr=0;
+
+ sequence->title[0] = 0;
+ while (fgets(line, MAXLINE, fl)) {
+ if (line[0] == '>') {
+ /* Copy the line - ">" into the code */
+ strcpy(sequence->code,&(line[1]));
+
+ /* We make tmp point to the first space, \t or \n in the line. We then */
+ /* change that space, \t or \n to a 0, and make tmp point to the next */
+ /* non space, \t or \n character. If there is none (i.e. the line had */
+ /* ended so there are no more characters), then tmp[0]=0 still. In this*/
+ /* case, the next line holds the title. Otherwise, the next series of */
+ /* characters in the line are the title. */
+
+ tmp = &(sequence->code[strcspn(sequence->code," \t\n")]);
+ tmp[0]=0;
+ tmp = &(tmp[strspn(&tmp[1]," \t\n") + 1]);
+
+ if (tmp[0]) /* Read more characters after the code, so probably FASTA. */
+ strcpy(sequence->title,tmp);
+ else
+ fgets (sequence->title, MAXLINE, fl);
+ sequence->title[strlen(sequence->title)-1] = 0; /* Remove the newline.*/
+
+
+ while (1) {
+ cc=fgetc(fl); /** We want to check the first character of the */
+ /** next line without reading in the whole line. */
+ ungetc(cc,fl); /** Put the character back onto the next line. */
+
+ if ((cc != EOF) && (cc != '>')) {
+ /** If we are not starting the next sequence: */
+ fgets(line, MAXLINE, fl);
+ for (i=0; line[i] && ( (res-sequence->seq) < MAXSEQLEN); i++)
+ switch (toupper(line[i])) {
+ case 'L': *(res++) = Leucine; break;
+ case 'I': *(res++) = Isoleucine; break;
+ case 'V': *(res++) = Valine; break;
+ case 'M': *(res++) = Methionine; break;
+ case 'F': *(res++) = Phenylalanine; break;
+ case 'Y': *(res++) = Tyrosine; break;
+ case 'G': *(res++) = Glycine; break;
+ case 'A': *(res++) = Alanine; break;
+ case 'K': *(res++) = Lysine; break;
+ case 'R': *(res++) = Arginine; break;
+ case 'H': *(res++) = Histidine; break;
+ case 'E': *(res++) = GlutamicAcid; break;
+ case 'D': *(res++) = AsparticAcid; break;
+ case 'Q': *(res++) = Glutamine; break;
+ case 'N': *(res++) = Asparagine; break;
+ case 'S': *(res++) = Serine; break;
+ case 'T': *(res++) = Threonine; break;
+ case 'C': *(res++) = Cysteine; break;
+ case 'W': *(res++) = Tryptophan; break;
+ case 'P': *(res++) = Proline; break;
+ case 'X': *(res++) = AnyRes; break;
+ case 'B': *(res++) = Asparagix; break;
+ case 'Z': *(res++) = Glutamix; break;
+/* fprintf(stderr,"Warning, not dealing with B or Z.\n");
+ *(res++) = AnyRes; break;
+*/
+ case '(':
+ case ')':
+ case '=':
+ case '/':
+ case '.':
+ case ',':
+ /* fprintf(stderr,"Warning, not dealing with punctuation.\n"); */
+ break;
+
+
+ case '*':
+ sequence->seqlen = (res-sequence->seq);
+
+ /* Get the register info if any */
+ while (1) {
+ switch (cc=getc(fl)) {
+ case '>': /* Next sequence */
+ ungetc(cc,fl);
+ if (posr==0) {sequence->reg = NULL; return 1;}
+ if (posr==sequence->seqlen) return 1;
+ fprintf(stderr, "Erroneous .pos file!\n"); exit(-1);
+ break;
+ case EOF:
+ if (posr==0) {sequence->reg = NULL; return 1;}
+ if (posr==sequence->seqlen) return 1;
+ fprintf(stderr, "Erroneous .pos file!\n"); exit(-1);
+ break;
+ default:
+ ungetc(cc,fl);
+ }
+
+ fgets(line, MAXLINE, fl);
+ for (j=0; line[j]; j++) {
+ switch (cc = line[j]) {
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'g':
+ sequence->reg[posr++] = cc-'a';
+ break;
+ case '.':
+ sequence->reg[posr++] = '.';
+ /* not 'genbank', but unspecified */
+ break;
+ case '*':
+ if (posr==sequence->seqlen) return 1;
+ if (posr ==0) {sequence->reg = NULL; return 1;}
+ /* A 2nd * indicates not pos. */
+ fprintf(stderr, "Erroneous .pos file!\n"); exit(-1);
+ case ' ':
+ case '\n':
+ case '\t':
+ break;
+ default:
+ fprintf(stderr,"Unable to parse input.\n");
+ exit(-1);
+ }
+ }
+ }
+ return 1; /*** A '*' signals the end of the input seq for */
+ /*** non-FASTA format. */
+ case ' ':
+ case '\n':
+ case '\t':
+ break;
+ default:
+ printf("Unable to parse input %c.\n", line[i]);
+ exit(-1);
+ } /* Ends the switch and the for loop */
+
+ if ( (res-sequence->seq)==MAXSEQLEN) { /* couldn't read whole seq. */
+ sequence->seqlen=MAXSEQLEN;
+ fprintf(stderr,"\nSequence %s was to long to read all of.",
+ sequence->code);
+ }
+ }
+ else {
+ sequence->seqlen = (res-sequence->seq);
+ return 1; /* If we read a 2nd > or and EOF then we have finished */
+ /* reading the seq. */
+ }
+ }
+ printf("Unexpected end of input.\n");
+ exit(-1);
+ }
+ }
+ return 0;
+}
+
+char aanum(char ch)
+{
+ int i;
+
+ ch = toupper(ch);
+ for (i = 0; aaname[i]; i++)
+ if (ch == aaname[i])
+ return i;
+ return -1;
+}
+
+
+
+char numaa(char ch)
+{
+ if (ch >= 0 && ch < strlen(aaname))
+ return aaname[ch];
+ else
+ return 0;
+}
+
+char posnum(char ch)
+{
+ int i;
+ for (i = 0; posname[i]; i++)
+ if (ch == posname[i])
+ return i;
+ return -1;
+}
+
+char numpos(char ch)
+{
+ if (ch >= 0 && ch < strlen(posname))
+ return posname[ch];
+ else
+ return 0;
+}
+
+FILE *sopen (char* name, char* ops)
+{
+ FILE *fl;
+ int str_len;
+ char cmd[256];
+ char temp_name[500];
+ char usr_name[50];
+ int i;
+
+ /* Initialize set of popened files */
+ if (!popened_files) {
+ popened_files = &_popened_files;
+ FD_ZERO(popened_files);
+ }
+ /* Is name valid? */
+ if (!name) return NULL;
+
+ /* Maybe we user wants stdin or stdout */
+ if (!strcmp(name,"-")) {
+ if (!strcmp(ops,"r"))
+ return stdin;
+ if (!strcmp(ops,"w"))
+ return stdout;
+ ERROR(name);
+ }
+ /* If writing and filename starts with "|", use popen */
+ if (!strcmp(ops,"w") && name[0]=='|') {
+ NULLERR(fl = popen(&(name[1]), ops));
+ FD_SET(fileno(fl),popened_files);
+ return fl; }
+
+ if (name[0]=='~')
+ if (name[1]=='/') { /* Manually find the home dir. */
+ strcpy(temp_name,getenv("HOME")); /* Get home directory. */
+ strcat(temp_name, &name[1]); /* Put it in place of ~ */
+ name= temp_name;
+ }
+ else { /* Convert ~usr into /location/usr */
+ strncpy(usr_name, &name[1], strcspn(&name[1],"/"));
+ /* Figures out length of usr name (between ~ and /) */
+ /* and copies the usrname into usr_name. Then we need */
+ /* to add end of string character. */
+ usr_name[strcspn(&name[1],"/")] = 0; /* End of string. */
+
+ strcpy(temp_name, getpwnam(usr_name)->pw_dir);
+ /* copies name of usr login dir to temp_name */
+
+ strcat(temp_name, &name[strcspn(name,"/")]); /* Replaces ~ with */
+ /* /location */
+ name= temp_name;
+ }
+
+
+
+ /* If we aren't reading, do the usual thing */
+ if (strcmp(ops,"r")) {
+ NULLERR(fl = fopen(name, ops));
+ return fl; }
+
+ /* If we're reading a .Z file, use zcat */
+ str_len = strlen(name);
+ if (str_len>256-16)
+ ERROR("file name too long for internal buffer");
+
+ if (name[str_len-2]=='.' && name[str_len-1]=='Z') {
+ strcpy(cmd,"zcat ");
+ strcat(cmd,name);
+ NULLERR(fl = popen(cmd, "r"));
+ FD_SET(fileno(fl),popened_files);
+ return fl; }
+
+ /* Otherwise, if the file exists, read it normally */
+ if (!access(name,F_OK)) {
+ NULLERR(fl = fopen(name, ops));
+ return fl; }
+
+ /* And if the file doesn't exist, try adding .Z */
+ strcpy(cmd,"zcat ");
+ strcat(cmd,name);
+ strcat(cmd,".Z");
+ if (!access(&(cmd[5]),F_OK)) {
+ NULLERR(fl = popen(cmd, "r"));
+ FD_SET(fileno(fl),popened_files);
+ return fl; }
+
+ /* The file does not exist. */
+ ERROR(name);
+}
+
+void sclose (FILE *fl)
+{
+ if (FD_ISSET(fileno(fl),popened_files)) {
+ FD_CLR(fileno(fl),popened_files);
+ pclose(fl);
+ }
+ else
+ fclose(fl);
+}
+
+/* End of Code */