/* Bonnie Berger, David Wilson and Theodore Tonchev 1992 */ /* Modified by Ethan Wolf 1995 */ /* C Code File */ #include #include #include #include "sc.h" #include "scio.h" #include "scsystem.h" #include "scconst.h" #include #include #include #include /* For finding location of a ~usr directory. */ /* Amino Acid IDs */ /* char aaname[] = "LIVMFYGAKRHEDQNSTCWPX"; */ char aaname[]= "LIVMFYGAKRHEDQNSTCWPXBZ."; /* Position IDs */ char posname[] = "abcdefg"; /* Array Sizes */ #define SIZEOFGTS 1 #define SIZEOFGFS AANUM #define SIZEOFGTP POSNUM #define SIZEOFGFP (AANUM*AANUM*POSNUM) #define SIZEOFPTS POSNUM #define SIZEOFPFS (AANUM*POSNUM) #define SIZEOFPTP (POSNUM*POSNUM) #define SIZEOFPFP (AANUM*AANUM*POSNUM*POSNUM) /* Magic Numbers */ #define GENFREQ_MAGIC "GenFreq 1 + 20 + 7 + 20x20x7 long\n" #define POSFREQ_MAGIC "PosFreq 7 + 20x7 + 7x7 + 20x20x7x7 long\n" #ifndef ERROR #define ERROR(STR) error(FOPEN_ERR, (STR)) #endif #define NULLERR(X) if (!(X)) error(FOPEN_ERR, name) static fd_set _popened_files; static fd_set *popened_files=NULL; void open_file (FILE** fl, char* name, char* ops) { int str_len; char cmd[256]; /* Initialize set of popened files */ if (!popened_files) { popened_files = &_popened_files; FD_ZERO(popened_files); } /* If we aren't reading, do the usual thing */ if (strcmp(ops,"r")) { NULLERR(*fl = fopen(name, ops)); return; } /* If we're reading a .Z file, use zcat */ str_len = strlen(name); if (str_len>256-8) error(FOPEN_ERR,"file name too long for internal buffer"); strcpy(cmd,"zcat "); strcat(cmd,name); if (name[str_len-2]=='.' && name[str_len-1]=='Z') { NULLERR(*fl = popen(cmd, "r")); FD_SET(fileno(*fl),popened_files); return; } /* Otherwise, if the file exists, read it normally */ if (!access(name,F_OK)) { NULLERR(*fl = fopen(name, ops)); return; } /* And if the file doesn't exist, try adding .Z */ strcat(cmd,".Z"); if (!access(&(cmd[5]),F_OK)) { NULLERR(*fl = popen(cmd, "r")); FD_SET(fileno(*fl),popened_files); return; } /* The file does not exist. */ error(FOPEN_ERR,name); } void close_file (FILE *fl) { if (FD_ISSET(fileno(fl),popened_files)) { FD_CLR(fileno(fl),popened_files); pclose(fl); } else fclose(fl); } void htonla(); void ntohla(); void writegenfreq(FILE* fl, long totals[], long freqs[AANUM], long totalp[POSNUM], long freqp[AANUM][AANUM][POSNUM], FILE* flog) { int i; fputs("Writing Gen Frequences.\n", flog); fputs(GENFREQ_MAGIC, fl); htonla(totals, SIZEOFGTS); if (fwrite(totals, sizeof(long), SIZEOFGTS, fl) != SIZEOFGTS) error(FWRITE_ERR, " file"); htonla(freqs, SIZEOFGFS); if (fwrite(freqs, sizeof(long), SIZEOFGFS, fl) != SIZEOFGFS) error(FWRITE_ERR, " file"); htonla(totalp, SIZEOFGTP); if (fwrite(totalp, sizeof(long), SIZEOFGTP, fl) != SIZEOFGTP) error(FWRITE_ERR, " file"); htonla(freqp, SIZEOFGFP); if (fwrite(freqp, sizeof(long), SIZEOFGFP, fl) != SIZEOFGFP) error(FWRITE_ERR, " file"); } void readgenfreq(FILE* fl, long totals[], long freqs[AANUM], long totalp[POSNUM], long freqp[AANUM*AANUM*POSNUM], FILE* flog) { char line[MAXLINE]; int i; fputs("Reading Gen Frequences.\n", flog); if (strlen(GENFREQ_MAGIC) > 0) { fgets(line, MAXLINE, fl); if (strcmp(line, GENFREQ_MAGIC)) error(MAGIC_ERR, " file"); } /* if (fread(totals, sizeof(long), SIZEOFGTS, fl) != SIZEOFGTS) error(FREAD_ERR, " file"); ntohla(totals, SIZEOFGTS); if (fread(freqs, sizeof(long), SIZEOFGFS, fl) != SIZEOFGFS) error(FREAD_ERR, " file"); ntohla(freqs, SIZEOFGFS); if (fread(totalp, sizeof(long), SIZEOFGTP, fl) != SIZEOFGTP) error(FREAD_ERR, " file"); ntohla(totalp, SIZEOFGTP); if (fread(freqp, sizeof(long), SIZEOFGFP, fl) != SIZEOFGFP) error(FREAD_ERR, " file"); ntohla(freqp, SIZEOFGFP); */ for (i = 0; i < SIZEOFGTS; i++) { fscanf(fl, "%ld", &totals[i]); } for (i = 0; i < SIZEOFGFS; i++) { fscanf(fl, "%ld", &freqs[i]); } for (i = 0; i < SIZEOFGTP; i++) { fscanf(fl, "%ld", &totalp[i]); } for (i = 0; i < SIZEOFGFP; i++) { fscanf(fl, "%ld", &freqp[i]); } /* printf("gtotals\n"); for (i = 0; i < SIZEOFGTS; i++) { printf("%ld\n", totals[i]); } printf("gfreqs\n"); for (i = 0; i < SIZEOFGFS; i++) { printf("%ld\n", freqs[i]); } printf("gtotalp\n"); for (i = 0; i < SIZEOFGTP; i++) { printf("%ld\n", totalp[i]); } printf("gfreqp\n"); for (i = 0; i < SIZEOFGFP; i++) { printf("%ld\n", freqp[i]); } */ } void writeposfreq(FILE* fl, long totals[POSNUM], long freqs[AANUM][POSNUM], long totalp[POSNUM][POSNUM], long freqp[AANUM][AANUM][POSNUM][POSNUM], FILE* flog) { int i,j; fputs("Writing Position Frequences.\n", flog); fputs(POSFREQ_MAGIC, fl); htonla(totals, SIZEOFPTS); if (fwrite(totals, sizeof(long), SIZEOFPTS, fl) != SIZEOFPTS) error(FWRITE_ERR, " file"); htonla(freqs, SIZEOFPFS); if (fwrite(freqs, sizeof(long), SIZEOFPFS, fl) != SIZEOFPFS) error(FWRITE_ERR, " file"); htonla(totalp, SIZEOFPTP); if (fwrite(totalp, sizeof(long), SIZEOFPTP, fl) != SIZEOFPTP) error(FWRITE_ERR, " file"); htonla(freqp, SIZEOFPFP); if (fwrite(freqp, sizeof(long), SIZEOFPFP, fl) != SIZEOFPFP) error(FWRITE_ERR, " file"); } void readposfreq(FILE* fl, long totals[POSNUM], long freqs[AANUM*POSNUM], long totalp[POSNUM*POSNUM], long freqp[AANUM*AANUM*POSNUM*POSNUM], FILE* flog) { char line[MAXLINE]; int i; fprintf(flog, "Reading Position Frequences.\n"); if (strlen(POSFREQ_MAGIC) > 0) { fgets(line, MAXLINE, fl); if (strcmp(line, POSFREQ_MAGIC)) error(MAGIC_ERR, " file"); } /* if (fread(totals, sizeof(long), SIZEOFPTS, fl) != SIZEOFPTS) error(FREAD_ERR, " file"); ntohla(totals, SIZEOFPTS); if (fread(freqs, sizeof(long), SIZEOFPFS, fl) != SIZEOFPFS) error(FREAD_ERR, " file"); ntohla(freqs, SIZEOFPFS); if (fread(totalp, sizeof(long), SIZEOFPTP, fl) != SIZEOFPTP) error(FREAD_ERR, " file"); ntohla(totalp, SIZEOFPTP); if (fread(freqp, sizeof(long), SIZEOFPFP, fl) != SIZEOFPFP) error(FREAD_ERR, " file"); ntohla(freqp, SIZEOFPFP); */ for (i = 0; i < SIZEOFPTS; i++) { fscanf(fl, "%ld", &totals[i]); } for (i = 0; i < SIZEOFPFS; i++) { fscanf(fl, "%ld", &freqs[i]); } for (i = 0; i < SIZEOFPTP; i++) { fscanf(fl, "%ld", &totalp[i]); } for (i = 0; i < SIZEOFPFP; i++) { fscanf(fl, "%ld", &freqp[i]); } /* for (i = 0; i < SIZEOFPTS; i++) { printf("%d\n", totals[i]); } for (i = 0; i < SIZEOFPFS; i++) { printf("%d\n", freqs[i]); } for (i = 0; i < SIZEOFPTP; i++) { printf("%d\n", totalp[i]); } for (i = 0; i < SIZEOFPFP; i++) { printf("%d\n", freqp[i]); } */ } void htonla(long arr[], int size) { int i; for (i = 0; i < size; i++) arr[i] = htonl(arr[i]); } void ntohla(long arr[], int size) { int i; for (i = 0; i < size; i++) arr[i] = ntohl(arr[i]); } int getseq(FILE* fl, char seq[MAXSEQLEN], int* offset, int* seqlen, char *title) { char line[MAXLINE]; char ch; int i; if (title) *title = 0; *seqlen = 0; *offset = POSNUM; while (1) { if (! (fgets (line, MAXLINE, fl)) ) return 0; if (!strncmp("OFFSET", line, 6) || line[0] == '>' || !strncmp(" ..", line+strlen(line)-5, 4)) { if (line[0] == '>') if (title) { fgets (title, MAXLINE, fl); /* skips label */ title[strlen(title)-1] = 0; } else fgets (line, MAXLINE, fl); /* skips label */ if (!strncmp("OFFSET", line, 6)) if (line[6] && line[7] >= '0' && line[7] <= '6') *offset = line[7] - '0'; else *offset = DEFOFFSET; while (1) { if (! (fgets (line, MAXLINE, fl)) ) return 1; for (i=0; line[i]; i++) if ((ch = aanum(line[i])) != -1) seq[(*seqlen)++] = ch; else if (line[i] == '*' || (line[i] == '/' && line[i+1] == '/')) return 1; } } } } int getseq2 (FILE* fl, Sequence *sequence) { char line[MAXLINE]; char *res = sequence->seq, *tmp; int i, j, cc; int posr=0; sequence->title[0] = 0; while (fgets(line, MAXLINE, fl)) { if (line[0] == '>') { /* Copy the line - ">" into the code */ strcpy(sequence->code,&(line[1])); /* We make tmp point to the first space, \t or \n in the line. We then */ /* change that space, \t or \n to a 0, and make tmp point to the next */ /* non space, \t or \n character. If there is none (i.e. the line had */ /* ended so there are no more characters), then tmp[0]=0 still. In this*/ /* case, the next line holds the title. Otherwise, the next series of */ /* characters in the line are the title. */ tmp = &(sequence->code[strcspn(sequence->code," \t\n")]); tmp[0]=0; tmp = &(tmp[strspn(&tmp[1]," \t\n") + 1]); if (tmp[0]) /* Read more characters after the code, so probably FASTA. */ strcpy(sequence->title,tmp); else fgets (sequence->title, MAXLINE, fl); sequence->title[strlen(sequence->title)-1] = 0; /* Remove the newline.*/ while (1) { cc=fgetc(fl); /** We want to check the first character of the */ /** next line without reading in the whole line. */ ungetc(cc,fl); /** Put the character back onto the next line. */ if ((cc != EOF) && (cc != '>')) { /** If we are not starting the next sequence: */ fgets(line, MAXLINE, fl); for (i=0; line[i] && ( (res-sequence->seq) < MAXSEQLEN); i++) switch (toupper(line[i])) { case 'L': *(res++) = Leucine; break; case 'I': *(res++) = Isoleucine; break; case 'V': *(res++) = Valine; break; case 'M': *(res++) = Methionine; break; case 'F': *(res++) = Phenylalanine; break; case 'Y': *(res++) = Tyrosine; break; case 'G': *(res++) = Glycine; break; case 'A': *(res++) = Alanine; break; case 'K': *(res++) = Lysine; break; case 'R': *(res++) = Arginine; break; case 'H': *(res++) = Histidine; break; case 'E': *(res++) = GlutamicAcid; break; case 'D': *(res++) = AsparticAcid; break; case 'Q': *(res++) = Glutamine; break; case 'N': *(res++) = Asparagine; break; case 'S': *(res++) = Serine; break; case 'T': *(res++) = Threonine; break; case 'C': *(res++) = Cysteine; break; case 'W': *(res++) = Tryptophan; break; case 'P': *(res++) = Proline; break; case 'X': *(res++) = AnyRes; break; case 'B': *(res++) = Asparagix; break; case 'Z': *(res++) = Glutamix; break; /* fprintf(stderr,"Warning, not dealing with B or Z.\n"); *(res++) = AnyRes; break; */ case '(': case ')': case '=': case '/': case '.': case ',': /* fprintf(stderr,"Warning, not dealing with punctuation.\n"); */ break; case '*': sequence->seqlen = (res-sequence->seq); /* Get the register info if any */ while (1) { switch (cc=getc(fl)) { case '>': /* Next sequence */ ungetc(cc,fl); if (posr==0) {sequence->reg = NULL; return 1;} if (posr==sequence->seqlen) return 1; fprintf(stderr, "Erroneous .pos file!\n"); exit(-1); break; case EOF: if (posr==0) {sequence->reg = NULL; return 1;} if (posr==sequence->seqlen) return 1; fprintf(stderr, "Erroneous .pos file!\n"); exit(-1); break; default: ungetc(cc,fl); } fgets(line, MAXLINE, fl); for (j=0; line[j]; j++) { switch (cc = line[j]) { case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': sequence->reg[posr++] = cc-'a'; break; case '.': sequence->reg[posr++] = '.'; /* not 'genbank', but unspecified */ break; case '*': if (posr==sequence->seqlen) return 1; if (posr ==0) {sequence->reg = NULL; return 1;} /* A 2nd * indicates not pos. */ fprintf(stderr, "Erroneous .pos file!\n"); exit(-1); case ' ': case '\n': case '\t': break; default: fprintf(stderr,"Unable to parse input.\n"); exit(-1); } } } return 1; /*** A '*' signals the end of the input seq for */ /*** non-FASTA format. */ case ' ': case '\n': case '\t': break; default: printf("Unable to parse input %c.\n", line[i]); exit(-1); } /* Ends the switch and the for loop */ if ( (res-sequence->seq)==MAXSEQLEN) { /* couldn't read whole seq. */ sequence->seqlen=MAXSEQLEN; fprintf(stderr,"\nSequence %s was to long to read all of.", sequence->code); } } else { sequence->seqlen = (res-sequence->seq); return 1; /* If we read a 2nd > or and EOF then we have finished */ /* reading the seq. */ } } printf("Unexpected end of input.\n"); exit(-1); } } return 0; } char aanum(char ch) { int i; ch = toupper(ch); for (i = 0; aaname[i]; i++) if (ch == aaname[i]) return i; return -1; } char numaa(char ch) { if (ch >= 0 && ch < strlen(aaname)) return aaname[ch]; else return 0; } char posnum(char ch) { int i; for (i = 0; posname[i]; i++) if (ch == posname[i]) return i; return -1; } char numpos(char ch) { if (ch >= 0 && ch < strlen(posname)) return posname[ch]; else return 0; } FILE *sopen (char* name, char* ops) { FILE *fl; int str_len; char cmd[256]; char temp_name[500]; char usr_name[50]; int i; /* Initialize set of popened files */ if (!popened_files) { popened_files = &_popened_files; FD_ZERO(popened_files); } /* Is name valid? */ if (!name) return NULL; /* Maybe we user wants stdin or stdout */ if (!strcmp(name,"-")) { if (!strcmp(ops,"r")) return stdin; if (!strcmp(ops,"w")) return stdout; ERROR(name); } /* If writing and filename starts with "|", use popen */ if (!strcmp(ops,"w") && name[0]=='|') { NULLERR(fl = popen(&(name[1]), ops)); FD_SET(fileno(fl),popened_files); return fl; } if (name[0]=='~') if (name[1]=='/') { /* Manually find the home dir. */ strcpy(temp_name,getenv("HOME")); /* Get home directory. */ strcat(temp_name, &name[1]); /* Put it in place of ~ */ name= temp_name; } else { /* Convert ~usr into /location/usr */ strncpy(usr_name, &name[1], strcspn(&name[1],"/")); /* Figures out length of usr name (between ~ and /) */ /* and copies the usrname into usr_name. Then we need */ /* to add end of string character. */ usr_name[strcspn(&name[1],"/")] = 0; /* End of string. */ strcpy(temp_name, getpwnam(usr_name)->pw_dir); /* copies name of usr login dir to temp_name */ strcat(temp_name, &name[strcspn(name,"/")]); /* Replaces ~ with */ /* /location */ name= temp_name; } /* If we aren't reading, do the usual thing */ if (strcmp(ops,"r")) { NULLERR(fl = fopen(name, ops)); return fl; } /* If we're reading a .Z file, use zcat */ str_len = strlen(name); if (str_len>256-16) ERROR("file name too long for internal buffer"); if (name[str_len-2]=='.' && name[str_len-1]=='Z') { strcpy(cmd,"zcat "); strcat(cmd,name); NULLERR(fl = popen(cmd, "r")); FD_SET(fileno(fl),popened_files); return fl; } /* Otherwise, if the file exists, read it normally */ if (!access(name,F_OK)) { NULLERR(fl = fopen(name, ops)); return fl; } /* And if the file doesn't exist, try adding .Z */ strcpy(cmd,"zcat "); strcat(cmd,name); strcat(cmd,".Z"); if (!access(&(cmd[5]),F_OK)) { NULLERR(fl = popen(cmd, "r")); FD_SET(fileno(fl),popened_files); return fl; } /* The file does not exist. */ ERROR(name); } void sclose (FILE *fl) { if (FD_ISSET(fileno(fl),popened_files)) { FD_CLR(fileno(fl),popened_files); pclose(fl); } else fclose(fl); } /* End of Code */