X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=sources%2Fpairwise%2Fpairwise.h;fp=sources%2Fpairwise%2Fpairwise.h;h=403c22337537fbedec4c7bd12712369fdbd9956c;hb=81362e35a140cd040e948b921053e74267f8a6e3;hp=0000000000000000000000000000000000000000;hpb=2cf032f4b987ba747c04159965aed78e3820d942;p=jpred.git diff --git a/sources/pairwise/pairwise.h b/sources/pairwise/pairwise.h new file mode 100644 index 0000000..403c223 --- /dev/null +++ b/sources/pairwise/pairwise.h @@ -0,0 +1,329 @@ +#include +#include +#include +#include +#include + +#include +#include + +#define ARRAY 50 +#define FILEBUF 1024 + +/* Macro's added by Patrick, mucho neato, aparently ifndef aren't POSIX + * anymore */ +#if !defined(MAX) +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#endif + +#if !defined(MIN) +#define MIN(A, B) ((A) < (B) ? (A) : (B)) +#endif + +/* Macro's added by Jon, allows checking of NULL pointers from memory + * allocations without function calls, but how can you return values? */ +#if !defined(MALLOC) +#define MALLOC(PTR, SIZE) do { PTR = malloc(SIZE); if (PTR == NULL) fatal_sys_error("malloc returned NULL"); } while (0) +#endif + +#if !defined(REALLOC) +#define REALLOC(PTR, SIZE) do { PTR = realloc(PTR, SIZE); if (PTR == NULL && SIZE != 0) fatal_sys_error("realloc returned NULL"); } while (0) +#endif + +/* Structure for sequences from FASTA files */ +struct fasta { + char *id; + char *seq; + int start; + int end; + int numres; +}; + +void fatal_error (char *message); +FILE * xfopen (const char *path, const char *mode); +void * xmalloc (size_t size); +void * xrealloc (void *ptr, size_t size); +int diff (char a, char b); +float pairwise (struct fasta *a, struct fasta *b); +void populate (struct fasta *a); +void check_length (struct fasta **array); +struct fasta ** read_fasta (FILE *fh); +char ** do_pairwise (struct fasta **array); +void free_fasta (struct fasta *); +int jsprintf (char *str, const char *const fmt, ...); + +void +free_fasta (struct fasta *seq) { + free(seq->id); + free(seq->seq); + free(seq); +} + +void +fatal_error (char *message) { + fprintf(stderr, "%s", message); + exit(EXIT_FAILURE); +} + +void +fatal_sys_error (char *message) { + perror(message); + exit(EXIT_FAILURE); +} + +FILE * +xfopen (const char *path, const char *mode) { + FILE *fh; + fh = fopen(path, mode); + if (fh == NULL) + fatal_sys_error("fopen returned NULL"); + return fh; +} + +void * +xmalloc (size_t size) { + void *ptr; + ptr = (void *) malloc(size); + if (ptr == NULL) + fatal_sys_error("malloc returned NULL"); + return ptr; +} + +void * +xrealloc (void *ptr, size_t size) { + ptr = (void *) realloc(ptr, size); + if (ptr == NULL && size != 0) + fatal_sys_error("realloc returned NULL"); + return ptr; +} + +/* If the same resdidue return 1 else return 0 */ +int +diff (char a, char b) { + if (a == b && a == '-') + return 0; + if (a == b) + return 1; + return 0; +} + +/* Does the pairwise comparison + * Compares the regions of the alignment that overlap, common gaps are + * ignored, and the number of common residues divided by the length of + * the shortest sequence is the identity. + * */ +float +pairwise (struct fasta *a, struct fasta *b) { + float result; + int start, end, numres, i, id = 0; + + /* If the sequences don't overlap then the seq ID is 0 */ + if (a->end < b->start || a->start > b->end) + return 0; + + start = MIN( a->start, b->start ); + end = MAX( a->end, b->end ); + numres = MAX( a->numres, b->numres ); + + for (i = start; i < end; i++) + id += diff(a->seq[i], b->seq[i]); + + result = 100 * (float) id / (float) numres; + + return result; +} + +/* "Populate" the rest of the fasta structure with information + * Gets the start position of the sequence (first non-gap character) + * Gets the end position (the last non-gap character) + * Gets the number of residues in the sequence (that aren't gaps) + * Gaps are represented by '-' + * */ +void +populate (struct fasta *a) { + int i; + int len = strlen(a->seq); + + a->numres = 0; + + for (i = 0; i <= len; i++) { + if (a->seq[i] != '-') { + a->start = i; + break; + } + } + for (i = len; i > 0; i--) { + if (a->seq[i] != '-') { + a->end = i; + break; + } + } + for (i = a->start; i < a->end; i++) { + if (a->seq[i] != '-') { + a->numres++; + } + } +} + +/* Makes sure that all of the sequences are the same length */ +void +check_length (struct fasta **array) { + int i, length; + + if (array[0] != NULL) + length = strlen(array[0]->seq); + else { + fatal_error("check_length() not passed an array of fasta structs\n"); + } + + for (i = 0; array[i] != NULL; i++) { + if (length != (int) strlen(array[i]->seq)) { + fatal_error("Not all of the sequences are the same length\n"); + } + } +} + +/* Reads in a FASTA file and returns an array of fasta structures */ +struct fasta ** +read_fasta (FILE *fh) { + int i, j, k, c, filesize = 0; + char *file; + struct fasta **array; + struct stat file_stat; + + array = (struct fasta **) malloc(ARRAY * sizeof(struct fasta *)); + + /* Find the size of the file and allocate it */ + fstat(fileno(fh), &file_stat); + if (! S_ISREG(file_stat.st_mode)) { + fatal_error("Input file is not a regular file"); + } + file = (char *) malloc(sizeof(char) * (file_stat.st_size + 1)); + + /* Read in the file and null terminate it */ + while ((c = getc(fh)) != EOF) { + file[filesize++] = c; + } + file[filesize] = '\0'; + + /* Parse the FASTA file into an array of structures */ + for (i = 0, j = 0, k = 0; i < filesize; i++) { + + /* Begining of FASTA record */ + if (file[i] == '>') { + if (j % ARRAY == 0) + array = (struct fasta **) xrealloc(array, sizeof(struct fasta *) * (j + ARRAY)); + + /* Initalise the structure */ + array[j] = (struct fasta *) xmalloc(sizeof(struct fasta)); + array[j]->id = (char *) xmalloc(sizeof(char)); + array[j]->seq = (char *) xmalloc(sizeof(char)); + array[j]->numres = 0; + + /* Read in the ID, terminated by a newline or NULL (end of string) */ + i++; + while (file[i] != '\0' && file[i] != '\n') { + if (k % FILEBUF == 0) + array[j]->id = (char *) xrealloc(array[j]->id, sizeof(char) * (FILEBUF + k)); + array[j]->id[k++] = file[i++]; + } + array[j]->id = (char *) xrealloc(array[j]->id, sizeof(char) * k + 1); + array[j]->id[k] = '\0'; + k = 0; + + /* read in the sequence, terminated by a '>' */ + for (k = 0; file[i] != '\0' && file[i] != '>'; i++) { + if (file[i] != '\n') { + if (k % FILEBUF == 0) + array[j]->seq = (char *) xrealloc(array[j]->seq, sizeof(char) * (FILEBUF + k)); + array[j]->seq[k++] = file[i]; + } + } + array[j]->seq = (char *) xrealloc(array[j]->seq, sizeof(char) * k + 1); + array[j]->seq[k] = '\0'; + + k = 0; + i--; + j++; + } + } + + free(file); + array = (struct fasta **) xrealloc(array, sizeof(struct fasta *) * (j + 1)); + array[j] = NULL; + + /* find the start and end points for the alignments */ + for (i = 0; array[i] != NULL; i++) + populate(array[i]); + return array; +} + +char ** +do_pairwise (struct fasta **array) { + int j = 0, i = 0, offset = 0, size; + char **output; + + /* Find the size of the output and preallocate the space in the output + * array */ + { + while (array[i] != NULL) + i++; + + /* Number of IDs, each ID, then the upper corner distances, then for + * NULL terminator */ + size = 1 + i + (i * i - i) / 2 + 1; + + output = (char **) xmalloc(sizeof(char *) * size); + + for (i = 0; i < size - 1; i++) { + output[i] = (char *) xmalloc(sizeof(char) * FILEBUF); + memset(output[i], 0x00, sizeof(char) * FILEBUF); + } + output[size - 1] = NULL; + } + + /* The number of IDs */ + i = 0; + while (array[i] != NULL) + i++; + jsprintf(output[offset++], "%i", i); + + /* If this statement is above the other one then it core dumps!? */ + + /* Each ID */ + for (i = 0; array[i] != NULL; i++) + jsprintf(output[offset++], "%s", array[i]->id); + + /* Upper corner pairwise comparisons */ + for (i = 0; array[i] != NULL; i++) { + for (j = i + 1; array[j] != NULL; j++) { + jsprintf(output[offset++], "%f", pairwise(array[i], array[j])); + } + } + + return output; +} + +/* Grows buffer dynamically to make input fit inside it. + * Requires malloc'd pointer to string. + * Returns length of string. + * */ +int +jsprintf (char *str, const char *const fmt, ...) { + int i, size; + va_list ap; + va_start(ap, fmt); + + str = (char *) realloc(str, sizeof(char) * FILEBUF); + + for (i = 1; (size = vsnprintf(str, sizeof(char) * FILEBUF * i, fmt, ap)) == -1; i++) { + if (i == 100) + fatal_error("Reached 100 iterations of vsnprinft, it's probably blown up. Check input"); + str = (char *) xrealloc(str, sizeof(char) * FILEBUF * i); + } + + str = (char *) xrealloc(str, sizeof(char) * size + 1); + + va_end(ap); + return size; +}