#include #include #include #include #include #include #include #define ARRAY 50 #define FILEBUF 1024 /* Macro's added by Patrick, mucho neato, aparently ifndef aren't POSIX * anymore */ #if !defined(MAX) #define MAX(A, B) ((A) > (B) ? (A) : (B)) #endif #if !defined(MIN) #define MIN(A, B) ((A) < (B) ? (A) : (B)) #endif /* Macro's added by Jon, allows checking of NULL pointers from memory * allocations without function calls, but how can you return values? */ #if !defined(MALLOC) #define MALLOC(PTR, SIZE) do { PTR = malloc(SIZE); if (PTR == NULL) fatal_sys_error("malloc returned NULL"); } while (0) #endif #if !defined(REALLOC) #define REALLOC(PTR, SIZE) do { PTR = realloc(PTR, SIZE); if (PTR == NULL && SIZE != 0) fatal_sys_error("realloc returned NULL"); } while (0) #endif /* Structure for sequences from FASTA files */ struct fasta { char *id; char *seq; int start; int end; int numres; }; void fatal_error (char *message); FILE * xfopen (const char *path, const char *mode); void * xmalloc (size_t size); void * xrealloc (void *ptr, size_t size); int diff (char a, char b); float pairwise (struct fasta *a, struct fasta *b); void populate (struct fasta *a); void check_length (struct fasta **array); struct fasta ** read_fasta (FILE *fh); char ** do_pairwise (struct fasta **array); void free_fasta (struct fasta *); int jsprintf (char *str, const char *const fmt, ...); void free_fasta (struct fasta *seq) { free(seq->id); free(seq->seq); free(seq); } void fatal_error (char *message) { fprintf(stderr, "%s", message); exit(EXIT_FAILURE); } void fatal_sys_error (char *message) { perror(message); exit(EXIT_FAILURE); } FILE * xfopen (const char *path, const char *mode) { FILE *fh; fh = fopen(path, mode); if (fh == NULL) fatal_sys_error("fopen returned NULL"); return fh; } void * xmalloc (size_t size) { void *ptr; ptr = (void *) malloc(size); if (ptr == NULL) fatal_sys_error("malloc returned NULL"); return ptr; } void * xrealloc (void *ptr, size_t size) { ptr = (void *) realloc(ptr, size); if (ptr == NULL && size != 0) fatal_sys_error("realloc returned NULL"); return ptr; } /* If the same resdidue return 1 else return 0 */ int diff (char a, char b) { if (a == b && a == '-') return 0; if (a == b) return 1; return 0; } /* Does the pairwise comparison * Compares the regions of the alignment that overlap, common gaps are * ignored, and the number of common residues divided by the length of * the shortest sequence is the identity. * */ float pairwise (struct fasta *a, struct fasta *b) { float result; int start, end, numres, i, id = 0; /* If the sequences don't overlap then the seq ID is 0 */ if (a->end < b->start || a->start > b->end) return 0; start = MIN( a->start, b->start ); end = MAX( a->end, b->end ); numres = MAX( a->numres, b->numres ); for (i = start; i < end; i++) id += diff(a->seq[i], b->seq[i]); result = 100 * (float) id / (float) numres; return result; } /* "Populate" the rest of the fasta structure with information * Gets the start position of the sequence (first non-gap character) * Gets the end position (the last non-gap character) * Gets the number of residues in the sequence (that aren't gaps) * Gaps are represented by '-' * */ void populate (struct fasta *a) { int i; int len = strlen(a->seq); a->numres = 0; for (i = 0; i <= len; i++) { if (a->seq[i] != '-') { a->start = i; break; } } for (i = len; i > 0; i--) { if (a->seq[i] != '-') { a->end = i; break; } } for (i = a->start; i < a->end; i++) { if (a->seq[i] != '-') { a->numres++; } } } /* Makes sure that all of the sequences are the same length */ void check_length (struct fasta **array) { int i, length; if (array[0] != NULL) length = strlen(array[0]->seq); else { fatal_error("check_length() not passed an array of fasta structs\n"); } for (i = 0; array[i] != NULL; i++) { if (length != (int) strlen(array[i]->seq)) { fatal_error("Not all of the sequences are the same length\n"); } } } /* Reads in a FASTA file and returns an array of fasta structures */ struct fasta ** read_fasta (FILE *fh) { int i, j, k, c, filesize = 0; char *file; struct fasta **array; struct stat file_stat; array = (struct fasta **) malloc(ARRAY * sizeof(struct fasta *)); /* Find the size of the file and allocate it */ fstat(fileno(fh), &file_stat); if (! S_ISREG(file_stat.st_mode)) { fatal_error("Input file is not a regular file"); } file = (char *) malloc(sizeof(char) * (file_stat.st_size + 1)); /* Read in the file and null terminate it */ while ((c = getc(fh)) != EOF) { file[filesize++] = c; } file[filesize] = '\0'; /* Parse the FASTA file into an array of structures */ for (i = 0, j = 0, k = 0; i < filesize; i++) { /* Begining of FASTA record */ if (file[i] == '>') { if (j % ARRAY == 0) array = (struct fasta **) xrealloc(array, sizeof(struct fasta *) * (j + ARRAY)); /* Initalise the structure */ array[j] = (struct fasta *) xmalloc(sizeof(struct fasta)); array[j]->id = (char *) xmalloc(sizeof(char)); array[j]->seq = (char *) xmalloc(sizeof(char)); array[j]->numres = 0; /* Read in the ID, terminated by a newline or NULL (end of string) */ i++; while (file[i] != '\0' && file[i] != '\n') { if (k % FILEBUF == 0) array[j]->id = (char *) xrealloc(array[j]->id, sizeof(char) * (FILEBUF + k)); array[j]->id[k++] = file[i++]; } array[j]->id = (char *) xrealloc(array[j]->id, sizeof(char) * k + 1); array[j]->id[k] = '\0'; k = 0; /* read in the sequence, terminated by a '>' */ for (k = 0; file[i] != '\0' && file[i] != '>'; i++) { if (file[i] != '\n') { if (k % FILEBUF == 0) array[j]->seq = (char *) xrealloc(array[j]->seq, sizeof(char) * (FILEBUF + k)); array[j]->seq[k++] = file[i]; } } array[j]->seq = (char *) xrealloc(array[j]->seq, sizeof(char) * k + 1); array[j]->seq[k] = '\0'; k = 0; i--; j++; } } free(file); array = (struct fasta **) xrealloc(array, sizeof(struct fasta *) * (j + 1)); array[j] = NULL; /* find the start and end points for the alignments */ for (i = 0; array[i] != NULL; i++) populate(array[i]); return array; } char ** do_pairwise (struct fasta **array) { int j = 0, i = 0, offset = 0, size; char **output; /* Find the size of the output and preallocate the space in the output * array */ { while (array[i] != NULL) i++; /* Number of IDs, each ID, then the upper corner distances, then for * NULL terminator */ size = 1 + i + (i * i - i) / 2 + 1; output = (char **) xmalloc(sizeof(char *) * size); for (i = 0; i < size - 1; i++) { output[i] = (char *) xmalloc(sizeof(char) * FILEBUF); memset(output[i], 0x00, sizeof(char) * FILEBUF); } output[size - 1] = NULL; } /* The number of IDs */ i = 0; while (array[i] != NULL) i++; jsprintf(output[offset++], "%i", i); /* If this statement is above the other one then it core dumps!? */ /* Each ID */ for (i = 0; array[i] != NULL; i++) jsprintf(output[offset++], "%s", array[i]->id); /* Upper corner pairwise comparisons */ for (i = 0; array[i] != NULL; i++) { for (j = i + 1; array[j] != NULL; j++) { jsprintf(output[offset++], "%f", pairwise(array[i], array[j])); } } return output; } /* Grows buffer dynamically to make input fit inside it. * Requires malloc'd pointer to string. * Returns length of string. * */ int jsprintf (char *str, const char *const fmt, ...) { int i, size; va_list ap; va_start(ap, fmt); str = (char *) realloc(str, sizeof(char) * FILEBUF); for (i = 1; (size = vsnprintf(str, sizeof(char) * FILEBUF * i, fmt, ap)) == -1; i++) { if (i == 100) fatal_error("Reached 100 iterations of vsnprinft, it's probably blown up. Check input"); str = (char *) xrealloc(str, sizeof(char) * FILEBUF * i); } str = (char *) xrealloc(str, sizeof(char) * size + 1); va_end(ap); return size; }