Main Page | Alphabetical List | Data Structures | File List | Data Fields | Globals

pairwise.c

Go to the documentation of this file.
00001 /* Pairwise identity program.
00002  *
00003  * $Id
00004  *
00005  * Reads in a gapped alignment sequences in a FASTA file format.
00006  * Assume gaps are represented by '-' characters.
00007  *
00008  * For each pair calculates the sequence percentage identity, see comments
00009  * for pairwise() for details.
00010  *
00011  * Output is in the format for OC to read in, i.e. the number of sequences,
00012  * followed by the sequence ID's, followed by the pairwise comparisons.
00013  *
00014  * Thu Dec  5 14:38:43 GMT 2002 Jon
00015  * Added checking for eqilength sequences
00016  *
00017  * Thu Jul 24 12:14:04 BST 2003 Jon
00018  * Moved FASTA reading into read_fasta function
00019  * Checked into CVS
00020  *
00021  * */
00022 
00023 #include <stdio.h>
00024 #include <stdlib.h>
00025 #include <errno.h>
00026 #include <string.h>
00027 #include "pairwise.h"
00028 
00029 #define ARRAY 50
00030 #define FILEBUF 1000
00031 
00032 /* Macro's added by Patrick, mucho neato, aparently ifndef aren't POSIX
00033  * anymore */
00034 #if !defined(MAX)
00035 #define MAX(A, B)       ((A) > (B) ? (A) : (B))
00036 #endif
00037 
00038 #if !defined(MIN)
00039 #define MIN(A, B)       ((A) < (B) ? (A) : (B))
00040 #endif
00041 
00042 /* Macro's added by Jon, allows checking of NULL pointers from memory
00043  * allocations without function calls, but how can you return values? */
00044 #if !defined(MALLOC)
00045 #define MALLOC(PTR, SIZE) do { PTR = malloc(SIZE); if (PTR == NULL) fatal_sys_error("malloc returned NULL"); } while (0)
00046 #endif
00047 
00048 #if !defined(REALLOC)
00049 #define REALLOC(PTR, SIZE) do { PTR = realloc(PTR, SIZE); if (PTR == NULL && SIZE != 0) fatal_sys_error("realloc returned NULL"); } while (0)
00050 #endif
00051 
00052 void
00053 fatal_error (char *message) {
00054         printf(message);
00055         exit(EXIT_FAILURE);
00056 }
00057 
00058 void
00059 fatal_sys_error (char *message) {
00060         perror(message);
00061         exit(EXIT_FAILURE);
00062 }
00063 
00064 FILE *
00065 xfopen (const char *path, const char *mode) {
00066         FILE *fh;
00067         fh = fopen(path, mode);
00068         if (fh == NULL)
00069                 fatal_sys_error("fopen returned NULL");
00070         return fh;
00071 }
00072 
00073 void *
00074 xmalloc (size_t size) {
00075         void *ptr;
00076         ptr = (void *) malloc(size);
00077         if (ptr == NULL)
00078                 fatal_sys_error("malloc returned NULL");
00079         return ptr;
00080 }
00081 
00082 void *
00083 xrealloc (void *ptr, size_t size) {
00084         ptr = (void *) realloc(ptr, size);
00085         if (ptr == NULL && size != 0)
00086                 fatal_sys_error("realloc returned NULL");
00087         return ptr;
00088 }
00089 
00090 /* If the same resdidue return 1 else return 0 */
00091 int 
00092 diff (char a, char b) {
00093         if (a == b && a == '-')
00094                 return 0;
00095         if (a == b)
00096                 return 1;
00097         return 0;
00098 }
00099 
00100 /* Does the pairwise comparison
00101  * Compares the regions of the alignment that overlap, common gaps are
00102  * ignored, and the number of common residues divided by the length of
00103  * the shortest sequence is the identity.
00104  * */
00105 float
00106 pairwise (struct fasta *a, struct fasta *b) {
00107         float result;
00108         int start, end, numres, i, id = 0;
00109         
00110         /* If the sequences don't overlap then the seq ID is 0 */
00111         if (a->end < b->start || a->start > b->end)
00112                 return 0;
00113 
00114         start = MIN( a->start, b->start );
00115         end = MAX( a->end, b->end );
00116         numres = MAX( a->numres, b->numres );
00117 
00118         for (i = start; i < end; i++)
00119                 id += diff(a->seq[i], b->seq[i]);
00120 
00121         result = 100 * (float) id / (float) numres;
00122 
00123         return result;
00124 }
00125 
00126 /* "Populate" the rest of the fasta structure with information
00127  * Gets the start position of the sequence (first non-gap character)
00128  * Gets the end position (the last non-gap character)
00129  * Gets the number of residues in the sequence (that aren't gaps)
00130  * Gaps are represented by '-'
00131  * */
00132 void
00133 populate (struct fasta *a) {
00134         int i;
00135         int len = strlen(a->seq);
00136 
00137         a->numres = 0;
00138         
00139         for (i = 0; i <= len; i++) {
00140                 if (a->seq[i] != '-') {
00141                         a->start = i;
00142                         break;
00143                 }
00144         }
00145         for (i = len; i > 0; i--) {
00146                 if (a->seq[i] != '-') {
00147                         a->end = i;
00148                         break;
00149                 }
00150         }
00151         for (i = a->start; i < a->end; i++) {
00152                 if (a->seq[i] != '-') {
00153                         a->numres++;
00154                 }
00155         }
00156 }
00157 
00158 /* Makes sure that all of the sequences are the same length */
00159 void
00160 check_length (struct fasta **array) {
00161         int i, length;
00162 
00163         if (array[0] != NULL)
00164                 length = strlen(array[0]->seq);
00165         else {
00166                 fprintf(stderr, "check_length() not passed an array of fasta structs\n");
00167                 return;
00168         }
00169 
00170         for (i = 0; array[i] != NULL; i++) {
00171                 if (length != strlen(array[i]->seq)) {
00172                         fatal_error("Not all of the sequences are the same length\n");
00173                 }
00174         }
00175 }
00176 
00177 /* Reads in a FASTA file and returns an array of fasta structures */
00178 struct fasta **
00179 read_fasta (FILE *fh) {
00180         int i, j, k, c, filesize = 1;
00181         char *file;
00182         struct fasta **array;
00183 
00184         array = (struct fasta **) xmalloc(ARRAY * sizeof(struct fasta *));
00185         file = (char *) xmalloc(sizeof(char));
00186 
00187         /* Allocate initial space for the file */
00188         file = xrealloc(file, sizeof(char) * filesize);
00189         file[filesize] = '\0';
00190 
00191         /* Read in the file */
00192         while ((c = getc(fh)) != EOF) {
00193                 if (filesize % FILEBUF)
00194                         file = xrealloc(file, sizeof(char) * (FILEBUF + filesize));
00195 
00196                 file[filesize] = c;
00197                 filesize++;
00198         }
00199 
00200         /* Parse the FASTA file into an array of structures */
00201         for (i = 0, j = 0, k = 0; i < filesize; i++) {
00202                 if (file[i] == '>') {
00203                         if (j % ARRAY == 0)
00204                                 array = (struct fasta **) xrealloc(array, sizeof(struct fasta *) * (j + ARRAY));
00205 
00206                         array[j] = (struct fasta *) xmalloc(sizeof(struct fasta));
00207                         array[j]->id = (char *) xmalloc(sizeof(char));
00208                         array[j]->seq = (char *) xmalloc(sizeof(char));
00209                         array[j]->numres = 0;
00210 
00211                         i++;
00212                         while (file[i] != '\0' && file[i] != '\n') {
00213                                 if (k % ARRAY == 0)
00214                                         array[j]->id = (char *) xrealloc(array[j]->id, sizeof(char) * (ARRAY + k));
00215                                 array[j]->id[k] = file[i];
00216                                 k++; i++;
00217                         }
00218                         array[j]->id = (char *) xrealloc(array[j]->id, sizeof(char) * (ARRAY + k));
00219                         array[j]->id[k] = '\0';
00220                         k = 0;
00221 
00222                         while (file[i] != '\0' && file[i] != '>') {
00223                                 if (file[i] == '\n') {
00224                                         i++;
00225                                         continue;
00226                                 }
00227                                 if (k % ARRAY == 0)
00228                                         array[j]->seq = (char *) xrealloc(array[j]->seq, sizeof(char) * (ARRAY + k));
00229                                 array[j]->seq[k] = file[i];
00230                                 k++; i++;
00231                         }
00232                         array[j]->seq = (char *) xrealloc(array[j]->seq, sizeof(char) * (ARRAY + k));
00233                         array[j]->seq[k] = '\0';
00234                         k = 0;
00235                         i--;
00236                         j++;
00237                 }
00238         }
00239 
00240         free(file);
00241         array = (struct fasta **) xrealloc(array, sizeof(struct fasta *) * j);
00242         array[j] = NULL;
00243 
00244         /* find the start and end points for the alignments */
00245         for (i = 0; array[i] != NULL; i++) {
00246                 populate(array[i]);
00247         }
00248 
00249         return array;
00250 }
00251 
00252 int
00253 main (int argc, char **argv) {
00254         FILE *fh;
00255         struct fasta **array;
00256         int i = 0, j;
00257 
00258         /* Read in the FASTA file */
00259         if (argc == 2) 
00260                 fh = xfopen(argv[1], "r");
00261         else
00262                 fh = stdin;
00263 
00264         array = read_fasta(fh);
00265         fclose(fh);
00266 
00267         check_length(array);
00268         
00269         /* start the OC output */
00270         while (array[i] != NULL)
00271                 i++;
00272 
00273         fprintf(stdout, "%i\n", i);
00274 
00275         for (i = 0; array[i] != NULL; i++)
00276                 fprintf(stdout, "%s\n", array[i]->id);
00277         
00278         /* do the pairwise comparison */
00279         for (i = 0; array[i] != NULL; i++) {
00280                 for (j = i + 1; array[j] != NULL; j++) {
00281                         printf("%f\n", pairwise(array[i], array[j]));
00282                 }
00283         }
00284 
00285         return EXIT_SUCCESS;
00286 }

Generated on Thu Jul 24 12:17:49 2003 for pairwise by doxygen 1.3.2