/*----------------------------------------------------------- / / Program: sov.c / / Secondary structure prediction accuracy evaluation / / SOV (Segment OVerlap) measure / / Copyright by Adam Zemla (11/16/1996) / Email: adamz@llnl.gov / /------------------------------------------------------------ / / Compile: cc sov.c -o sov -lm / /------------------------------------------------------------*/ #include #include #include #include #define MAXRES 2000 typedef struct { int input; int order; int q3_what; int sov_what; int sov_method; float sov_delta; float sov_delta_s; int sov_out; char fname[80]; } parameters; char *letter_AA = "ARNDCQEGHILKMFPSTWYV-?X"; /* 23 chars */ void default_parameters (parameters *); int read_aa_osec_psec (char[MAXRES], char[MAXRES], char[MAXRES], parameters *, char *); float sov (int, char[MAXRES], char[MAXRES], parameters *); float q3 (int, char[MAXRES], char[MAXRES], parameters *); int check_aa (char, char *, int); int main (int argc, char *argv[]) { int i, n_aa, sov_method; char c, aa[MAXRES], osec[MAXRES], psec[MAXRES]; parameters pdata; float out0, out1, out2, out3; if (argc < 2) { printf (" Usage: sov \n"); printf (" HELP: sov -h\n"); exit (0); } if (!strncmp (argv[1], "-h\0", 2) || !strncmp (argv[1], "help\0", 5) || !strncmp (argv[1], "-help\0", 6)) { system ("more ./README.sov"); printf ("\n"); exit (0); } default_parameters (&pdata); strcpy (pdata.fname, argv[1]); n_aa = read_aa_osec_psec (aa, osec, psec, &pdata, letter_AA); if (pdata.input == 1) { n_aa = read_aa_osec_psec (aa, osec, psec, &pdata, letter_AA); } if (pdata.order == 1) { for (i = 0; i < n_aa; i++) { c = osec[i]; osec[i] = psec[i]; psec[i] = c; } } if (n_aa <= 0) { printf ("\n ERROR! There is no 'AA OSEC PSEC' data in submited prediction."); printf ("\n The submission should contain an observed and predicted"); printf ("\n secondary structure in COLUMN format.\n"); exit (0); } printf ("\n\n SECONDARY STRUCTURE PREDICTION"); printf ("\n NUMBER OF RESIDUES PREDICTED: LENGTH = %d", n_aa); printf ("\n AA OSEC PSEC NUM"); for (i = 0; i < n_aa; i++) { printf ("\n %1c %1c %1c %4d", aa[i], osec[i], psec[i], i + 1); } printf ("\n -----------------------\n"); printf ("\n SECONDARY STRUCTURE PREDICTION ACCURACY EVALUATION. N_AA = %4d\n", n_aa); if (pdata.sov_out >= 1) { printf ("\n SOV parameters: DELTA = %5.2f DELTA-S = %5.2f\n", pdata.sov_delta, pdata.sov_delta_s); } printf ("\n ALL HELIX STRAND COIL\n"); pdata.q3_what = 0; out0 = q3 (n_aa, osec, psec, &pdata); pdata.q3_what = 1; out1 = q3 (n_aa, osec, psec, &pdata); pdata.q3_what = 2; out2 = q3 (n_aa, osec, psec, &pdata); pdata.q3_what = 3; out3 = q3 (n_aa, osec, psec, &pdata); printf ("\n Q3 : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); printf ("\n"); sov_method = pdata.sov_method; if (sov_method != 0) pdata.sov_method = 1; if (pdata.sov_method == 1) { pdata.sov_what = 0; out0 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 1; out1 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 2; out2 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 3; out3 = sov (n_aa, osec, psec, &pdata); printf ("\n SOV : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); printf ("\n"); } if (sov_method != 1) pdata.sov_method = 0; if (pdata.sov_method == 0) { pdata.sov_delta = 1.0; pdata.sov_what = 0; out0 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 1; out1 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 2; out2 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 3; out3 = sov (n_aa, osec, psec, &pdata); printf ("\n SOV (1994 JMB. [delta=50%%]): %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); pdata.sov_delta = 0.0; pdata.sov_what = 0; out0 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 1; out1 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 2; out2 = sov (n_aa, osec, psec, &pdata); pdata.sov_what = 3; out3 = sov (n_aa, osec, psec, &pdata); printf ("\n SOV (1994 JMB. [delta=0]) : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); printf ("\n"); } printf ("\n -----------------------\n"); exit (0); } /*----------------------------------------------------------- / / check_aa - checks an amino acid / /------------------------------------------------------------*/ int check_aa (char token, char *letter, int n) { int i; for (i = 0; i < n; i++) { if (letter[i] == token) return i; } return n; } /*----------------------------------------------------------- / / read_aa_osec_psec - read secondary structure segments file / /------------------------------------------------------------*/ int read_aa_osec_psec (char aa[MAXRES], char sss1[MAXRES], char sss2[MAXRES], parameters * pdata, char *letter) { int i, j, coil, n_aa, n_aa_1, n_aa_2, n_aa_3, f_seq; float x; char line[1000], keyword[250], first[250], second[250], third[250], junk[250]; FILE *fp; if ((fp = fopen (pdata->fname, "r")) == NULL) { printf ("\n# error opening file %s for read\n\n", pdata->fname); exit (0); } f_seq = 0; pdata->input = 0; n_aa = 0; n_aa_1 = 0; n_aa_2 = 0; n_aa_3 = 0; coil = 0; while (fgets (line, 1000, fp) != NULL) { strcpy (keyword, " "); strcpy (first, " "); strcpy (second, " "); strcpy (third, " "); strcpy (junk, " "); i = 0; j = 0; while (line[i] == ' ' && line[i] != '\n' && line[i] != '\0' && i < 250) i++; if (i < 250) { j = i; while (line[i] != ' ' && line[i] != '\n' && line[i] != '\0' && i < 250) i++; } j = i - j; if (j < 250 && j > 0) { sscanf (line, "%s", keyword); } if (!strncmp (keyword, "#", 1)) { } else if (!strncmp (keyword, "-----", 5)) { } else if (!strncmp (keyword, "NUMBER\0", 7)) { } else if (!strncmp (keyword, "SECONDARY\0", 10)) { } else if (!strncmp (keyword, "END\0", 4) && f_seq == 0) { fclose (fp); return n_aa; } else if (!strncmp (keyword, "AA-OSEC-PSEC\0", 13)) { printf ("%s", line); sscanf (line, "%s %s", keyword, first); strcpy (pdata->fname, first); pdata->input = 1; } else if (line[0] == '\n' || !strncmp (keyword, " \0", 4)) { } else if (!strncmp (keyword, "AA\0", 3) && f_seq == 0) { sscanf (line, "%s %s %s", keyword, first, second); if (!strncmp (keyword, "AA\0", 3) && !strncmp (first, "PSEC\0", 5) && !strncmp (second, "OSEC\0", 5)) { pdata->order = 1; } } else if (!strncmp (keyword, "SOV-DELTA\0", 10)) { printf ("%s", line); sscanf (line, "%s %f", keyword, &x); pdata->sov_delta = x; } else if (!strncmp (keyword, "SOV-DELTA-S\0", 12)) { printf ("%s", line); sscanf (line, "%s %f", keyword, &x); pdata->sov_delta_s = x; } else if (!strncmp (keyword, "SOV-METHOD\0", 9)) { printf ("%s", line); sscanf (line, "%s %d", keyword, &i); pdata->sov_method = i; } else if (!strncmp (keyword, "SOV-OUTPUT\0", 9)) { printf ("%s", line); sscanf (line, "%s %d", keyword, &i); pdata->sov_out = i; } else if (line[0] == '>') { printf ("%s", line); if (f_seq < 2) n_aa = 0; f_seq++; } else if (f_seq == 0) { if (j > 1) { if (!strncmp (keyword, "SSP\0", 4)) { sscanf (line, "%s %s %s %s %s", keyword, junk, first, second, third); } else { printf ("\n ERROR! (line: %d) Check COLUMN format of your prediction!\n", n_aa + 1); fclose (fp); exit (0); } } else { sscanf (line, "%s %s %s", first, second, third); } aa[n_aa] = first[0]; sss1[n_aa] = second[0]; sss2[n_aa] = third[0]; if (check_aa (aa[n_aa], letter, 23) == 23) { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! (line: %d) Check amino acid code %c\n", n_aa + 1, aa[n_aa]); fclose (fp); exit (0); } if (sss1[n_aa] == ' ' || sss2[n_aa] == ' ') { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! (line: %d) Check secondary structure code\n", n_aa + 1); fclose (fp); exit (0); } if (sss1[n_aa] == 'L') { sss1[n_aa] = 'C'; if (coil == 0) { printf ("\n# WARNING! (line: %d) The 'L' characters are interpreted as 'C' (coil)", n_aa + 1); coil = 1; } } if (sss2[n_aa] == 'L') { sss2[n_aa] = 'C'; if (coil == 0) { printf ("\n# WARNING! (line: %d) The 'L' characters are interpreted as 'C' (coil)", n_aa + 1); coil = 1; } } if (sss1[n_aa] != 'C' && sss1[n_aa] != 'E' && sss1[n_aa] != 'H') { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! (line: %d) Check secondary structure code %c\n", n_aa + 1, sss1[n_aa]); fclose (fp); exit (0); } if (sss2[n_aa] != 'C' && sss2[n_aa] != 'E' && sss2[n_aa] != 'H') { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! (line: %d) Check secondary structure code %c\n", n_aa + 1, sss2[n_aa]); fclose (fp); exit (0); } n_aa++; if (n_aa >= MAXRES) { printf ("\n# ERROR! Check number of amino acid lines. (MAX = %d)\n\n", MAXRES); fclose (fp); exit (0); } } else if (f_seq == 1) { i = 0; while (line[i] != '\n') { if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) { aa[n_aa] = 'X'; sss1[n_aa] = line[i]; if (sss1[n_aa] == 'L') { sss1[n_aa] = 'C'; if (coil == 0) { printf ("\n# WARNING! The 'L' characters are interpreted as 'C' (coil)"); coil = 1; } } if (sss1[n_aa] != 'C' && sss1[n_aa] != 'E' && sss1[n_aa] != 'H') { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! Check secondary structure code: %c\n", sss1[n_aa]); fclose (fp); exit (0); } n_aa++; if (n_aa >= MAXRES) { printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES); fclose (fp); exit (0); } } i++; } n_aa_1 = n_aa; } else if (f_seq == 2) { i = 0; while (line[i] != '\n') { if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) { aa[n_aa] = 'X'; sss2[n_aa] = line[i]; if (sss2[n_aa] == 'L') { sss2[n_aa] = 'C'; if (coil == 0) { printf ("\n# WARNING! The 'L' characters are interpreted as 'C' (coil)"); coil = 1; } } if (sss2[n_aa] != 'C' && sss2[n_aa] != 'E' && sss2[n_aa] != 'H') { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! Check secondary structure code: %c\n", sss2[n_aa]); fclose (fp); exit (0); } n_aa++; if (n_aa >= MAXRES) { printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES); fclose (fp); exit (0); } } i++; } n_aa_2 = n_aa; } else if (f_seq == 3) { i = 0; while (line[i] != '\n') { if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) { aa[n_aa_3] = line[i]; if (check_aa (aa[n_aa_3], letter, 23) == 23) { printf ("\n# ERROR!\n%s", line); printf ("\n# ERROR! (N_res: %d) Check amino acid code %c\n", n_aa_3 + 1, aa[n_aa_3]); fclose (fp); exit (0); } n_aa_3++; if (n_aa_3 >= MAXRES) { printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES); fclose (fp); exit (0); } } i++; } } } if (n_aa_1 != n_aa_2) { printf ("\n# ERROR! Check format of your submission.\n"); fclose (fp); exit (0); } return n_aa; } /*------------------------------------------------------------- / / default_parameters - default parameters for SOV program / /------------------------------------------------------------*/ void default_parameters (parameters * pdata) { pdata->input = 0; pdata->order = 0; pdata->sov_method = 1; pdata->sov_delta = 1.0; pdata->sov_delta_s = 0.5; pdata->sov_out = 0; return; } /*----------------------------------------------------------- / / sov - evaluate SSp by the Segment OVerlap quantity (SOV) / Input: secondary structure segments / /------------------------------------------------------------*/ float sov (int n_aa, char sss1[MAXRES], char sss2[MAXRES], parameters * pdata) { int i, k, length1, length2, beg_s1, end_s1, beg_s2, end_s2; int j1, j2, k1, k2, minov, maxov, d, d1, d2, n, multiple; char s1, s2, sse[3]; float out; double s, x; sse[0] = '#'; sse[1] = '#'; sse[2] = '#'; if (pdata->sov_what == 0) { sse[0] = 'H'; sse[1] = 'E'; sse[2] = 'C'; } if (pdata->sov_what == 1) { sse[0] = 'H'; sse[1] = 'H'; sse[2] = 'H'; } if (pdata->sov_what == 2) { sse[0] = 'E'; sse[1] = 'E'; sse[2] = 'E'; } if (pdata->sov_what == 3) { sse[0] = 'C'; sse[1] = 'C'; sse[2] = 'C'; } n = 0; for (i = 0; i < n_aa; i++) { s1 = sss1[i]; if (s1 == sse[0] || s1 == sse[1] || s1 == sse[2]) { n++; } } out = 0.0; s = 0.0; length1 = 0; length2 = 0; i = 0; while (i < n_aa) { beg_s1 = i; s1 = sss1[i]; while (sss1[i] == s1 && i < n_aa) { i++; } end_s1 = i - 1; length1 = end_s1 - beg_s1 + 1; multiple = 0; k = 0; while (k < n_aa) { beg_s2 = k; s2 = sss2[k]; while (sss2[k] == s2 && k < n_aa) { k++; } end_s2 = k - 1; length2 = end_s2 - beg_s2 + 1; if (s1 == sse[0] || s1 == sse[1] || s1 == sse[2]) { if (s1 == s2 && end_s2 >= beg_s1 && beg_s2 <= end_s1) { if (multiple > 0 && pdata->sov_method == 1) { n = n + length1; } multiple++; if (beg_s1 > beg_s2) { j1 = beg_s1; j2 = beg_s2; } else { j1 = beg_s2; j2 = beg_s1; } if (end_s1 < end_s2) { k1 = end_s1; k2 = end_s2; } else { k1 = end_s2; k2 = end_s1; } minov = k1 - j1 + 1; maxov = k2 - j2 + 1; d1 = floor (length1 * pdata->sov_delta_s); d2 = floor (length2 * pdata->sov_delta_s); if (d1 > d2) d = d2; if (d1 <= d2 || pdata->sov_method == 0) d = d1; if (d > minov) { d = minov; } if (d > maxov - minov) { d = maxov - minov; } x = pdata->sov_delta * d; x = (minov + x) * length1; if (maxov > 0) { s = s + x / maxov; } else { printf ("\n ERROR! minov = %-4d maxov = %-4d length = %-4d d = %-4d %4d %4d %4d %4d", minov, maxov, length1, d, beg_s1 + 1, end_s1 + 1, beg_s2 + 1, end_s2 + 1); } if (pdata->sov_out == 2) { printf ("\n TEST: minov = %-4d maxov = %-4d length = %-4d d = %-4d %4d %4d %4d %4d", minov, maxov, length1, d, beg_s1 + 1, end_s1 + 1, beg_s2 + 1, end_s2 + 1); } } } } } if (pdata->sov_out == 2) { printf ("\n TEST: Number of considered residues = %d", n); } if (n > 0) { out = s / n; } else { out = 1.0; } return out; } /*----------------------------------------------------------- / / Q3 - evaluate SSp by the residues predicted correctly (Q3) / Input: secondary structure segments / /------------------------------------------------------------*/ float q3 (int n_aa, char sss1[MAXRES], char sss2[MAXRES], parameters * pdata) { int i, n; float out; char s, sse[3]; sse[0] = '#'; sse[1] = '#'; sse[2] = '#'; if (pdata->q3_what == 0) { sse[0] = 'H'; sse[1] = 'E'; sse[2] = 'C'; } if (pdata->q3_what == 1) { sse[0] = 'H'; sse[1] = 'H'; sse[2] = 'H'; } if (pdata->q3_what == 2) { sse[0] = 'E'; sse[1] = 'E'; sse[2] = 'E'; } if (pdata->q3_what == 3) { sse[0] = 'C'; sse[1] = 'C'; sse[2] = 'C'; } n = 0; out = 0.0; for (i = 0; i < n_aa; i++) { s = sss1[i]; if (s == sse[0] || s == sse[1] || s == sse[2]) { n++; if (sss1[i] == sss2[i]) { out = out + 1.0; } } } if (n > 0) { out = out / n; } else { out = 1.0; } return out; }