X-Git-Url: http://source.jalview.org/gitweb/?p=jpred.git;a=blobdiff_plain;f=sources%2Fsov%2Fsov.c;fp=sources%2Fsov%2Fsov.c;h=d4f4ae3f33f98732f309ddc32031c08d72aaa60c;hp=0000000000000000000000000000000000000000;hb=9aa768067094f24f46f273077f867348e6143711;hpb=eb3001dc41bf6cd46e20fd13fe3efbe9dedf6013 diff --git a/sources/sov/sov.c b/sources/sov/sov.c new file mode 100644 index 0000000..d4f4ae3 --- /dev/null +++ b/sources/sov/sov.c @@ -0,0 +1,618 @@ +/*----------------------------------------------------------- +/ +/ Program: sov.c +/ +/ Secondary structure prediction accuracy evaluation +/ +/ SOV (Segment OVerlap) measure +/ +/ Copyright by Adam Zemla (11/16/1996) +/ Email: adamz@llnl.gov +/ +/------------------------------------------------------------ +/ +/ Compile: cc sov.c -o sov -lm +/ +/------------------------------------------------------------*/ +#include +#include +#include +#include + +#define MAXRES 2000 + +typedef struct +{ + int input; + int order; + int q3_what; + int sov_what; + int sov_method; + float sov_delta; + float sov_delta_s; + int sov_out; + char fname[80]; +} parameters; + +char *letter_AA = "ARNDCQEGHILKMFPSTWYV-?X"; /* 23 chars */ + +void default_parameters (parameters *); +int read_aa_osec_psec (char[MAXRES], char[MAXRES], char[MAXRES], parameters *, char *); +float sov (int, char[MAXRES], char[MAXRES], parameters *); +float q3 (int, char[MAXRES], char[MAXRES], parameters *); +int check_aa (char, char *, int); + +int +main (int argc, char *argv[]) +{ + int i, n_aa, sov_method; + char c, aa[MAXRES], osec[MAXRES], psec[MAXRES]; + parameters pdata; + float out0, out1, out2, out3; + + if (argc < 2) { + printf (" Usage: sov \n"); + printf (" HELP: sov -h\n"); + exit (0); + } + if (!strncmp (argv[1], "-h\0", 2) || !strncmp (argv[1], "help\0", 5) || !strncmp (argv[1], "-help\0", 6)) { + system ("more ./README.sov"); + printf ("\n"); + exit (0); + } + + default_parameters (&pdata); + + strcpy (pdata.fname, argv[1]); + + n_aa = read_aa_osec_psec (aa, osec, psec, &pdata, letter_AA); + + if (pdata.input == 1) { + n_aa = read_aa_osec_psec (aa, osec, psec, &pdata, letter_AA); + } + + if (pdata.order == 1) { + for (i = 0; i < n_aa; i++) { + c = osec[i]; + osec[i] = psec[i]; + psec[i] = c; + } + } + + if (n_aa <= 0) { + printf ("\n ERROR! There is no 'AA OSEC PSEC' data in submited prediction."); + printf ("\n The submission should contain an observed and predicted"); + printf ("\n secondary structure in COLUMN format.\n"); + exit (0); + } + + printf ("\n\n SECONDARY STRUCTURE PREDICTION"); + printf ("\n NUMBER OF RESIDUES PREDICTED: LENGTH = %d", n_aa); + printf ("\n AA OSEC PSEC NUM"); + for (i = 0; i < n_aa; i++) { + printf ("\n %1c %1c %1c %4d", aa[i], osec[i], psec[i], i + 1); + } + printf ("\n -----------------------\n"); + printf ("\n SECONDARY STRUCTURE PREDICTION ACCURACY EVALUATION. N_AA = %4d\n", n_aa); + if (pdata.sov_out >= 1) { + printf ("\n SOV parameters: DELTA = %5.2f DELTA-S = %5.2f\n", pdata.sov_delta, pdata.sov_delta_s); + } + + printf ("\n ALL HELIX STRAND COIL\n"); + + pdata.q3_what = 0; + out0 = q3 (n_aa, osec, psec, &pdata); + pdata.q3_what = 1; + out1 = q3 (n_aa, osec, psec, &pdata); + pdata.q3_what = 2; + out2 = q3 (n_aa, osec, psec, &pdata); + pdata.q3_what = 3; + out3 = q3 (n_aa, osec, psec, &pdata); + printf ("\n Q3 : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); + printf ("\n"); + + sov_method = pdata.sov_method; + + if (sov_method != 0) + pdata.sov_method = 1; + + if (pdata.sov_method == 1) { + pdata.sov_what = 0; + out0 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 1; + out1 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 2; + out2 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 3; + out3 = sov (n_aa, osec, psec, &pdata); + printf ("\n SOV : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); + printf ("\n"); + } + + if (sov_method != 1) + pdata.sov_method = 0; + + if (pdata.sov_method == 0) { + pdata.sov_delta = 1.0; + + pdata.sov_what = 0; + out0 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 1; + out1 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 2; + out2 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 3; + out3 = sov (n_aa, osec, psec, &pdata); + printf ("\n SOV (1994 JMB. [delta=50%%]): %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); + + pdata.sov_delta = 0.0; + + pdata.sov_what = 0; + out0 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 1; + out1 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 2; + out2 = sov (n_aa, osec, psec, &pdata); + pdata.sov_what = 3; + out3 = sov (n_aa, osec, psec, &pdata); + printf ("\n SOV (1994 JMB. [delta=0]) : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0); + + printf ("\n"); + } + + printf ("\n -----------------------\n"); + + exit (0); +} + +/*----------------------------------------------------------- +/ +/ check_aa - checks an amino acid +/ +/------------------------------------------------------------*/ +int +check_aa (char token, char *letter, int n) +{ + int i; + + for (i = 0; i < n; i++) { + if (letter[i] == token) + return i; + } + return n; +} + +/*----------------------------------------------------------- +/ +/ read_aa_osec_psec - read secondary structure segments file +/ +/------------------------------------------------------------*/ +int +read_aa_osec_psec (char aa[MAXRES], char sss1[MAXRES], char sss2[MAXRES], parameters * pdata, char *letter) +{ + int i, j, coil, n_aa, n_aa_1, n_aa_2, n_aa_3, f_seq; + float x; + char line[1000], keyword[250], first[250], second[250], third[250], junk[250]; + FILE *fp; + + if ((fp = fopen (pdata->fname, "r")) == NULL) { + printf ("\n# error opening file %s for read\n\n", pdata->fname); + exit (0); + } + + f_seq = 0; + pdata->input = 0; + n_aa = 0; + n_aa_1 = 0; + n_aa_2 = 0; + n_aa_3 = 0; + coil = 0; + + while (fgets (line, 1000, fp) != NULL) { + strcpy (keyword, " "); + strcpy (first, " "); + strcpy (second, " "); + strcpy (third, " "); + strcpy (junk, " "); + i = 0; + j = 0; + while (line[i] == ' ' && line[i] != '\n' && line[i] != '\0' && i < 250) + i++; + if (i < 250) { + j = i; + while (line[i] != ' ' && line[i] != '\n' && line[i] != '\0' && i < 250) + i++; + } + j = i - j; + if (j < 250 && j > 0) { + sscanf (line, "%s", keyword); + } + if (!strncmp (keyword, "#", 1)) { + } else if (!strncmp (keyword, "-----", 5)) { + } else if (!strncmp (keyword, "NUMBER\0", 7)) { + } else if (!strncmp (keyword, "SECONDARY\0", 10)) { + } else if (!strncmp (keyword, "END\0", 4) && f_seq == 0) { + fclose (fp); + return n_aa; + } else if (!strncmp (keyword, "AA-OSEC-PSEC\0", 13)) { + printf ("%s", line); + sscanf (line, "%s %s", keyword, first); + strcpy (pdata->fname, first); + pdata->input = 1; + } else if (line[0] == '\n' || !strncmp (keyword, " \0", 4)) { + } else if (!strncmp (keyword, "AA\0", 3) && f_seq == 0) { + sscanf (line, "%s %s %s", keyword, first, second); + if (!strncmp (keyword, "AA\0", 3) && !strncmp (first, "PSEC\0", 5) && !strncmp (second, "OSEC\0", 5)) { + pdata->order = 1; + } + } else if (!strncmp (keyword, "SOV-DELTA\0", 10)) { + printf ("%s", line); + sscanf (line, "%s %f", keyword, &x); + pdata->sov_delta = x; + } else if (!strncmp (keyword, "SOV-DELTA-S\0", 12)) { + printf ("%s", line); + sscanf (line, "%s %f", keyword, &x); + pdata->sov_delta_s = x; + } else if (!strncmp (keyword, "SOV-METHOD\0", 9)) { + printf ("%s", line); + sscanf (line, "%s %d", keyword, &i); + pdata->sov_method = i; + } else if (!strncmp (keyword, "SOV-OUTPUT\0", 9)) { + printf ("%s", line); + sscanf (line, "%s %d", keyword, &i); + pdata->sov_out = i; + } else if (line[0] == '>') { + printf ("%s", line); + if (f_seq < 2) + n_aa = 0; + f_seq++; + } else if (f_seq == 0) { + if (j > 1) { + if (!strncmp (keyword, "SSP\0", 4)) { + sscanf (line, "%s %s %s %s %s", keyword, junk, first, second, third); + } else { + printf ("\n ERROR! (line: %d) Check COLUMN format of your prediction!\n", n_aa + 1); + fclose (fp); + exit (0); + } + } else { + sscanf (line, "%s %s %s", first, second, third); + } + aa[n_aa] = first[0]; + sss1[n_aa] = second[0]; + sss2[n_aa] = third[0]; + if (check_aa (aa[n_aa], letter, 23) == 23) { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! (line: %d) Check amino acid code %c\n", n_aa + 1, aa[n_aa]); + fclose (fp); + exit (0); + } + if (sss1[n_aa] == ' ' || sss2[n_aa] == ' ') { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! (line: %d) Check secondary structure code\n", n_aa + 1); + fclose (fp); + exit (0); + } + if (sss1[n_aa] == 'L') { + sss1[n_aa] = 'C'; + if (coil == 0) { + printf ("\n# WARNING! (line: %d) The 'L' characters are interpreted as 'C' (coil)", n_aa + 1); + coil = 1; + } + } + if (sss2[n_aa] == 'L') { + sss2[n_aa] = 'C'; + if (coil == 0) { + printf ("\n# WARNING! (line: %d) The 'L' characters are interpreted as 'C' (coil)", n_aa + 1); + coil = 1; + } + } + if (sss1[n_aa] != 'C' && sss1[n_aa] != 'E' && sss1[n_aa] != 'H') { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! (line: %d) Check secondary structure code %c\n", n_aa + 1, sss1[n_aa]); + fclose (fp); + exit (0); + } + if (sss2[n_aa] != 'C' && sss2[n_aa] != 'E' && sss2[n_aa] != 'H') { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! (line: %d) Check secondary structure code %c\n", n_aa + 1, sss2[n_aa]); + fclose (fp); + exit (0); + } + n_aa++; + if (n_aa >= MAXRES) { + printf ("\n# ERROR! Check number of amino acid lines. (MAX = %d)\n\n", MAXRES); + fclose (fp); + exit (0); + } + } else if (f_seq == 1) { + i = 0; + while (line[i] != '\n') { + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) { + aa[n_aa] = 'X'; + sss1[n_aa] = line[i]; + if (sss1[n_aa] == 'L') { + sss1[n_aa] = 'C'; + if (coil == 0) { + printf ("\n# WARNING! The 'L' characters are interpreted as 'C' (coil)"); + coil = 1; + } + } + if (sss1[n_aa] != 'C' && sss1[n_aa] != 'E' && sss1[n_aa] != 'H') { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! Check secondary structure code: %c\n", sss1[n_aa]); + fclose (fp); + exit (0); + } + n_aa++; + if (n_aa >= MAXRES) { + printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES); + fclose (fp); + exit (0); + } + } + i++; + } + n_aa_1 = n_aa; + } else if (f_seq == 2) { + i = 0; + while (line[i] != '\n') { + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) { + aa[n_aa] = 'X'; + sss2[n_aa] = line[i]; + if (sss2[n_aa] == 'L') { + sss2[n_aa] = 'C'; + if (coil == 0) { + printf ("\n# WARNING! The 'L' characters are interpreted as 'C' (coil)"); + coil = 1; + } + } + if (sss2[n_aa] != 'C' && sss2[n_aa] != 'E' && sss2[n_aa] != 'H') { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! Check secondary structure code: %c\n", sss2[n_aa]); + fclose (fp); + exit (0); + } + n_aa++; + if (n_aa >= MAXRES) { + printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES); + fclose (fp); + exit (0); + } + } + i++; + } + n_aa_2 = n_aa; + } else if (f_seq == 3) { + i = 0; + while (line[i] != '\n') { + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) { + aa[n_aa_3] = line[i]; + if (check_aa (aa[n_aa_3], letter, 23) == 23) { + printf ("\n# ERROR!\n%s", line); + printf ("\n# ERROR! (N_res: %d) Check amino acid code %c\n", n_aa_3 + 1, aa[n_aa_3]); + fclose (fp); + exit (0); + } + n_aa_3++; + if (n_aa_3 >= MAXRES) { + printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES); + fclose (fp); + exit (0); + } + } + i++; + } + } + } + if (n_aa_1 != n_aa_2) { + printf ("\n# ERROR! Check format of your submission.\n"); + fclose (fp); + exit (0); + } + return n_aa; +} + +/*------------------------------------------------------------- +/ +/ default_parameters - default parameters for SOV program +/ +/------------------------------------------------------------*/ +void +default_parameters (parameters * pdata) +{ + pdata->input = 0; + pdata->order = 0; + pdata->sov_method = 1; + pdata->sov_delta = 1.0; + pdata->sov_delta_s = 0.5; + pdata->sov_out = 0; + + return; +} + +/*----------------------------------------------------------- +/ +/ sov - evaluate SSp by the Segment OVerlap quantity (SOV) +/ Input: secondary structure segments +/ +/------------------------------------------------------------*/ +float +sov (int n_aa, char sss1[MAXRES], char sss2[MAXRES], parameters * pdata) +{ + int i, k, length1, length2, beg_s1, end_s1, beg_s2, end_s2; + int j1, j2, k1, k2, minov, maxov, d, d1, d2, n, multiple; + char s1, s2, sse[3]; + float out; + double s, x; + + sse[0] = '#'; + sse[1] = '#'; + sse[2] = '#'; + + if (pdata->sov_what == 0) { + sse[0] = 'H'; + sse[1] = 'E'; + sse[2] = 'C'; + } + if (pdata->sov_what == 1) { + sse[0] = 'H'; + sse[1] = 'H'; + sse[2] = 'H'; + } + if (pdata->sov_what == 2) { + sse[0] = 'E'; + sse[1] = 'E'; + sse[2] = 'E'; + } + if (pdata->sov_what == 3) { + sse[0] = 'C'; + sse[1] = 'C'; + sse[2] = 'C'; + } + n = 0; + for (i = 0; i < n_aa; i++) { + s1 = sss1[i]; + if (s1 == sse[0] || s1 == sse[1] || s1 == sse[2]) { + n++; + } + } + out = 0.0; + s = 0.0; + length1 = 0; + length2 = 0; + i = 0; + while (i < n_aa) { + beg_s1 = i; + s1 = sss1[i]; + while (sss1[i] == s1 && i < n_aa) { + i++; + } + end_s1 = i - 1; + length1 = end_s1 - beg_s1 + 1; + multiple = 0; + k = 0; + while (k < n_aa) { + beg_s2 = k; + s2 = sss2[k]; + while (sss2[k] == s2 && k < n_aa) { + k++; + } + end_s2 = k - 1; + length2 = end_s2 - beg_s2 + 1; + if (s1 == sse[0] || s1 == sse[1] || s1 == sse[2]) { + if (s1 == s2 && end_s2 >= beg_s1 && beg_s2 <= end_s1) { + if (multiple > 0 && pdata->sov_method == 1) { + n = n + length1; + } + multiple++; + if (beg_s1 > beg_s2) { + j1 = beg_s1; + j2 = beg_s2; + } else { + j1 = beg_s2; + j2 = beg_s1; + } + if (end_s1 < end_s2) { + k1 = end_s1; + k2 = end_s2; + } else { + k1 = end_s2; + k2 = end_s1; + } + minov = k1 - j1 + 1; + maxov = k2 - j2 + 1; + d1 = floor (length1 * pdata->sov_delta_s); + d2 = floor (length2 * pdata->sov_delta_s); + if (d1 > d2) + d = d2; + if (d1 <= d2 || pdata->sov_method == 0) + d = d1; + if (d > minov) { + d = minov; + } + if (d > maxov - minov) { + d = maxov - minov; + } + x = pdata->sov_delta * d; + x = (minov + x) * length1; + if (maxov > 0) { + s = s + x / maxov; + } else { + printf ("\n ERROR! minov = %-4d maxov = %-4d length = %-4d d = %-4d %4d %4d %4d %4d", minov, maxov, length1, d, beg_s1 + 1, end_s1 + 1, beg_s2 + 1, end_s2 + 1); + } + if (pdata->sov_out == 2) { + printf ("\n TEST: minov = %-4d maxov = %-4d length = %-4d d = %-4d %4d %4d %4d %4d", minov, maxov, length1, d, beg_s1 + 1, end_s1 + 1, beg_s2 + 1, end_s2 + 1); + } + } + } + } + } + if (pdata->sov_out == 2) { + printf ("\n TEST: Number of considered residues = %d", n); + } + if (n > 0) { + out = s / n; + } else { + out = 1.0; + } + return out; +} + +/*----------------------------------------------------------- +/ +/ Q3 - evaluate SSp by the residues predicted correctly (Q3) +/ Input: secondary structure segments +/ +/------------------------------------------------------------*/ +float +q3 (int n_aa, char sss1[MAXRES], char sss2[MAXRES], parameters * pdata) +{ + int i, n; + float out; + char s, sse[3]; + + sse[0] = '#'; + sse[1] = '#'; + sse[2] = '#'; + + if (pdata->q3_what == 0) { + sse[0] = 'H'; + sse[1] = 'E'; + sse[2] = 'C'; + } + if (pdata->q3_what == 1) { + sse[0] = 'H'; + sse[1] = 'H'; + sse[2] = 'H'; + } + if (pdata->q3_what == 2) { + sse[0] = 'E'; + sse[1] = 'E'; + sse[2] = 'E'; + } + if (pdata->q3_what == 3) { + sse[0] = 'C'; + sse[1] = 'C'; + sse[2] = 'C'; + } + + n = 0; + out = 0.0; + for (i = 0; i < n_aa; i++) { + s = sss1[i]; + if (s == sse[0] || s == sse[1] || s == sse[2]) { + n++; + if (sss1[i] == sss2[i]) { + out = out + 1.0; + } + } + } + if (n > 0) { + out = out / n; + } else { + out = 1.0; + } + + return out; +}