--- /dev/null
+/*-----------------------------------------------------------
+/
+/ Program: sov.c
+/
+/ Secondary structure prediction accuracy evaluation
+/
+/ SOV (Segment OVerlap) measure
+/
+/ Copyright by Adam Zemla (11/16/1996)
+/ Email: adamz@llnl.gov
+/
+/------------------------------------------------------------
+/
+/ Compile: cc sov.c -o sov -lm
+/
+/------------------------------------------------------------*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#define MAXRES 2000
+
+typedef struct
+{
+ int input;
+ int order;
+ int q3_what;
+ int sov_what;
+ int sov_method;
+ float sov_delta;
+ float sov_delta_s;
+ int sov_out;
+ char fname[80];
+} parameters;
+
+char *letter_AA = "ARNDCQEGHILKMFPSTWYV-?X"; /* 23 chars */
+
+void default_parameters (parameters *);
+int read_aa_osec_psec (char[MAXRES], char[MAXRES], char[MAXRES], parameters *, char *);
+float sov (int, char[MAXRES], char[MAXRES], parameters *);
+float q3 (int, char[MAXRES], char[MAXRES], parameters *);
+int check_aa (char, char *, int);
+
+int
+main (int argc, char *argv[])
+{
+ int i, n_aa, sov_method;
+ char c, aa[MAXRES], osec[MAXRES], psec[MAXRES];
+ parameters pdata;
+ float out0, out1, out2, out3;
+
+ if (argc < 2) {
+ printf (" Usage: sov <input_data>\n");
+ printf (" HELP: sov -h\n");
+ exit (0);
+ }
+ if (!strncmp (argv[1], "-h\0", 2) || !strncmp (argv[1], "help\0", 5) || !strncmp (argv[1], "-help\0", 6)) {
+ system ("more ./README.sov");
+ printf ("\n");
+ exit (0);
+ }
+
+ default_parameters (&pdata);
+
+ strcpy (pdata.fname, argv[1]);
+
+ n_aa = read_aa_osec_psec (aa, osec, psec, &pdata, letter_AA);
+
+ if (pdata.input == 1) {
+ n_aa = read_aa_osec_psec (aa, osec, psec, &pdata, letter_AA);
+ }
+
+ if (pdata.order == 1) {
+ for (i = 0; i < n_aa; i++) {
+ c = osec[i];
+ osec[i] = psec[i];
+ psec[i] = c;
+ }
+ }
+
+ if (n_aa <= 0) {
+ printf ("\n ERROR! There is no 'AA OSEC PSEC' data in submited prediction.");
+ printf ("\n The submission should contain an observed and predicted");
+ printf ("\n secondary structure in COLUMN format.\n");
+ exit (0);
+ }
+
+ printf ("\n\n SECONDARY STRUCTURE PREDICTION");
+ printf ("\n NUMBER OF RESIDUES PREDICTED: LENGTH = %d", n_aa);
+ printf ("\n AA OSEC PSEC NUM");
+ for (i = 0; i < n_aa; i++) {
+ printf ("\n %1c %1c %1c %4d", aa[i], osec[i], psec[i], i + 1);
+ }
+ printf ("\n -----------------------\n");
+ printf ("\n SECONDARY STRUCTURE PREDICTION ACCURACY EVALUATION. N_AA = %4d\n", n_aa);
+ if (pdata.sov_out >= 1) {
+ printf ("\n SOV parameters: DELTA = %5.2f DELTA-S = %5.2f\n", pdata.sov_delta, pdata.sov_delta_s);
+ }
+
+ printf ("\n ALL HELIX STRAND COIL\n");
+
+ pdata.q3_what = 0;
+ out0 = q3 (n_aa, osec, psec, &pdata);
+ pdata.q3_what = 1;
+ out1 = q3 (n_aa, osec, psec, &pdata);
+ pdata.q3_what = 2;
+ out2 = q3 (n_aa, osec, psec, &pdata);
+ pdata.q3_what = 3;
+ out3 = q3 (n_aa, osec, psec, &pdata);
+ printf ("\n Q3 : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0);
+ printf ("\n");
+
+ sov_method = pdata.sov_method;
+
+ if (sov_method != 0)
+ pdata.sov_method = 1;
+
+ if (pdata.sov_method == 1) {
+ pdata.sov_what = 0;
+ out0 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 1;
+ out1 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 2;
+ out2 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 3;
+ out3 = sov (n_aa, osec, psec, &pdata);
+ printf ("\n SOV : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0);
+ printf ("\n");
+ }
+
+ if (sov_method != 1)
+ pdata.sov_method = 0;
+
+ if (pdata.sov_method == 0) {
+ pdata.sov_delta = 1.0;
+
+ pdata.sov_what = 0;
+ out0 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 1;
+ out1 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 2;
+ out2 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 3;
+ out3 = sov (n_aa, osec, psec, &pdata);
+ printf ("\n SOV (1994 JMB. [delta=50%%]): %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0);
+
+ pdata.sov_delta = 0.0;
+
+ pdata.sov_what = 0;
+ out0 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 1;
+ out1 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 2;
+ out2 = sov (n_aa, osec, psec, &pdata);
+ pdata.sov_what = 3;
+ out3 = sov (n_aa, osec, psec, &pdata);
+ printf ("\n SOV (1994 JMB. [delta=0]) : %6.1f %6.1f %6.1f %6.1f", out0 * 100.0, out1 * 100.0, out2 * 100.0, out3 * 100.0);
+
+ printf ("\n");
+ }
+
+ printf ("\n -----------------------\n");
+
+ exit (0);
+}
+
+/*-----------------------------------------------------------
+/
+/ check_aa - checks an amino acid
+/
+/------------------------------------------------------------*/
+int
+check_aa (char token, char *letter, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ if (letter[i] == token)
+ return i;
+ }
+ return n;
+}
+
+/*-----------------------------------------------------------
+/
+/ read_aa_osec_psec - read secondary structure segments file
+/
+/------------------------------------------------------------*/
+int
+read_aa_osec_psec (char aa[MAXRES], char sss1[MAXRES], char sss2[MAXRES], parameters * pdata, char *letter)
+{
+ int i, j, coil, n_aa, n_aa_1, n_aa_2, n_aa_3, f_seq;
+ float x;
+ char line[1000], keyword[250], first[250], second[250], third[250], junk[250];
+ FILE *fp;
+
+ if ((fp = fopen (pdata->fname, "r")) == NULL) {
+ printf ("\n# error opening file %s for read\n\n", pdata->fname);
+ exit (0);
+ }
+
+ f_seq = 0;
+ pdata->input = 0;
+ n_aa = 0;
+ n_aa_1 = 0;
+ n_aa_2 = 0;
+ n_aa_3 = 0;
+ coil = 0;
+
+ while (fgets (line, 1000, fp) != NULL) {
+ strcpy (keyword, " ");
+ strcpy (first, " ");
+ strcpy (second, " ");
+ strcpy (third, " ");
+ strcpy (junk, " ");
+ i = 0;
+ j = 0;
+ while (line[i] == ' ' && line[i] != '\n' && line[i] != '\0' && i < 250)
+ i++;
+ if (i < 250) {
+ j = i;
+ while (line[i] != ' ' && line[i] != '\n' && line[i] != '\0' && i < 250)
+ i++;
+ }
+ j = i - j;
+ if (j < 250 && j > 0) {
+ sscanf (line, "%s", keyword);
+ }
+ if (!strncmp (keyword, "#", 1)) {
+ } else if (!strncmp (keyword, "-----", 5)) {
+ } else if (!strncmp (keyword, "NUMBER\0", 7)) {
+ } else if (!strncmp (keyword, "SECONDARY\0", 10)) {
+ } else if (!strncmp (keyword, "END\0", 4) && f_seq == 0) {
+ fclose (fp);
+ return n_aa;
+ } else if (!strncmp (keyword, "AA-OSEC-PSEC\0", 13)) {
+ printf ("%s", line);
+ sscanf (line, "%s %s", keyword, first);
+ strcpy (pdata->fname, first);
+ pdata->input = 1;
+ } else if (line[0] == '\n' || !strncmp (keyword, " \0", 4)) {
+ } else if (!strncmp (keyword, "AA\0", 3) && f_seq == 0) {
+ sscanf (line, "%s %s %s", keyword, first, second);
+ if (!strncmp (keyword, "AA\0", 3) && !strncmp (first, "PSEC\0", 5) && !strncmp (second, "OSEC\0", 5)) {
+ pdata->order = 1;
+ }
+ } else if (!strncmp (keyword, "SOV-DELTA\0", 10)) {
+ printf ("%s", line);
+ sscanf (line, "%s %f", keyword, &x);
+ pdata->sov_delta = x;
+ } else if (!strncmp (keyword, "SOV-DELTA-S\0", 12)) {
+ printf ("%s", line);
+ sscanf (line, "%s %f", keyword, &x);
+ pdata->sov_delta_s = x;
+ } else if (!strncmp (keyword, "SOV-METHOD\0", 9)) {
+ printf ("%s", line);
+ sscanf (line, "%s %d", keyword, &i);
+ pdata->sov_method = i;
+ } else if (!strncmp (keyword, "SOV-OUTPUT\0", 9)) {
+ printf ("%s", line);
+ sscanf (line, "%s %d", keyword, &i);
+ pdata->sov_out = i;
+ } else if (line[0] == '>') {
+ printf ("%s", line);
+ if (f_seq < 2)
+ n_aa = 0;
+ f_seq++;
+ } else if (f_seq == 0) {
+ if (j > 1) {
+ if (!strncmp (keyword, "SSP\0", 4)) {
+ sscanf (line, "%s %s %s %s %s", keyword, junk, first, second, third);
+ } else {
+ printf ("\n ERROR! (line: %d) Check COLUMN format of your prediction!\n", n_aa + 1);
+ fclose (fp);
+ exit (0);
+ }
+ } else {
+ sscanf (line, "%s %s %s", first, second, third);
+ }
+ aa[n_aa] = first[0];
+ sss1[n_aa] = second[0];
+ sss2[n_aa] = third[0];
+ if (check_aa (aa[n_aa], letter, 23) == 23) {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! (line: %d) Check amino acid code %c\n", n_aa + 1, aa[n_aa]);
+ fclose (fp);
+ exit (0);
+ }
+ if (sss1[n_aa] == ' ' || sss2[n_aa] == ' ') {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! (line: %d) Check secondary structure code\n", n_aa + 1);
+ fclose (fp);
+ exit (0);
+ }
+ if (sss1[n_aa] == 'L') {
+ sss1[n_aa] = 'C';
+ if (coil == 0) {
+ printf ("\n# WARNING! (line: %d) The 'L' characters are interpreted as 'C' (coil)", n_aa + 1);
+ coil = 1;
+ }
+ }
+ if (sss2[n_aa] == 'L') {
+ sss2[n_aa] = 'C';
+ if (coil == 0) {
+ printf ("\n# WARNING! (line: %d) The 'L' characters are interpreted as 'C' (coil)", n_aa + 1);
+ coil = 1;
+ }
+ }
+ if (sss1[n_aa] != 'C' && sss1[n_aa] != 'E' && sss1[n_aa] != 'H') {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! (line: %d) Check secondary structure code %c\n", n_aa + 1, sss1[n_aa]);
+ fclose (fp);
+ exit (0);
+ }
+ if (sss2[n_aa] != 'C' && sss2[n_aa] != 'E' && sss2[n_aa] != 'H') {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! (line: %d) Check secondary structure code %c\n", n_aa + 1, sss2[n_aa]);
+ fclose (fp);
+ exit (0);
+ }
+ n_aa++;
+ if (n_aa >= MAXRES) {
+ printf ("\n# ERROR! Check number of amino acid lines. (MAX = %d)\n\n", MAXRES);
+ fclose (fp);
+ exit (0);
+ }
+ } else if (f_seq == 1) {
+ i = 0;
+ while (line[i] != '\n') {
+ if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) {
+ aa[n_aa] = 'X';
+ sss1[n_aa] = line[i];
+ if (sss1[n_aa] == 'L') {
+ sss1[n_aa] = 'C';
+ if (coil == 0) {
+ printf ("\n# WARNING! The 'L' characters are interpreted as 'C' (coil)");
+ coil = 1;
+ }
+ }
+ if (sss1[n_aa] != 'C' && sss1[n_aa] != 'E' && sss1[n_aa] != 'H') {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! Check secondary structure code: %c\n", sss1[n_aa]);
+ fclose (fp);
+ exit (0);
+ }
+ n_aa++;
+ if (n_aa >= MAXRES) {
+ printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES);
+ fclose (fp);
+ exit (0);
+ }
+ }
+ i++;
+ }
+ n_aa_1 = n_aa;
+ } else if (f_seq == 2) {
+ i = 0;
+ while (line[i] != '\n') {
+ if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) {
+ aa[n_aa] = 'X';
+ sss2[n_aa] = line[i];
+ if (sss2[n_aa] == 'L') {
+ sss2[n_aa] = 'C';
+ if (coil == 0) {
+ printf ("\n# WARNING! The 'L' characters are interpreted as 'C' (coil)");
+ coil = 1;
+ }
+ }
+ if (sss2[n_aa] != 'C' && sss2[n_aa] != 'E' && sss2[n_aa] != 'H') {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! Check secondary structure code: %c\n", sss2[n_aa]);
+ fclose (fp);
+ exit (0);
+ }
+ n_aa++;
+ if (n_aa >= MAXRES) {
+ printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES);
+ fclose (fp);
+ exit (0);
+ }
+ }
+ i++;
+ }
+ n_aa_2 = n_aa;
+ } else if (f_seq == 3) {
+ i = 0;
+ while (line[i] != '\n') {
+ if (line[i] != ' ' && line[i] != '\t' && line[i] != '\0' && line[i] != '\a' && line[i] != '\b' && line[i] != '\f' && line[i] != '\r' && line[i] != '\v' && i < 1000) {
+ aa[n_aa_3] = line[i];
+ if (check_aa (aa[n_aa_3], letter, 23) == 23) {
+ printf ("\n# ERROR!\n%s", line);
+ printf ("\n# ERROR! (N_res: %d) Check amino acid code %c\n", n_aa_3 + 1, aa[n_aa_3]);
+ fclose (fp);
+ exit (0);
+ }
+ n_aa_3++;
+ if (n_aa_3 >= MAXRES) {
+ printf ("\n# ERROR! Check number of residues. (MAX = %d)\n\n", MAXRES);
+ fclose (fp);
+ exit (0);
+ }
+ }
+ i++;
+ }
+ }
+ }
+ if (n_aa_1 != n_aa_2) {
+ printf ("\n# ERROR! Check format of your submission.\n");
+ fclose (fp);
+ exit (0);
+ }
+ return n_aa;
+}
+
+/*-------------------------------------------------------------
+/
+/ default_parameters - default parameters for SOV program
+/
+/------------------------------------------------------------*/
+void
+default_parameters (parameters * pdata)
+{
+ pdata->input = 0;
+ pdata->order = 0;
+ pdata->sov_method = 1;
+ pdata->sov_delta = 1.0;
+ pdata->sov_delta_s = 0.5;
+ pdata->sov_out = 0;
+
+ return;
+}
+
+/*-----------------------------------------------------------
+/
+/ sov - evaluate SSp by the Segment OVerlap quantity (SOV)
+/ Input: secondary structure segments
+/
+/------------------------------------------------------------*/
+float
+sov (int n_aa, char sss1[MAXRES], char sss2[MAXRES], parameters * pdata)
+{
+ int i, k, length1, length2, beg_s1, end_s1, beg_s2, end_s2;
+ int j1, j2, k1, k2, minov, maxov, d, d1, d2, n, multiple;
+ char s1, s2, sse[3];
+ float out;
+ double s, x;
+
+ sse[0] = '#';
+ sse[1] = '#';
+ sse[2] = '#';
+
+ if (pdata->sov_what == 0) {
+ sse[0] = 'H';
+ sse[1] = 'E';
+ sse[2] = 'C';
+ }
+ if (pdata->sov_what == 1) {
+ sse[0] = 'H';
+ sse[1] = 'H';
+ sse[2] = 'H';
+ }
+ if (pdata->sov_what == 2) {
+ sse[0] = 'E';
+ sse[1] = 'E';
+ sse[2] = 'E';
+ }
+ if (pdata->sov_what == 3) {
+ sse[0] = 'C';
+ sse[1] = 'C';
+ sse[2] = 'C';
+ }
+ n = 0;
+ for (i = 0; i < n_aa; i++) {
+ s1 = sss1[i];
+ if (s1 == sse[0] || s1 == sse[1] || s1 == sse[2]) {
+ n++;
+ }
+ }
+ out = 0.0;
+ s = 0.0;
+ length1 = 0;
+ length2 = 0;
+ i = 0;
+ while (i < n_aa) {
+ beg_s1 = i;
+ s1 = sss1[i];
+ while (sss1[i] == s1 && i < n_aa) {
+ i++;
+ }
+ end_s1 = i - 1;
+ length1 = end_s1 - beg_s1 + 1;
+ multiple = 0;
+ k = 0;
+ while (k < n_aa) {
+ beg_s2 = k;
+ s2 = sss2[k];
+ while (sss2[k] == s2 && k < n_aa) {
+ k++;
+ }
+ end_s2 = k - 1;
+ length2 = end_s2 - beg_s2 + 1;
+ if (s1 == sse[0] || s1 == sse[1] || s1 == sse[2]) {
+ if (s1 == s2 && end_s2 >= beg_s1 && beg_s2 <= end_s1) {
+ if (multiple > 0 && pdata->sov_method == 1) {
+ n = n + length1;
+ }
+ multiple++;
+ if (beg_s1 > beg_s2) {
+ j1 = beg_s1;
+ j2 = beg_s2;
+ } else {
+ j1 = beg_s2;
+ j2 = beg_s1;
+ }
+ if (end_s1 < end_s2) {
+ k1 = end_s1;
+ k2 = end_s2;
+ } else {
+ k1 = end_s2;
+ k2 = end_s1;
+ }
+ minov = k1 - j1 + 1;
+ maxov = k2 - j2 + 1;
+ d1 = floor (length1 * pdata->sov_delta_s);
+ d2 = floor (length2 * pdata->sov_delta_s);
+ if (d1 > d2)
+ d = d2;
+ if (d1 <= d2 || pdata->sov_method == 0)
+ d = d1;
+ if (d > minov) {
+ d = minov;
+ }
+ if (d > maxov - minov) {
+ d = maxov - minov;
+ }
+ x = pdata->sov_delta * d;
+ x = (minov + x) * length1;
+ if (maxov > 0) {
+ s = s + x / maxov;
+ } else {
+ printf ("\n ERROR! minov = %-4d maxov = %-4d length = %-4d d = %-4d %4d %4d %4d %4d", minov, maxov, length1, d, beg_s1 + 1, end_s1 + 1, beg_s2 + 1, end_s2 + 1);
+ }
+ if (pdata->sov_out == 2) {
+ printf ("\n TEST: minov = %-4d maxov = %-4d length = %-4d d = %-4d %4d %4d %4d %4d", minov, maxov, length1, d, beg_s1 + 1, end_s1 + 1, beg_s2 + 1, end_s2 + 1);
+ }
+ }
+ }
+ }
+ }
+ if (pdata->sov_out == 2) {
+ printf ("\n TEST: Number of considered residues = %d", n);
+ }
+ if (n > 0) {
+ out = s / n;
+ } else {
+ out = 1.0;
+ }
+ return out;
+}
+
+/*-----------------------------------------------------------
+/
+/ Q3 - evaluate SSp by the residues predicted correctly (Q3)
+/ Input: secondary structure segments
+/
+/------------------------------------------------------------*/
+float
+q3 (int n_aa, char sss1[MAXRES], char sss2[MAXRES], parameters * pdata)
+{
+ int i, n;
+ float out;
+ char s, sse[3];
+
+ sse[0] = '#';
+ sse[1] = '#';
+ sse[2] = '#';
+
+ if (pdata->q3_what == 0) {
+ sse[0] = 'H';
+ sse[1] = 'E';
+ sse[2] = 'C';
+ }
+ if (pdata->q3_what == 1) {
+ sse[0] = 'H';
+ sse[1] = 'H';
+ sse[2] = 'H';
+ }
+ if (pdata->q3_what == 2) {
+ sse[0] = 'E';
+ sse[1] = 'E';
+ sse[2] = 'E';
+ }
+ if (pdata->q3_what == 3) {
+ sse[0] = 'C';
+ sse[1] = 'C';
+ sse[2] = 'C';
+ }
+
+ n = 0;
+ out = 0.0;
+ for (i = 0; i < n_aa; i++) {
+ s = sss1[i];
+ if (s == sse[0] || s == sse[1] || s == sse[2]) {
+ n++;
+ if (sss1[i] == sss2[i]) {
+ out = out + 1.0;
+ }
+ }
+ }
+ if (n > 0) {
+ out = out / n;
+ } else {
+ out = 1.0;
+ }
+
+ return out;
+}