1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Tue Aug 30 10:35:31 1994
14 * Compare RNA secondary structures.
15 * RCS $Id: compstruct_main.c,v 1.1.1.1 2005/03/22 08:34:22 cmzmasek Exp $
24 static char banner[] = "compalign - compare test RNA secondary structure predictions to trusted set";
27 Usage: compstruct [-options] <trusted file> <test file>\n\
28 Both files must contain secondary structure markup (e.g. Stockholm, SQUID,\n\
29 SELEX formats), and sequences must occur in the same order in the two files.\n\
31 Available options are:\n\
32 -h : print short help and usage info\n\
35 static char experts[] = "\
36 --informat <s> : specify that both alignments are in format <s> (SELEX, for instance)\n\
37 --quiet : suppress verbose header (used in regression testing)\n\
40 struct opt_s OPTIONS[] = {
41 { "-h", TRUE, sqdARG_NONE },
42 { "--informat", FALSE, sqdARG_STRING },
43 { "--quiet", FALSE, sqdARG_NONE },
45 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
48 static int KHS2ct(char *ss, int **ret_ct);
49 /* static void WriteCT(FILE *fp, char *seq, int *ct, int len); */
52 main(int argc, char **argv)
54 char *kfile, *tfile; /* known, test structure file */
55 int format; /* expected format of kfile, tfile */
56 SQFILE *kfp, *tfp; /* open kfile, tfile */
57 char *kseq, *tseq; /* known, test sequence */
58 SQINFO kinfo, tinfo; /* known, test info */
59 int *kct, *tct; /* known, test CT rep of structure */
63 int correct; /* count of correct base pair predictions */
64 int missedpair; /* count of false negatives */
65 int falsepair; /* count of false positives */
66 int tot_trusted; /* total base pairs in trusted structure */
67 int tot_predicted; /* total base pairs in predicted structure*/
68 int tot_correct; /* cumulative total correct pairs */
70 int dscorrect; /* count of correct 2-state paired prediction */
71 int sscorrect; /* count of correct 2-state unpaired prediction */
76 int quiet; /* TRUE to silence verbose banner */
82 /***********************************************
84 ***********************************************/
86 format = MSAFILE_UNKNOWN;
89 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
90 &optind, &optname, &optarg))
92 if (strcmp(optname, "--quiet") == 0) quiet = TRUE;
93 else if (strcmp(optname, "--informat") == 0) {
94 format = String2SeqfileFormat(optarg);
95 if (format == MSAFILE_UNKNOWN)
96 Die("unrecognized sequence file format \"%s\"", optarg);
97 if (! IsAlignmentFormat(format))
98 Die("%s is an unaligned format, can't read as an alignment", optarg);
100 else if (strcmp(optname, "-h") == 0) {
101 Banner(stdout, banner);
108 if (argc - optind != 2)
109 Die("Incorrect number of command line arguments.\n%s\n", usage);
111 kfile = argv[optind++];
112 tfile = argv[optind];
114 if (! quiet) Banner(stdout, banner);
116 /***********************************************
118 ***********************************************/
120 if ((kfp = SeqfileOpen(kfile, format, NULL)) == NULL)
121 Die("Failed to open trusted structure file %s for reading", kfile);
122 if ((tfp = SeqfileOpen(tfile, format, NULL)) == NULL)
123 Die("Failed to open test structure file %s for reading", tfile);
125 /***********************************************
126 * Do structure comparisons, one seq at a time
127 ***********************************************/
129 tot_trusted = tot_predicted = tot_correct = 0;
130 tot_dscorrect = tot_sscorrect = tot_positions = 0;
132 while (ReadSeq(kfp, kfp->format, &kseq, &kinfo) && ReadSeq(tfp, tfp->format, &tseq, &tinfo))
134 if (!quiet && strcmp(tinfo.name, kinfo.name) != 0)
135 Warn("Trusted sequence %s, test sequence %s -- names not identical\n",
136 kinfo.name, tinfo.name);
137 if (!quiet && strcmp(kseq, tseq) != 0)
138 Warn("Trusted sequence %s, test sequence %s -- sequences not identical\n",
139 kinfo.name, tinfo.name);
141 printf("%s %s\n", kinfo.name, (kinfo.flags & SQINFO_DESC) ? kinfo.desc : "");
143 if (! (tinfo.flags & SQINFO_SS) && ! (kinfo.flags & SQINFO_SS))
144 printf("[no test or trusted structure]\n\n");
145 else if (! (tinfo.flags & SQINFO_SS))
146 printf("[no test structure]\n\n");
147 else if (! (kinfo.flags & SQINFO_SS))
148 printf("[no trusted structure]\n\n");
151 if (! KHS2ct(kinfo.ss, &kct))
152 { printf("[bad trusted structure]\n"); goto CLEANUP;}
153 if (! KHS2ct(tinfo.ss, &tct))
154 { printf("[bad test structure]\n"); free(kct); goto CLEANUP; }
156 /* WriteCT(stdout, tseq, tct, tinfo.len); */
157 /* WriteCT(stdout, tseq, kct, tinfo.len); */
159 correct = falsepair = missedpair = 0;
160 dscorrect = sscorrect = 0;
161 for (pos = 0; pos < kinfo.len; pos++)
163 /* check if actual base pair is predicted */
164 if (kct[pos] >= 0 && kct[pos] == tct[pos])
166 else if (kct[pos] >= 0)
169 if (tct[pos] >= 0 && kct[pos] != tct[pos])
172 /* 2 state prediction */
173 if (kct[pos] >= 0 && tct[pos] >= 0)
175 else if (kct[pos] < 0 && tct[pos] < 0)
179 tot_trusted += correct + missedpair;
180 tot_predicted += correct + falsepair;
181 tot_correct += correct;
183 tot_dscorrect += dscorrect;
184 tot_sscorrect += sscorrect;
185 tot_positions += kinfo.len;
187 /* print out per sequence info */
188 printf(" %d/%d trusted pairs predicted (%.2f%% sensitivity)\n",
189 correct, correct+missedpair,
190 100. * (float) correct/ (float) (correct + missedpair));
191 printf(" %d/%d predicted pairs correct (%.2f%% specificity)\n",
192 correct, correct + falsepair,
193 100. * (float) correct/ (float) (correct + falsepair));
195 printf(" Two state: %d/%d positions correctly predicted (%.2f%% accuracy)\n",
196 dscorrect + sscorrect,
198 100. * (float) (dscorrect + sscorrect) / (float) kinfo.len);
207 FreeSequence(kseq, &kinfo);
208 FreeSequence(tseq, &tinfo);
211 /* And the final summary:
214 printf("Overall structure prediction accuracy (%d sequences, %d positions)\n",
215 nseq, tot_positions);
216 printf(" %d/%d trusted pairs predicted (%.2f%% sensitivity)\n",
217 tot_correct, tot_trusted,
218 100. * (float) tot_correct/ (float) tot_trusted);
219 printf(" %d/%d predicted pairs correct (%.2f%% specificity)\n",
220 tot_correct, tot_predicted,
221 100. * (float) tot_correct/ (float) tot_predicted);
222 printf(" Two state: %d/%d positions correctly predicted (%.2f%% accuracy)\n",
223 tot_dscorrect + tot_sscorrect, tot_positions,
224 100. * (float) (tot_dscorrect + tot_sscorrect) / (float) tot_positions);
233 /* Function: KHS2ct()
235 * Purpose: Convert a secondary structure string to an array of integers
236 * representing what position each position is base-paired
237 * to (0..len-1), or -1 if none. This is off-by-one from a
238 * Zuker .ct file representation.
240 * The .ct representation can accomodate pseudoknots but the
241 * secondary structure string cannot easily; the string contains
242 * "Aa", "Bb", etc. pairs as a limited representation of
243 * pseudoknots. The string contains "><" for base pairs.
244 * Other symbols are ignored.
246 * Return: ret_ct is allocated here and must be free'd by caller.
247 * Returns 1 on success, 0 if ss is somehow inconsistent.
250 KHS2ct(char *ss, int **ret_ct)
252 struct intstack_s *dolist[27];
256 int status = 1; /* success or failure return status */
259 for (i = 0; i < 27; i++)
260 dolist[i] = InitIntStack();
263 if ((ct = (int *) malloc (len * sizeof(int))) == NULL)
264 Die("malloc failed");
265 for (pos = 0; pos < len; pos++)
268 for (pos = 0; ss[pos] != '\0'; pos++)
270 if (ss[pos] == '>') /* left side of a pair: push onto stack 0 */
271 PushIntStack(dolist[0], pos);
272 else if (ss[pos] == '<') /* right side of a pair; resolve pair */
274 if (! PopIntStack(dolist[0], &pair))
282 /* same stuff for pseudoknots */
283 else if (isupper((int) ss[pos]))
284 PushIntStack(dolist[ss[pos] - 'A' + 1], pos);
285 else if (islower((int) ss[pos]))
287 if (! PopIntStack(dolist[ss[pos] - 'a' + 1], &pair))
295 else if (!isgap(ss[pos])) status = 0; /* bad character */
298 for (i = 0; i < 27; i++)
299 if ( FreeIntStack(dolist[i]) > 0)
308 /* Function: WriteCT()
310 * Purpose: Write a CT representation of a structure.
311 * Written in 1..len sense, with 0 for unpaired
315 WriteCT(FILE *fp, char *seq, int *ct, int len)
318 for (pos = 0; pos < len; pos++)
319 fprintf(fp, "%d %c %d\n", pos+1, seq[pos], ct[pos]+1);