6 #use Hash::Util qw(lock_keys);
8 use File::Temp qw(tempfile);
13 use base qw(Exporter);
14 our @EXPORT_OK = qw(q3 sov sov_wrapper matthews);
15 our %EXPORT_TAGS = (all => [ @EXPORT_OK ]);
19 Quality.pm - Module for assessing the quality of a secondary structure prediction.
23 use Quality.pm qw(:all)
25 my $pred_seq = "HHHHEEEELLLL";
26 my $obs_seq = "HEHHEEEELELL";
28 print q3($pred_seq, $obs_seq); # Should print 0.8333
30 print sov($pred_seq, $obs_seq); # Should print
32 print sov_wrapper($pred_seq, $obs_seq);
36 A collection of functions for calculating the accuracy of a particular secondary structure predicion. Provided are functions for calculating Q3, Segment overlap and the Matthews correlation.
38 =head2 $q3_score = q3($predicted_sequence, $observed_sequence)
40 Both sequences should be scalars of equal length.
42 Passing an optional third argument (a single character) indicates that you don't want the Q3 score but the Q-index score for a particular state.
47 my ($pred, $obs, $i) = @_;
49 croak "Undefined parameter" if grep { not defined $_ } $pred, $obs;
51 my @pred = split //, $pred;
52 my @obs = split //, $obs;
54 croak "Predicted sequence and observed sequence are not the same length" unless @pred == @obs;
55 croak "Sequences are of zero length" unless @pred;
58 my $correct = grep { $pred[$_] eq $i and $pred[$_] eq $obs[$_] } 0..$#pred;
59 my $number = grep { $_ eq $i } @obs;
60 return $number ? $correct / $number : 1;
63 my $correct = grep { $pred[$_] eq $obs[$_] } 0..$#pred;
64 return $correct / @pred;
68 =head1 $sov = sov($pred, $obs);
70 A pure perl implementation of the SOV algorithim as found in the C SOV program
71 from CASP. Not finished.
78 return $one > $two ? ($one, $two) : ($two, $one);
82 my ($pred, $obs, $state) = @_;
84 die "This function has not been error checked yet";
86 croak "Undefined parameter" if grep { not defined $_ } $pred, $obs;
88 my @pred = split //, $pred;
89 my @obs = split //, $obs;
91 croak "Predicted sequence and observed sequence are not the same length" unless @pred == @obs;
92 croak "Sequences are of zero length" unless @pred;
94 my (%seg_one, %seg_two);
95 #lock_keys %seg_one, qw(start state end length);
96 #lock_keys %seg_two, qw(start state end length);
97 my ($sov_method, $sov_delta, $sov_delta_s) = (1, 1, 0.5);
99 my $number = $state ? grep { $state eq $_ } 0..$#obs : @obs;
100 return 1 unless $number;
102 my ($s, $n, $i) = (0, 0, 0);
103 while ($i < $#pred) {
105 ($seg_one{start}, $seg_one{state}) = ($i, $obs[$i]);
106 $i++ while $obs[$i] eq $seg_one{state} and $i < $#pred;
107 ($seg_one{end}, $seg_one{length}) = ($i - 1, $i - $seg_one{start});
109 my ($multiple, $k) = (0, 0);
110 while ($k < $#pred) {
112 ($seg_two{start}, $seg_two{state}) = ($k, $pred[$k]);
113 $k++ while $pred[$k] eq $seg_two{state} and $k < $#pred;
114 ($seg_two{end}, $seg_two{length}) = ($k - 1, $k - $seg_two{start});
116 if ($state ? $seg_one{state} eq $state : 1) {
118 $seg_one{state} eq $seg_two{state} and
119 $seg_two{end} >= $seg_one{start} and
120 $seg_two{start} <= $seg_one{start}
122 $n += $seg_one{length} if $multiple > 0 and $sov_method == 1;
124 my ($j1, $j2) = max_min $seg_one{start}, $seg_two{start};
125 my ($k1, $k2) = reverse max_min $seg_one{end}, $seg_two{end};
127 my ($minov, $maxov) = ($k1 - $j1 + 1, $k2 - $j2 + 1);
128 my ($d1, $d2) = map { floor $_ * $sov_delta_s } $seg_one{length}, $seg_two{length};
131 if ($d1 > $d2) { $d = $d2 }
132 elsif ($d1 <= $d2 or $sov_method == 0) { $d = $d1 }
134 if ($d > $minov) { $d = $minov }
135 if ($d > $maxov - $minov) { $d = $maxov - $minov }
137 my $x = ($minov + $sov_delta * $d) * $seg_one{length};
139 if ($maxov > 0) { $s += $x / $maxov }
140 else { carp "Negative maxov\n" }
149 =head1 $sov = sov_wrapper($aa, $pred, $obs);
151 This is a wrapper to the CASP SOV C program.
153 All three arguments are required, the first is the amino acid sequence, the
154 second is the predicted sequence, the third is the observed sequence.
156 A object of class SOV will be returned on success.
160 sub sov_wrapper ($$$) {
161 my ($seq, $pred, $obs) = @_;
163 croak "Sequences of different length" if grep { length $_ != length $seq } $obs, $pred;
165 map { y/-/C/ } $obs, $pred;
167 my $file = "AA OSEC PSEC\n";
168 $file .= join "\n", map {
169 substr($obs, $_, 1).' '.
170 substr($obs, $_, 1).' '.
174 my $sovres = SOV->new;
176 my ($fh, $fn) = tempfile;
181 my $pid = open my $rdr, "$sov $fn|" or die $!;
191 =head1 $matthews = matthews($pred, $obs)
193 Calculates the Matthews coffeicent of a predicted secondary structure.
197 sub matthews ($$;$) {
198 my ($pred, $obs, $i) = @_;
200 croak "Undefined parameter" if grep { not defined $_ } $pred, $obs, $i;
202 my @pred = split //, $pred;
203 my @obs = split //, $obs;
205 croak "Predicted sequence and observed sequence are not the same length" unless @pred == @obs;
206 croak "Sequences are of zero length" unless @pred;
208 my ($p, $n, $o, $u) = (0, 0, 0, 0);
212 if ($pred[$_] eq $obs[$_]) {
214 if ($pred[$_] eq $i) { $p++ }
219 # Incorrect prediction
222 if ($pred[$_] eq $i) { $o++ }
228 return ( ($p * $n) - ($u * $o) ) /
229 sqrt( ($n + $u) * ($n + $o) * ($p + $u) * ($p + $o) );
234 http://predictioncenter.llnl.gov/casp3/results/ab-measures.html
238 Jonathan Barber <jon@compbio.dundee.ac.uk>