6 use File::Temp qw(tempfile);
7 use File::Find qw(find);
11 our @EXPORT_OK = qw(profile reduce_dssp conf jury find_jury read_file_list concise_record seq2int int2seq abs_solv_acc rel_solv_acc array_shuffle array_shuffle_in_place array_split $split_resid sort_resid find_files);
12 push @EXPORT_OK, map { "reduce_dssp_$_" } qw(a b c jpred);
14 # Reduce eight state DSSP definition to three states
15 sub reduce_dssp ($;&) {
16 my ($seq, $code_ref) = @_;
18 $code_ref ||= \&reduce_dssp_jpred;
19 return $code_ref->($seq);
22 # reduce_dssp_[abc] are the methods mentioned in the Jpred
24 sub reduce_dssp_a ($) {
25 my ($sec_string) = @_;
27 $sec_string =~ tr/EBGH/EEHH/;
29 $sec_string =~ tr/EH/-/c;
33 sub reduce_dssp_b ($) {
34 my ($sec_string) = @_;
36 $sec_string =~ tr/EH/-/c;
37 $sec_string =~ s/(?<!E)EE(?!E)/--/g;
38 $sec_string =~ s/(?<!H)HHHH(?!H)/----/g;
40 $sec_string =~ tr/EH/-/c;
44 sub reduce_dssp_c ($) {
45 my ($sec_string) = @_;
47 $sec_string =~ s/GGGHHHH/HHHHHHH/g;
48 $sec_string =~ tr/B/-/;
49 $sec_string =~ s/GGG/---/g;
51 $sec_string =~ tr/EH/-/c;
55 =head2 reduce_dssp_jpred()
57 # This is the method actually used in the q3anal.version2
59 # q3anal{,_dev} convert G to H.
63 sub reduce_dssp_jpred ($) {
64 my ($sec_string) = @_;
66 $sec_string =~ tr/ST_bGIC? /-/;
67 $sec_string =~ tr/B/E/;
68 $sec_string =~ tr/EH/-/c;
75 # Select the subset of a prediction where the confidence is greater than some
77 # Pass at least three scalars:
78 # First is a string containing a list of integers between 0 and 9 which
79 # correspond to the confidence at that position.
80 # Second is a lower limit for the confidence.
81 # Other arguments are strings of the sequence to be selected.
86 my ($conf, $limit, @seqs) = @_;
88 croak "Sequences of different length" if grep { length $_ ne length $conf } @seqs;
90 for my $pos (reverse 0..length($conf) - 1) {
91 if (substr($conf, $pos, 1) >= $limit) {
93 $_ = substr($_, 0, $pos).substr($_, $pos)
103 Find the positions where there is jury agreement
104 First argument is the jury sequence, the rest are sequences to be subselected.
109 my ($jury, @seqs) = @_;
111 croak "Sequences of different length" if grep { length $_ != length $jury } @seqs;
113 # All positions are jury
114 if ($jury =~ y/*// == length $jury) {
117 # Goes through sequences and removes those positions that are not jury positions.
118 for my $pos (reverse 0..length($jury)- 1) {
119 if (substr($jury, $pos, 1) eq '*') {
120 $_ = substr($_, 0, $pos).substr($_, $pos + 1) for @seqs;
121 $_ = substr($_, 0, $pos).substr($_, $pos + 1) for $jury;
128 =head2 find_jury(@seqs)
130 Find those positions in a prediction that are in a jury position (those positions where the predictions agree).
132 Pass an array of sequences (represented as strings). Returns a string with the jury position represented by a space and non-jury positions as "*".
139 my $length = length $seqs[0];
140 croak "Sequences of different length" if grep { $length != length $_ } @seqs;
143 for my $pos (0..$length - 1) {
144 # Are all of the positions the same?
145 if (keys %{ { map { substr($_, $pos, 1), undef } @seqs } } == 1) {
156 =head2 @AoA = read_file_list($path)
158 Reads in a file from the $path of the format:
166 Returns an array of arrays of the list of files. For the above file, this would be:
169 [ "file1a", "file2a" ],
170 [ "file1b", "file2b" ],
171 [ "file1c", "file2c" ],
172 [ "file1d", "file2d" ],
173 [ "file1e", "file2e" ]
178 sub read_file_list ($) {
181 local ($/, $_) = "\n";
183 open my $fh, $file or croak "Can't open file $file";
188 push @files, [ split ];
193 =head2 @AoA = profile(qw(PROTEIN MSA----))
195 Pass an array of proteins sequences represented as strings of one letter residues codes, returns an array of arrays of the frequences of each residue.
197 This is calculated using all of the sequences in the alignment, and doesn't count gaps, 'X' or 'Z'.
201 # Does a profile on a sequence
206 # Check that all the sequences are the same length
207 my $length = length $seqs[0];
208 croak "Not all sequences are the same length" if grep { length $_ != $length } @seqs;
210 # Convert residues to integers
211 my @ar = map { [ seq2int(split //, $_) ] } @seqs;
213 # Find out how many of each type of residue occur at a position and create
215 for my $i (0..$length - 1) { # for each position
216 # The number of elements should be one larger than the max size in
218 my @countup = (0) x 20;
220 $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++ for 0..$#ar; # for each sequence
223 $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++
227 $total += $_ for @countup;
228 #$total += $countup[$_] for 0..$#countup;
232 # Check $total for div/0
233 # sprintf "%2.f", ($total ? $countup[$_] / $total * 10 : 0)
235 sprintf "%2.f", ($total ? $_ / $total * 10 : 0)
243 =head2 @integers = seq2int(@residues)
245 Converts a sequence into integer values. @residues should be an array of characters of the one letter amino acids codes.
247 =head2 @residues = int2seq(@integers)
249 Converts an array of integers into one letter amino acids codes produced by seq2int().
254 # This values should be as seq2int() in jnet.c
255 my (%seq2int, %int2seq);
256 @seq2int{split //, 'ARNDCQEGHILKMFPSTWYVBZX.-'} = (0..23, 23);
257 $seq2int{U} = $seq2int{C};
258 %int2seq = reverse %seq2int;
262 exists $seq2int{uc $_} ?
264 croak "Residue '$_' not recognised"
270 exists $int2seq{$_} ?
272 croak "Residue '$_' not recognised"
277 =head2 $concise = concise_record($title, @data)
279 Simple function to give a line in a concise file. Pass the title of the concise file, and the data that you want in the record. Returns a string in the concise format with a newline.
283 sub concise_record ($@) {
284 my ($title, @items) = @_;
285 return "$title:".join(",", @items)."\n";
290 'A' => '118', 'B' => '162', 'C' => '146', 'D' => '159',
291 'E' => '186', 'F' => '223', 'G' => '88', 'H' => '203', 'I' => '181', 'K' => '226', 'L' => '193', 'M' => '204',
292 'N' => '166', 'P' => '147', 'Q' => '193', 'R' => '256',
293 'S' => '130', 'T' => '153', 'V' => '165', 'W' => '266',
294 'X' => '200', 'Y' => '237', 'Z' => '189', 'c' => '146'
297 =head2 $solv = rel_solv_acc($residue, $accesibility);
299 Pass the residue and the accessiblilty of the residue, and the relative solvent accesibility of the residue will be returned.
303 sub rel_solv_acc ($$) {
304 my ($residue, $acc) = @_;
305 return unless $data{uc $residue};
306 return $acc / $data{uc $residue};
309 =head2 $solv = abs_solv_acc($residue, $accesibility);
311 Pass the residue and the accessiblilty of the residue, and the absolute solvent accesibility of the residue will be returned.
315 sub abs_solv_acc ($$) {
316 my ($residue, $acc) = @_;
317 return unless $data{uc $residue};
318 return $data{uc $residue} * $acc;
322 =head2 @shuffled_data = array_shuffle(@data)
324 Randomly shuffles an array.
328 sub array_shuffle (@) {
331 my ($foo, $bar) = (int(rand $#data), int(rand $#data));
332 @data[$foo, $bar] = @data[$bar, $foo];
337 =head2 array_shuffle_in_place(@data)
339 Randomly shuffle an array in place.
343 sub array_shuffle_in_place (@) {
345 my ($foo, $bar) = (int(rand $#_), int(rand $#_));
346 @_[$foo, $bar] = @_[$bar, $foo];
350 =head2 @AoA = array_split($size, @data)
352 Splits an array into $size sets of data returned in an AoA.
356 sub array_split ($@) {
357 my ($size, @array) = @_;
359 while (@array) { push @{ $temp[$i++ % $size] }, pop @array }
364 =head2 $split_resid = qr/(-?\d+)([[:alpha:]])/;
366 Regular expression for dividing PDB residue offsets into the offset and letter components.
370 our $split_resid = qr/(-?\d+)([[:alpha:]])?/;
374 A subroutine for the sorting of PDB residue offsets, allows for the correct sorting of IDs such as "1A".
381 (my ($a_off, $a_id) = $a =~ $split_resid);
382 (my ($b_off, $b_id) = $b =~ $split_resid);
384 $a_off <=> $b_off || $a_id cmp $b_id;
388 =head2 @files = find_files($dir)
390 Find all files under $dir.
400 push @files, $File::Find::dir."/".$_ if -e $_;