binaries/src/jpred/lib/Utils.pm

   1 package Utils;
   2
   3 use strict;
   4 use warnings;
   5 use Carp;
   6 use File::Temp qw(tempfile);
   7 use File::Find qw(find);
   8
   9 use base qw(Exporter);
  10
  11 our @EXPORT_OK = qw(profile reduce_dssp conf jury find_jury read_file_list concise_record seq2int int2seq abs_solv_acc rel_solv_acc array_shuffle array_shuffle_in_place array_split $split_resid sort_resid find_files);
  12 push @EXPORT_OK, map { "reduce_dssp_$_" } qw(a b c jpred);
  13
  14 # Reduce eight state DSSP definition to three states
  15 sub reduce_dssp ($;&) {
  16         my ($seq, $code_ref) = @_;
  17
  18         $code_ref ||= \&reduce_dssp_jpred;
  19         return $code_ref->($seq);
  20 }
  21
  22 # reduce_dssp_[abc] are the methods mentioned in the Jpred
  23 # paper
  24 sub reduce_dssp_a ($) {
  25         my ($sec_string) = @_;
  26
  27         $sec_string =~ tr/EBGH/EEHH/;
  28
  29         $sec_string =~ tr/EH/-/c;
  30         return $sec_string;
  31 }
  32
  33 sub reduce_dssp_b ($) {
  34         my ($sec_string) = @_;
  35
  36         $sec_string =~ tr/EH/-/c;
  37         $sec_string =~ s/(?<!E)EE(?!E)/--/g;
  38         $sec_string =~ s/(?<!H)HHHH(?!H)/----/g;
  39
  40         $sec_string =~ tr/EH/-/c;
  41         return $sec_string;
  42 }
  43
  44 sub reduce_dssp_c ($) {
  45         my ($sec_string) = @_;
  46
  47         $sec_string =~ s/GGGHHHH/HHHHHHH/g;
  48         $sec_string =~ tr/B/-/;
  49         $sec_string =~ s/GGG/---/g;
  50
  51         $sec_string =~ tr/EH/-/c;
  52         return $sec_string;
  53 }
  54
  55 =head2 reduce_dssp_jpred()
  56
  57 # This is the method actually used in the q3anal.version2
  58 # program.
  59 # q3anal{,_dev} convert G to H.
  60
  61 =cut
  62
  63 sub reduce_dssp_jpred ($) {
  64         my ($sec_string) = @_;
  65
  66         $sec_string =~ tr/ST_bGIC? /-/;
  67         $sec_string =~ tr/B/E/;
  68         $sec_string =~ tr/EH/-/c;
  69
  70         return $sec_string;
  71 }
  72
  73 =head2 conf()
  74
  75 # Select the subset of a prediction where the confidence is greater than some
  76 # limit.
  77 # Pass at least three scalars:
  78 # First is a string containing a list of integers between 0 and 9 which
  79 # correspond to the confidence at that position.
  80 # Second is a lower limit for the confidence.
  81 # Other arguments are strings of the sequence to be selected.
  82
  83 =cut
  84
  85 sub conf ($$$;@) {
  86         my ($conf, $limit, @seqs) = @_;
  87
  88         croak "Sequences of different length" if grep { length $_ ne length $conf } @seqs;
  89
  90         for my $pos (reverse 0..length($conf) - 1) {
  91                 if (substr($conf, $pos, 1) >= $limit) {
  92                         for (@seqs, $conf) {
  93                                 $_ = substr($_, 0, $pos).substr($_, $pos)
  94                         }
  95                 }
  96         }
  97
  98         return @seqs;
  99 }
 100
 101 =head2 jury();
 102
 103 Find the positions where there is jury agreement
 104 First argument is the jury sequence, the rest are sequences to be subselected.
 105
 106 =cut
 107
 108 sub jury ($$;@) {
 109         my ($jury, @seqs) = @_;
 110
 111         croak "Sequences of different length" if grep { length $_ != length $jury } @seqs;
 112
 113         # All positions are jury
 114         if ($jury  =~ y/*// == length $jury) {
 115                 return @seqs;
 116         }
 117         # Goes through sequences and removes those positions that are not jury positions.
 118         for my $pos (reverse 0..length($jury)- 1) {
 119                 if (substr($jury, $pos, 1) eq '*') {
 120                         $_ = substr($_, 0, $pos).substr($_, $pos + 1) for @seqs;
 121                         $_ = substr($_, 0, $pos).substr($_, $pos + 1) for $jury;
 122                 }
 123         }
 124
 125         return @seqs;
 126 }
 127
 128 =head2 find_jury(@seqs)
 129
 130 Find those positions in a prediction that are in a jury position (those positions where the predictions agree).
 131
 132 Pass an array of sequences (represented as strings). Returns a string with the jury position represented by a space and non-jury positions as "*".
 133
 134 =cut
 135
 136 sub find_jury (@) {
 137         my (@seqs) = @_;
 138
 139         my $length = length $seqs[0];
 140         croak "Sequences of different length" if grep { $length != length $_ } @seqs;
 141
 142         my $jury;
 143         for my $pos (0..$length - 1) {
 144                 # Are all of the positions the same?
 145                 if (keys %{ { map { substr($_, $pos, 1), undef } @seqs } } == 1) {
 146                         $jury .= " "
 147                 }
 148                 else {
 149                         $jury .= "*"
 150                 }
 151         }
 152
 153         return $jury;
 154 }
 155
 156 =head2 @AoA = read_file_list($path)
 157
 158 Reads in a file from the $path of the format:
 159
 160   file1a file2a
 161   file1b file2b
 162   file1c file2c
 163   file1d file2d
 164   file1e file2e
 165
 166 Returns an array of arrays of the list of files. For the above file, this would be:
 167
 168   [
 169     [ "file1a", "file2a" ],
 170     [ "file1b", "file2b" ],
 171     [ "file1c", "file2c" ],
 172     [ "file1d", "file2d" ],
 173     [ "file1e", "file2e" ]
 174   ]
 175
 176 =cut
 177
 178 sub read_file_list ($) {
 179         my ($file) = @_;
 180
 181         local ($/, $_) = "\n";
 182
 183         open my $fh, $file or croak "Can't open file $file";
 184
 185         my @files;
 186         while (<$fh>) {
 187                 chomp;
 188                 push @files, [ split ];
 189         }
 190         return @files
 191 }
 192
 193 =head2 @AoA = profile(qw(PROTEIN MSA----))
 194
 195 Pass an array of proteins sequences represented as strings of one letter residues codes, returns an array of arrays of the frequences of each residue.
 196
 197 This is calculated using all of the sequences in the alignment, and doesn't count gaps, 'X' or 'Z'.
 198
 199 =cut
 200
 201 # Does a profile on a sequence
 202 sub profile (@) {
 203         my (@seqs) = @_;
 204         my @results;
 205
 206         # Check that all the sequences are the same length
 207         my $length = length $seqs[0];
 208         croak "Not all sequences are the same length" if grep { length $_ != $length } @seqs;
 209
 210         # Convert residues to integers
 211         my @ar = map { [ seq2int(split //, $_) ] } @seqs;
 212
 213         # Find out how many of each type of residue occur at a position and create
 214         # the profile
 215         for my $i (0..$length - 1) { # for each position
 216                 # The number of elements should be one larger than the max size in
 217                 # seq2int
 218                 my @countup = (0) x 20;
 219
 220                 $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++ for 0..$#ar; # for each sequence
 221
 222                 for (0..$#ar) {
 223                         $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++
 224                 }
 225
 226                 my $total = 0;
 227                 $total += $_ for @countup;
 228                 #$total += $countup[$_] for 0..$#countup;
 229
 230                 push @results, [
 231                         map {
 232                                 # Check $total for div/0
 233 #                               sprintf "%2.f", ($total ? $countup[$_] / $total * 10 : 0)
 234 #                       } 0..$#countup
 235                                 sprintf "%2.f", ($total ? $_ / $total * 10 : 0)
 236                         } @countup
 237                 ];
 238         }
 239
 240         return @results;
 241 }
 242
 243 =head2 @integers = seq2int(@residues)
 244
 245 Converts a sequence into integer values. @residues should be an array of characters of the one letter amino acids codes.
 246
 247 =head2 @residues = int2seq(@integers)
 248
 249 Converts an array of integers into one letter amino acids codes produced by seq2int().
 250
 251 =cut
 252
 253 {
 254         # This values should be as seq2int() in jnet.c
 255         my (%seq2int, %int2seq);
 256         @seq2int{split //, 'ARNDCQEGHILKMFPSTWYVBZX.-'} = (0..23, 23);
 257         $seq2int{U} = $seq2int{C};
 258         %int2seq = reverse %seq2int;
 259
 260         sub seq2int (@) {
 261                 map {
 262                         exists $seq2int{uc $_} ?
 263                                 $seq2int{uc $_} :
 264                                 croak "Residue '$_' not recognised"
 265                 } @_
 266         }
 267
 268         sub int2seq (@) {
 269                 map {
 270                         exists $int2seq{$_} ?
 271                                 $int2seq{$_} :
 272                                 croak "Residue '$_' not recognised"
 273                 } @_
 274         }
 275 }
 276
 277 =head2 $concise = concise_record($title, @data)
 278
 279 Simple function to give a line in a concise file. Pass the title of the concise file, and the data that you want in the record. Returns a string in the concise format with a newline.
 280
 281 =cut
 282
 283 sub concise_record ($@) {
 284         my ($title, @items) = @_;
 285         return "$title:".join(",", @items)."\n";
 286 }
 287
 288 {
 289         my %data = (
 290                 'A' => '118', 'B' => '162', 'C' => '146', 'D' => '159',
 291                 'E' => '186', 'F' => '223', 'G' => '88', 'H' => '203',                          'I' => '181', 'K' => '226', 'L' => '193', 'M' => '204',
 292                 'N' => '166', 'P' => '147', 'Q' => '193', 'R' => '256',
 293                 'S' => '130', 'T' => '153', 'V' => '165', 'W' => '266',
 294                 'X' => '200', 'Y' => '237', 'Z' => '189', 'c' => '146'
 295         );
 296
 297 =head2 $solv = rel_solv_acc($residue, $accesibility);
 298
 299 Pass the residue and the accessiblilty of the residue, and the relative solvent accesibility of the residue will be returned.
 300
 301 =cut
 302
 303         sub rel_solv_acc ($$) {
 304                 my ($residue, $acc) = @_;
 305                 return unless $data{uc $residue};
 306                 return $acc / $data{uc $residue};
 307         }
 308
 309 =head2 $solv = abs_solv_acc($residue, $accesibility);
 310
 311 Pass the residue and the accessiblilty of the residue, and the absolute solvent accesibility of the residue will be returned.
 312
 313 =cut
 314
 315         sub abs_solv_acc ($$) {
 316                 my ($residue, $acc) = @_;
 317                 return unless $data{uc $residue};
 318                 return $data{uc $residue} * $acc;
 319         }
 320 }
 321
 322 =head2 @shuffled_data = array_shuffle(@data)
 323
 324 Randomly shuffles an array.
 325
 326 =cut
 327
 328 sub array_shuffle (@) {
 329         my (@data) = @_;
 330         for (0..$#data) {
 331                 my ($foo, $bar) = (int(rand $#data), int(rand $#data));
 332                 @data[$foo, $bar] = @data[$bar, $foo];
 333         }
 334         return @data;
 335 }
 336
 337 =head2 array_shuffle_in_place(@data)
 338
 339 Randomly shuffle an array in place.
 340
 341 =cut
 342
 343 sub array_shuffle_in_place (@) {
 344         for (0..$#_) {
 345                 my ($foo, $bar) = (int(rand $#_), int(rand $#_));
 346                 @_[$foo, $bar] = @_[$bar, $foo];
 347         }
 348 }
 349
 350 =head2 @AoA = array_split($size, @data)
 351
 352 Splits an array into $size sets of data returned in an AoA.
 353
 354 =cut
 355
 356 sub array_split ($@) {
 357         my ($size, @array) = @_;
 358         my ($i, @temp) = 0;
 359         while (@array) { push @{ $temp[$i++ % $size] }, pop @array }
 360         return @temp;
 361 }
 362
 363
 364 =head2 $split_resid = qr/(-?\d+)([[:alpha:]])/;
 365
 366 Regular expression for dividing PDB residue offsets into the offset and letter components.
 367
 368 =cut
 369
 370 our $split_resid = qr/(-?\d+)([[:alpha:]])?/;
 371
 372 =head2 sort_resid
 373
 374 A subroutine for the sorting of PDB residue offsets, allows for the correct sorting of IDs such as "1A".
 375
 376 =cut
 377
 378 {
 379         no warnings;
 380         sub sort_resid {
 381                 (my ($a_off, $a_id) = $a =~ $split_resid);
 382                 (my ($b_off, $b_id) = $b =~ $split_resid);
 383
 384                 $a_off <=> $b_off || $a_id cmp $b_id;
 385         }
 386 }
 387
 388 =head2 @files = find_files($dir)
 389
 390 Find all files under $dir.
 391
 392 =cut
 393
 394 sub find_files ($) {
 395     my ($dir) = @_;
 396     my @files;
 397
 398     find(
 399         sub {
 400             push @files, $File::Find::dir."/".$_ if -e $_;
 401         },
 402         $dir
 403     );
 404
 405     return @files;
 406 }
 407
 408 1;