jpred/lib/Utils.pm

   1 package Utils;
   2
   3 use strict;
   4 use warnings;
   5 use Carp;
   6 use File::Temp qw(tempfile);
   7 use File::Find qw(find);
   8
   9 use base qw(Exporter);
  10
  11 our @EXPORT_OK = qw(xsystem profile reduce_dssp conf jury find_jury read_file_list concise_record seq2int int2seq abs_solv_acc rel_solv_acc array_shuffle array_shuffle_in_place array_split $split_resid sort_resid find_files);
  12 push @EXPORT_OK, map { "reduce_dssp_$_" } qw(a b c jpred);
  13
  14 # Reduce eight state DSSP definition to three states
  15 sub reduce_dssp ($;&) {
  16         my ($seq, $code_ref) = @_;
  17
  18         $code_ref ||= \&reduce_dssp_jpred;
  19         return $code_ref->($seq);
  20 }
  21
  22 # reduce_dssp_[abc] are the methods mentioned in the Jpred
  23 # paper
  24 sub reduce_dssp_a ($) {
  25         my ($sec_string) = @_;
  26
  27         $sec_string =~ tr/EBGH/EEHH/;
  28
  29         $sec_string =~ tr/EH/-/c;
  30         return $sec_string;
  31 }
  32
  33 sub reduce_dssp_b ($) {
  34         my ($sec_string) = @_;
  35
  36         $sec_string =~ tr/EH/-/c;
  37         $sec_string =~ s/(?<!E)EE(?!E)/--/g;
  38         $sec_string =~ s/(?<!H)HHHH(?!H)/----/g;
  39
  40         $sec_string =~ tr/EH/-/c;
  41         return $sec_string;
  42 }
  43
  44 sub reduce_dssp_c ($) {
  45         my ($sec_string) = @_;
  46
  47         $sec_string =~ s/GGGHHHH/HHHHHHH/g;
  48         $sec_string =~ tr/B/-/;
  49         $sec_string =~ s/GGG/---/g;
  50
  51         $sec_string =~ tr/EH/-/c;
  52         return $sec_string;
  53 }
  54
  55 =head2 reduce_dssp_jpred()
  56
  57 # This is the method actually used in the q3anal.version2
  58 # program.
  59 # q3anal{,_dev} convert G to H.
  60
  61 =cut
  62
  63 sub reduce_dssp_jpred ($) {
  64         my ($sec_string) = @_;
  65
  66         $sec_string =~ tr/ST_bGIC? /-/;
  67         $sec_string =~ tr/B/E/;
  68         $sec_string =~ tr/EH/-/c;
  69
  70         return $sec_string;
  71 }
  72
  73 =head2 conf()
  74
  75 # Select the subset of a prediction where the confidence is greater than some
  76 # limit.
  77 # Pass at least three scalars:
  78 # First is a string containing a list of integers between 0 and 9 which
  79 # correspond to the confidence at that position.
  80 # Second is a lower limit for the confidence.
  81 # Other arguments are strings of the sequence to be selected.
  82
  83 =cut
  84
  85 sub conf ($$$;@) {
  86         my ($conf, $limit, @seqs) = @_;
  87
  88         croak "Sequences of different length" if grep { length $_ ne length $conf } @seqs;
  89
  90         for my $pos (reverse 0..length($conf) - 1) {
  91                 if (substr($conf, $pos, 1) >= $limit) {
  92                         for (@seqs, $conf) {
  93                                 $_ = substr($_, 0, $pos).substr($_, $pos)
  94                         }
  95                 }
  96         }
  97
  98         return @seqs;
  99 }
 100
 101 =head2 jury();
 102
 103 Find the positions where there is jury agreement
 104 First argument is the jury sequence, the rest are sequences to be subselected.
 105
 106 =cut
 107
 108 sub jury ($$;@) {
 109         my ($jury, @seqs) = @_;
 110
 111         croak "Sequences of different length" if grep { length $_ != length $jury } @seqs;
 112
 113         # All positions are jury
 114         if ($jury  =~ y/*// == length $jury) {
 115                 return @seqs;
 116         }
 117         # Goes through sequences and removes those positions that are not jury positions.
 118         for my $pos (reverse 0..length($jury)- 1) {
 119                 if (substr($jury, $pos, 1) eq '*') {
 120                         $_ = substr($_, 0, $pos).substr($_, $pos + 1) for @seqs;
 121                         $_ = substr($_, 0, $pos).substr($_, $pos + 1) for $jury;
 122                 }
 123         }
 124
 125         return @seqs;
 126 }
 127
 128 =head2 find_jury(@seqs)
 129
 130 Find those positions in a prediction that are in a jury position (those positions where the predictions agree).
 131
 132 Pass an array of sequences (represented as strings). Returns a string with the jury position represented by a space and non-jury positions as "*".
 133
 134 =cut
 135
 136 sub find_jury (@) {
 137         my (@seqs) = @_;
 138
 139         my $length = length $seqs[0];
 140         croak "Sequences of different length" if grep { $length != length $_ } @seqs;
 141
 142         my $jury;
 143         for my $pos (0..$length - 1) {
 144                 # Are all of the positions the same?
 145                 if (keys %{ { map { substr($_, $pos, 1), undef } @seqs } } == 1) {
 146                         $jury .= " "
 147                 }
 148                 else {
 149                         $jury .= "*"
 150                 }
 151         }
 152
 153         return $jury;
 154 }
 155
 156 =head2 @AoA = read_file_list($path)
 157
 158 Reads in a file from the $path of the format:
 159
 160   file1a file2a
 161   file1b file2b
 162   file1c file2c
 163   file1d file2d
 164   file1e file2e
 165
 166 Returns an array of arrays of the list of files. For the above file, this would be:
 167
 168   [
 169     [ "file1a", "file2a" ],
 170     [ "file1b", "file2b" ],
 171     [ "file1c", "file2c" ],
 172     [ "file1d", "file2d" ],
 173     [ "file1e", "file2e" ]
 174   ]
 175
 176 =cut
 177
 178 sub read_file_list ($) {
 179         my ($file) = @_;
 180
 181         local ($/, $_) = "\n";
 182
 183         open my $fh, $file or croak "Can't open file $file";
 184
 185         my @files;
 186         while (<$fh>) {
 187                 chomp;
 188                 push @files, [ split ];
 189         }
 190         return @files
 191 }
 192
 193 =head2 @AoA = profile(qw(PROTEIN MSA----))
 194
 195 Pass an array of proteins sequences represented as strings of one letter residues codes, returns an array of arrays of the frequences of each residue.
 196
 197 This is calculated using all of the sequences in the alignment, and doesn't count gaps, 'X' or 'Z'.
 198
 199 =cut
 200
 201 # Does a profile on a sequence
 202 sub profile (@) {
 203         my (@seqs) = @_;
 204         my @results;
 205
 206         # Check that all the sequences are the same length
 207         my $length = length $seqs[0];
 208         croak "Not all sequences are the same length" if grep { length $_ != $length } @seqs;
 209
 210         # Convert residues to integers
 211         my @ar = map { [ seq2int(split //, $_) ] } @seqs;
 212
 213         # Find out how many of each type of residue occur at a position and create
 214         # the profile
 215         for my $i (0..$length - 1) { # for each position
 216                 # The number of elements should be one larger than the max size in
 217                 # seq2int
 218                 my @countup = (0) x 20;
 219
 220                 $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++ for 0..$#ar; # for each sequence
 221
 222                 for (0..$#ar) {
 223                         $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++
 224                 }
 225
 226                 my $total = 0;
 227                 $total += $_ for @countup;
 228                 #$total += $countup[$_] for 0..$#countup;
 229
 230                 push @results, [
 231                         map {
 232                                 # Check $total for div/0
 233 #                               sprintf "%2.f", ($total ? $countup[$_] / $total * 10 : 0)
 234 #                       } 0..$#countup
 235                                 sprintf "%2.f", ($total ? $_ / $total * 10 : 0)
 236                         } @countup
 237                 ];
 238         }
 239
 240         return @results;
 241 }
 242
 243 =head2 @integers = seq2int(@residues)
 244
 245 Converts a sequence into integer values. @residues should be an array of characters of the one letter amino acids codes.
 246
 247 =head2 @residues = int2seq(@integers)
 248
 249 Converts an array of integers into one letter amino acids codes produced by seq2int().
 250
 251 =cut
 252
 253 {
 254         # This values should be as seq2int() in jnet.c
 255         my (%seq2int, %int2seq);
 256         @seq2int{split //, 'ARNDCQEGHILKMFPSTWYVBZX.-'} = (0..23, 23);
 257         $seq2int{U} = $seq2int{C};
 258         %int2seq = reverse %seq2int;
 259
 260         sub seq2int (@) {
 261                 map {
 262                         exists $seq2int{uc $_} ?
 263                                 $seq2int{uc $_} :
 264                                 croak "Residue '$_' not recognised"
 265                 } @_
 266         }
 267
 268         sub int2seq (@) {
 269                 map {
 270                         exists $int2seq{$_} ?
 271                                 $int2seq{$_} :
 272                                 croak "Residue '$_' not recognised"
 273                 } @_
 274         }
 275 }
 276
 277 =head2 xsystem($system)
 278
 279 Wraps the perl system() function. Returns undef if the command in $system doesn't work, perhaps, look at the code.
 280
 281 =cut
 282
 283 sub xsystem ($) {
 284         my $status = system(@_);
 285         if ($status == -1) {
 286                 carp "Failed to run command '@_': $!";
 287                 return;
 288         }
 289         elsif ($? & 127) {
 290                 printf "child died with signal %d, %s coredump\n", ($? & 127),  ($? & 128) ? 'with' : 'without';
 291                 return;
 292         }
 293         else {
 294                 #printf "child exited with value %d\n", $? >> 8
 295                 return $? >> 8;
 296         }
 297 }
 298
 299 =head2 $concise = concise_record($title, @data)
 300
 301 Simple function to give a line in a concise file. Pass the title of the concise file, and the data that you want in the record. Returns a string in the concise format with a newline.
 302
 303 =cut
 304
 305 sub concise_record ($@) {
 306         my ($title, @items) = @_;
 307         return "$title:".join(",", @items)."\n";
 308 }
 309
 310 {
 311         my %data = (
 312                 'A' => '118', 'B' => '162', 'C' => '146', 'D' => '159',
 313                 'E' => '186', 'F' => '223', 'G' => '88', 'H' => '203',                          'I' => '181', 'K' => '226', 'L' => '193', 'M' => '204',
 314                 'N' => '166', 'P' => '147', 'Q' => '193', 'R' => '256',
 315                 'S' => '130', 'T' => '153', 'V' => '165', 'W' => '266',
 316                 'X' => '200', 'Y' => '237', 'Z' => '189', 'c' => '146'
 317         );
 318
 319 =head2 $solv = rel_solv_acc($residue, $accesibility);
 320
 321 Pass the residue and the accessiblilty of the residue, and the relative solvent accesibility of the residue will be returned.
 322
 323 =cut
 324
 325         sub rel_solv_acc ($$) {
 326                 my ($residue, $acc) = @_;
 327                 return unless $data{uc $residue};
 328                 return $acc / $data{uc $residue};
 329         }
 330
 331 =head2 $solv = abs_solv_acc($residue, $accesibility);
 332
 333 Pass the residue and the accessiblilty of the residue, and the absolute solvent accesibility of the residue will be returned.
 334
 335 =cut
 336
 337         sub abs_solv_acc ($$) {
 338                 my ($residue, $acc) = @_;
 339                 return unless $data{uc $residue};
 340                 return $data{uc $residue} * $acc;
 341         }
 342 }
 343
 344 =head2 @shuffled_data = array_shuffle(@data)
 345
 346 Randomly shuffles an array.
 347
 348 =cut
 349
 350 sub array_shuffle (@) {
 351         my (@data) = @_;
 352         for (0..$#data) {
 353                 my ($foo, $bar) = (int(rand $#data), int(rand $#data));
 354                 @data[$foo, $bar] = @data[$bar, $foo];
 355         }
 356         return @data;
 357 }
 358
 359 =head2 array_shuffle_in_place(@data)
 360
 361 Randomly shuffle an array in place.
 362
 363 =cut
 364
 365 sub array_shuffle_in_place (@) {
 366         for (0..$#_) {
 367                 my ($foo, $bar) = (int(rand $#_), int(rand $#_));
 368                 @_[$foo, $bar] = @_[$bar, $foo];
 369         }
 370 }
 371
 372 =head2 @AoA = array_split($size, @data)
 373
 374 Splits an array into $size sets of data returned in an AoA.
 375
 376 =cut
 377
 378 sub array_split ($@) {
 379         my ($size, @array) = @_;
 380         my ($i, @temp) = 0;
 381         while (@array) { push @{ $temp[$i++ % $size] }, pop @array }
 382         return @temp;
 383 }
 384
 385
 386 =head2 $split_resid = qr/(-?\d+)([[:alpha:]])/;
 387
 388 Regular expression for dividing PDB residue offsets into the offset and letter components.
 389
 390 =cut
 391
 392 our $split_resid = qr/(-?\d+)([[:alpha:]])?/;
 393
 394 =head2 sort_resid
 395
 396 A subroutine for the sorting of PDB residue offsets, allows for the correct sorting of IDs such as "1A".
 397
 398 =cut
 399
 400 {
 401         no warnings;
 402         sub sort_resid {
 403                 (my ($a_off, $a_id) = $a =~ $split_resid);
 404                 (my ($b_off, $b_id) = $b =~ $split_resid);
 405
 406                 $a_off <=> $b_off || $a_id cmp $b_id;
 407         }
 408 }
 409
 410 =head2 @files = find_files($dir)
 411
 412 Find all files under $dir.
 413
 414 =cut
 415
 416 sub find_files ($) {
 417     my ($dir) = @_;
 418     my @files;
 419
 420     find(
 421         sub {
 422             push @files, $File::Find::dir."/".$_ if -e $_;
 423         },
 424         $dir
 425     );
 426
 427     return @files;
 428 }
 429
 430 1;