package Utils; use strict; use warnings; use Carp; use File::Temp qw(tempfile); use File::Find qw(find); use base qw(Exporter); our @EXPORT_OK = qw(xsystem profile reduce_dssp conf jury find_jury read_file_list concise_record seq2int int2seq abs_solv_acc rel_solv_acc array_shuffle array_shuffle_in_place array_split $split_resid sort_resid find_files); push @EXPORT_OK, map { "reduce_dssp_$_" } qw(a b c jpred); # Reduce eight state DSSP definition to three states sub reduce_dssp ($;&) { my ($seq, $code_ref) = @_; $code_ref ||= \&reduce_dssp_jpred; return $code_ref->($seq); } # reduce_dssp_[abc] are the methods mentioned in the Jpred # paper sub reduce_dssp_a ($) { my ($sec_string) = @_; $sec_string =~ tr/EBGH/EEHH/; $sec_string =~ tr/EH/-/c; return $sec_string; } sub reduce_dssp_b ($) { my ($sec_string) = @_; $sec_string =~ tr/EH/-/c; $sec_string =~ s/(?= $limit) { for (@seqs, $conf) { $_ = substr($_, 0, $pos).substr($_, $pos) } } } return @seqs; } =head2 jury(); Find the positions where there is jury agreement First argument is the jury sequence, the rest are sequences to be subselected. =cut sub jury ($$;@) { my ($jury, @seqs) = @_; croak "Sequences of different length" if grep { length $_ != length $jury } @seqs; # All positions are jury if ($jury =~ y/*// == length $jury) { return @seqs; } # Goes through sequences and removes those positions that are not jury positions. for my $pos (reverse 0..length($jury)- 1) { if (substr($jury, $pos, 1) eq '*') { $_ = substr($_, 0, $pos).substr($_, $pos + 1) for @seqs; $_ = substr($_, 0, $pos).substr($_, $pos + 1) for $jury; } } return @seqs; } =head2 find_jury(@seqs) Find those positions in a prediction that are in a jury position (those positions where the predictions agree). Pass an array of sequences (represented as strings). Returns a string with the jury position represented by a space and non-jury positions as "*". =cut sub find_jury (@) { my (@seqs) = @_; my $length = length $seqs[0]; croak "Sequences of different length" if grep { $length != length $_ } @seqs; my $jury; for my $pos (0..$length - 1) { # Are all of the positions the same? if (keys %{ { map { substr($_, $pos, 1), undef } @seqs } } == 1) { $jury .= " " } else { $jury .= "*" } } return $jury; } =head2 @AoA = read_file_list($path) Reads in a file from the $path of the format: file1a file2a file1b file2b file1c file2c file1d file2d file1e file2e Returns an array of arrays of the list of files. For the above file, this would be: [ [ "file1a", "file2a" ], [ "file1b", "file2b" ], [ "file1c", "file2c" ], [ "file1d", "file2d" ], [ "file1e", "file2e" ] ] =cut sub read_file_list ($) { my ($file) = @_; local ($/, $_) = "\n"; open my $fh, $file or croak "Can't open file $file"; my @files; while (<$fh>) { chomp; push @files, [ split ]; } return @files } =head2 @AoA = profile(qw(PROTEIN MSA----)) Pass an array of proteins sequences represented as strings of one letter residues codes, returns an array of arrays of the frequences of each residue. This is calculated using all of the sequences in the alignment, and doesn't count gaps, 'X' or 'Z'. =cut # Does a profile on a sequence sub profile (@) { my (@seqs) = @_; my @results; # Check that all the sequences are the same length my $length = length $seqs[0]; croak "Not all sequences are the same length" if grep { length $_ != $length } @seqs; # Convert residues to integers my @ar = map { [ seq2int(split //, $_) ] } @seqs; # Find out how many of each type of residue occur at a position and create # the profile for my $i (0..$length - 1) { # for each position # The number of elements should be one larger than the max size in # seq2int my @countup = (0) x 20; $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++ for 0..$#ar; # for each sequence for (0..$#ar) { $ar[$_][$i] < 20 && $countup[ $ar[$_][$i] ]++ } my $total = 0; $total += $_ for @countup; #$total += $countup[$_] for 0..$#countup; push @results, [ map { # Check $total for div/0 # sprintf "%2.f", ($total ? $countup[$_] / $total * 10 : 0) # } 0..$#countup sprintf "%2.f", ($total ? $_ / $total * 10 : 0) } @countup ]; } return @results; } =head2 @integers = seq2int(@residues) Converts a sequence into integer values. @residues should be an array of characters of the one letter amino acids codes. =head2 @residues = int2seq(@integers) Converts an array of integers into one letter amino acids codes produced by seq2int(). =cut { # This values should be as seq2int() in jnet.c my (%seq2int, %int2seq); @seq2int{split //, 'ARNDCQEGHILKMFPSTWYVBZX.-'} = (0..23, 23); $seq2int{U} = $seq2int{C}; %int2seq = reverse %seq2int; sub seq2int (@) { map { exists $seq2int{uc $_} ? $seq2int{uc $_} : croak "Residue '$_' not recognised" } @_ } sub int2seq (@) { map { exists $int2seq{$_} ? $int2seq{$_} : croak "Residue '$_' not recognised" } @_ } } =head2 xsystem($system) Wraps the perl system() function. Returns undef if the command in $system doesn't work, perhaps, look at the code. =cut sub xsystem ($) { my $status = system(@_); if ($status == -1) { carp "Failed to run command '@_': $!"; return; } elsif ($? & 127) { printf "child died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; return; } else { #printf "child exited with value %d\n", $? >> 8 return $? >> 8; } } =head2 $concise = concise_record($title, @data) Simple function to give a line in a concise file. Pass the title of the concise file, and the data that you want in the record. Returns a string in the concise format with a newline. =cut sub concise_record ($@) { my ($title, @items) = @_; return "$title:".join(",", @items)."\n"; } { my %data = ( 'A' => '118', 'B' => '162', 'C' => '146', 'D' => '159', 'E' => '186', 'F' => '223', 'G' => '88', 'H' => '203', 'I' => '181', 'K' => '226', 'L' => '193', 'M' => '204', 'N' => '166', 'P' => '147', 'Q' => '193', 'R' => '256', 'S' => '130', 'T' => '153', 'V' => '165', 'W' => '266', 'X' => '200', 'Y' => '237', 'Z' => '189', 'c' => '146' ); =head2 $solv = rel_solv_acc($residue, $accesibility); Pass the residue and the accessiblilty of the residue, and the relative solvent accesibility of the residue will be returned. =cut sub rel_solv_acc ($$) { my ($residue, $acc) = @_; return unless $data{uc $residue}; return $acc / $data{uc $residue}; } =head2 $solv = abs_solv_acc($residue, $accesibility); Pass the residue and the accessiblilty of the residue, and the absolute solvent accesibility of the residue will be returned. =cut sub abs_solv_acc ($$) { my ($residue, $acc) = @_; return unless $data{uc $residue}; return $data{uc $residue} * $acc; } } =head2 @shuffled_data = array_shuffle(@data) Randomly shuffles an array. =cut sub array_shuffle (@) { my (@data) = @_; for (0..$#data) { my ($foo, $bar) = (int(rand $#data), int(rand $#data)); @data[$foo, $bar] = @data[$bar, $foo]; } return @data; } =head2 array_shuffle_in_place(@data) Randomly shuffle an array in place. =cut sub array_shuffle_in_place (@) { for (0..$#_) { my ($foo, $bar) = (int(rand $#_), int(rand $#_)); @_[$foo, $bar] = @_[$bar, $foo]; } } =head2 @AoA = array_split($size, @data) Splits an array into $size sets of data returned in an AoA. =cut sub array_split ($@) { my ($size, @array) = @_; my ($i, @temp) = 0; while (@array) { push @{ $temp[$i++ % $size] }, pop @array } return @temp; } =head2 $split_resid = qr/(-?\d+)([[:alpha:]])/; Regular expression for dividing PDB residue offsets into the offset and letter components. =cut our $split_resid = qr/(-?\d+)([[:alpha:]])?/; =head2 sort_resid A subroutine for the sorting of PDB residue offsets, allows for the correct sorting of IDs such as "1A". =cut { no warnings; sub sort_resid { (my ($a_off, $a_id) = $a =~ $split_resid); (my ($b_off, $b_id) = $b =~ $split_resid); $a_off <=> $b_off || $a_id cmp $b_id; } } =head2 @files = find_files($dir) Find all files under $dir. =cut sub find_files ($) { my ($dir) = @_; my @files; find( sub { push @files, $File::Find::dir."/".$_ if -e $_; }, $dir ); return @files; } 1;