Jpred secondary structure prediction results

#!/usr/bin/perl use strict; use warnings; use Jpred; use Getopt::Long; my ($in, $out); my $format = 'seq'; GetOptions( "in=s" => \$in, "out=s" => \$out, "format=s" => \$format ); if (!$in) { $in = "-"; } if (!$out) { $out = "-"; } # Read in the concise CSV file. Data is stored in hash's with the key # being the name of the type of data and the value being the sequence # data, the order of the alignments is also stored for display in the # order with which they are aligned. my (@align, @seq, %predict, %prob); my @predictions = qw(Lupas_14 Lupas_21 Lupas_28 jnetpred JNETHMM JNETALIGN JNETPSSM JNETJURY JNETSOL25 JNETSOL5 JNETSOL0 JNETCONF); my @prob = qw(JNETPROPH JNETPROPB JNETPROPC); # Arrays that holds the order in which things are printed my @print_pred = qw(jnetpred JNETALIGN JNETHMM JNETPSSM); my @coils = qw(Lupas_14 Lupas_21 Lupas_28); my @solvent = qw(JNETSOL25 JNETSOL5 JNETSOL0); # Hash that relates data names to printed names my %convert = ( JNETALIGN => "jalign", JNETHMM => "jhmm", jnetpred => "Jnet", JNETPSSM => "jpssm", Lupas_14 => "Lupas 14", Lupas_21 => "Lupas 21", Lupas_28 => "Lupas 28", JNETSOL25 => "Jnet_25", JNETSOL5 => "Jnet_5", JNETSOL0 => "Jnet_0", JNETCONF => "Jnet Rel" ); # Read in the data open(IN, "<$in") or die($!); while () { if (/:/) { my ($id, $seq) = split(":", $_); if ($id =~ /;/) { # Find sequence alignments my ($align, $seqid) = split(";", $id); push @align, $seqid; $seq =~ s/,//g; chomp($seq); push @seq, $seq; } foreach (@predictions) { # Find the predictions if ($id eq $_) { if ($predict{$id}) { die("Same ID ($id) found twice in concise file!"); } $seq =~ s/,//g; chomp($seq); $predict{$id} = $seq; } } foreach (@prob) { if ($id eq $_) { if ($prob{$id}) { die("Same ID ($id) found twice in concise file!"); } chomp($seq); @_ = split(",", $seq); foreach (@_) { if ($_ >= 0.85) { $_ = 9; } else { $_ = int(($_ * 10) + 0.5); } } my $temp = join("", @_); $prob{$id} = $temp; } } } } close(IN); ############################################ # # Sets up the references for prediction # methods and deffn methods # my @predname = qw(jnetpred JNETALIGN JNETPSSM JNETHMM); foreach (@predname) { if (!$predict{$_}) { warn("\"$_\" data not present\n"); next; } $predict{$_} =~ s/[TCYWXZ]/-/g; } ###################################################################### # Print out the sequences to HTML ###################################################################### open(HTML, ">$out") or die($!); print HTML "Jpred secondary structure prediction results

";

#
# Prints the sequence passed to it as a reference to an array, second
# array contains the "conservation", which varies between 0 and 1 for
# each position.
#

sub printseq {
	my $seqref = shift @_;
	my $scoreref = shift @_;

	unless ($scoreref) {
		foreach (0..$#{$seqref}) {
			print HTML ${$seqref}[$_];
		}
		return;
	}
	
#	foreach (0..$#{$seqref} - 1) {
	foreach (0..$#{$seqref}) {
		my $colour = "#";	# Work out the colour
		if (${$scoreref}[$_] == 1) {
			$colour .= "0000ff";
		}
		else {
			$colour .= (sprintf "%-2.2lx", ${$scoreref}[$_] * 255).(sprintf "%-2.2lx", 0).(sprintf "%-2.2lx", 0);
		}

		print HTML "${$seqref}[$_]";
	}
}

# Prints the gap at the begining of the line between the sequence ID
# and the ":". First argument is the length of the ID and the second is
# the maximimum length.

sub printgap {
	my ($strlen, $max_len) = @_;
	foreach ($strlen .. $max_len) {
		print HTML " ";
		}
	print HTML ": ";
	}

# Works out a consensus score according to Zvelebil et al. (1987)
# J. Mol. Biol. 195, pp957, and does the smoothing

sub cons {
	my %conservation = (
		I => [1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
		L => [1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
		V => [1, 0, 0, 0, 0, 1, 0, 1, 0, 0],
		C => [1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
		A => [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
		G => [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
		M => [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
		F => [1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
		Y => [1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
		W => [1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
		H => [1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
		K => [1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
		R => [0, 1, 0, 1, 1, 0, 0, 0, 0, 0],
		E => [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
		Q => [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
		D => [0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
		N => [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
		S => [0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
		T => [1, 0, 0, 1, 0, 1, 0, 0, 0, 0],
		P => [0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
		Z => [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
		'-' => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
		'_' => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
		'.' => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
		X => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
		B => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # Not mentioned, so use X
		U => [1, 0, 0, 0, 0, 1, 0, 0, 0, 0], # Not mentioned, so use C
		);

	my $aref = shift @_;
	my (@cons, @align);

	# Get each sequence
	foreach (@{$aref}) {
		push @align, $_;
	}

	my $seq_len = length($align[0]);

	# Foreach position
	foreach my $pos (0..$seq_len - 1) {
		my @temp;

		# Foreach sequence, get the residue for this position
		foreach (@align) { push @temp, substr($_, $pos, 1); } 

		my ($p, $i, $j);
		for ($i = 0; $i < $#temp; $i++) {
			my $one = uc($temp[$i]);
			for ($j = $i + 1; $j < $#temp; $j++) {
				my $two = uc($temp[$j]);
				foreach (0..9) {
					# This debugging is in here
					# incase a residue occurs that we
					# haven't got in the hash above
					unless (defined $conservation{$one}[$_]) {
#						warn "first $one, $_\n";
#						die "'$conservation{$one}[$_]'\n";
						next;
					}
					unless (defined $conservation{$two}[$_]) {
#						warn "second $two, $_\n";
#						die "'$conservation{$two}[$_]'\n";
						next;
					}
					if ($conservation{$one}[$_] != $conservation{$two}[$_]) { $p++; }
				}
			}
		}

		# This loop looks to be v. slow
		if ($p) {
			if (($p = 0.9 - 0.1 * $p) < 0) { $p = 0; }
		}
		else {
			for ($i = 1; $i < $#temp; $i++) {
				if ($temp[$i] ne $temp[0]) {
					$p++;
					last;
				}
			}
			if ($p) { $p = 0.9; }
			else { $p = 1; }
		}

		push @cons, $p;
	}

	# Average the result over 3 residues
#	my (@final, $i);
#	push @final, $cons[0];
#	for ($i	= 1; $i < ($#cons); $i++) {
#		push @final, ($cons[$i - 1] + $cons[$i] + $cons[$i + 1]) / 3;
#		}
#	push @final, $cons[$#cons + 1];

	return @cons;
}

############################################################
# Uncomment this to get the conservation scoring worked out
############################################################
#my @score = &cons(\@seq);

#
# Print the sequences
#

# Find the longest identifier and set the space at the start of the line

my $max_len = 0;
foreach (@align) {
	if (length($_) > $max_len) { $max_len = length($_); }
	}
if ($max_len <= 6) { $max_len = 11; }

foreach (0..$#align) {
	# Begining of line
	if ($_ == 0) {
		print HTML "$align[$_]";
		}
	else {
	   if ($format eq 'seq') {
	      ## differentiate between Uniprot and Uniparc (UniRef90_UNI\w+) IDs as they require
	      ## different SRS query strings
	      if ($align[$_] =~ /UniRef90_(UPI\w+)/) {
	         print HTML "$align[$_]";
	      } else {
	         print HTML "$align[$_]";
	      }
	   } else {
	      print HTML "$align[$_]";
	   }
		}
	&printgap(length($align[$_]) ,$max_len);

	# Print the sequence
	@_ = split("", $seq[$_] );
#	&printseq([@_], [@score]);
	&printseq([@_]);

	# End of the line
	if ($_ == 0) {
		print HTML " : $align[$_] \n";
		}
	else {
	   if ($format eq 'seq') {
              ## differentiate between Uniprot and Uniparc IDs as they require
              ## different SRS query strings
              if ($align[$_] =~ /UniRef90_(UPI\w+)/) {
                 print HTML " : $align[$_]\n";
              } else {
                 print HTML " : $align[$_]\n";
              }
	   } else {
	      print HTML " : $align[$_]\n";
	   }
		}
	}
print HTML "\n";

######################################################################

#
# Create a guide sequence of the original query
# (of the form 1----------11--- etc.)
#

my ($guide, $i);
my $seq_len = length($seq[0]);

for ($i = 1; $i < $seq_len; $i += 10) {
	my $dash;
	my $no_dash = 9 - length($i);
	foreach (0..$no_dash) { $dash .= "-"; }
	$guide .= $i.$dash;
	}

while (length($guide) > $seq_len) {	# Trim spare guide away
	chop($guide);
	}

printgap(0, $max_len);
print HTML $guide." :\n";

print HTML "OrigSeq";
printgap(7, $max_len);
print HTML $seq[0];
print HTML " : OrigSeq\n\n";

# Print out secondary structure predictions, goes through in order of
# @print_pred so comes out right

foreach (@print_pred) {
	if (!$predict{$_}) { next; }
	if ($_ eq "JNETALIGN") {	#  Add a line before JNETALIGN which will put jnetpred on its own
		print HTML "\n";
		}

	print HTML $convert{$_};
	printgap(length($convert{$_}), $max_len);
	@_ = split("", $predict{$_});
	foreach (@_) {
		if ($_ eq "E") {
			print HTML "$_";
			}
		elsif ($_ eq "H") {
			print HTML "$_";
			}
		elsif ($_ eq "-") {
			print HTML $_;
			}
		}
	print HTML " : $convert{$_}\n";
	}
print HTML "\n";

foreach (@coils) {
	if (!$predict{$_}) { next; }
	print HTML $convert{$_};
	printgap(length($convert{$_}), $max_len); 
	@_ = split("", $predict{$_});
	foreach (@_) {
		if ($_ eq "C") { print HTML "$_"; }
		else { print HTML $_; }
		}
	print HTML " : $convert{$_}\n";
	}
print HTML "\n";

foreach (@solvent) {
	if (!$predict{$_}) { next; }
	print HTML $convert{$_};
	printgap(length($convert{$_}), $max_len);
	@_ = split("", $predict{$_});
	foreach (@_) {
		if ($_ eq "B") {
			print HTML "$_";
			}
		else { print HTML $_; }
		}
	print HTML " : $convert{$_}\n";
	}

printf HTML $convert{"JNETCONF"};
printgap(8, $max_len);

@_ = split("", $predict{"JNETCONF"});
foreach (@_) {
	if ($_ >= 7) { print HTML "$_"; }
	else { print HTML "$_"; }
	}
print HTML " : ".$convert{"JNETCONF"}."\n";

print HTML "

"; print HTML "

"; # # Key to data # print HTML "

";
print HTML "\nNotes
Key:
Colour code for alignment:
Blue            - Complete identity at a position
Shades of red   - The more red a position is, the higher the level of
                  conservation of chemical properties of the amino acids
Jnet            - Final secondary structure prediction for query
jalign          - Jnet alignment prediction
jhmm            - Jnet hmm profile prediction
jpssm           - Jnet PSIBLAST pssm profile prediction

Lupas           - Lupas Coil prediction (window size of 14, 21 and 28)

Note on coiled coil predictions  - = less than 50% probability
                                 c = between 50% and 90% probability
                                 C = greater than 90% probability

Jnet_25         - Jnet prediction of burial, less than 25% solvent accesibility
Jnet_5          - Jnet prediction of burial, less than 5% exposure
Jnet_0          - Jnet prediction of burial, 0% exposure
Jnet Rel        - Jnet reliability of prediction accuracy, ranges from 0 to 9, bigger is better.
";

print HTML "

$JPREDFOOT"; close(HTML);