Jpred secondary structure prediction results

X-Git-Url: http://source.jalview.org/gitweb/?p=jpred.git;a=blobdiff_plain;f=websoft%2Fbin%2Fconcise2html;fp=websoft%2Fbin%2Fconcise2html;h=105501bb53cb29956a11a96083233f126b0d4f9b;hp=0000000000000000000000000000000000000000;hb=443c228bf0712d71e7fa34b5a2dc4b2b2e79f13f;hpb=9aa768067094f24f46f273077f867348e6143711 diff --git a/websoft/bin/concise2html b/websoft/bin/concise2html new file mode 100755 index 0000000..105501b --- /dev/null +++ b/websoft/bin/concise2html @@ -0,0 +1,445 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use Jpred; +use Getopt::Long; + +my ($in, $out); +my $format = 'seq'; +GetOptions( + "in=s" => \$in, + "out=s" => \$out, + "format=s" => \$format +); + +if (!$in) { $in = "-"; } +if (!$out) { $out = "-"; } + +# Read in the concise CSV file. Data is stored in hash's with the key +# being the name of the type of data and the value being the sequence +# data, the order of the alignments is also stored for display in the +# order with which they are aligned. + +my (@align, @seq, %predict, %prob); + +my @predictions = qw(Lupas_14 Lupas_21 Lupas_28 jnetpred JNETHMM JNETALIGN JNETPSSM JNETJURY JNETSOL25 JNETSOL5 JNETSOL0 JNETCONF); + +my @prob = qw(JNETPROPH JNETPROPB JNETPROPC); + +# Arrays that holds the order in which things are printed +my @print_pred = qw(jnetpred JNETALIGN JNETHMM JNETPSSM); +my @coils = qw(Lupas_14 Lupas_21 Lupas_28); +my @solvent = qw(JNETSOL25 JNETSOL5 JNETSOL0); + +# Hash that relates data names to printed names +my %convert = ( + JNETALIGN => "jalign", + JNETHMM => "jhmm", + jnetpred => "Jnet", + JNETPSSM => "jpssm", + + Lupas_14 => "Lupas 14", + Lupas_21 => "Lupas 21", + Lupas_28 => "Lupas 28", + + JNETSOL25 => "Jnet_25", + JNETSOL5 => "Jnet_5", + JNETSOL0 => "Jnet_0", + JNETCONF => "Jnet Rel" +); + +# Read in the data +open(IN, "<$in") or die($!); +while () { + if (/:/) { + my ($id, $seq) = split(":", $_); + + if ($id =~ /;/) { # Find sequence alignments + my ($align, $seqid) = split(";", $id); + push @align, $seqid; + $seq =~ s/,//g; + chomp($seq); + push @seq, $seq; + } + + foreach (@predictions) { # Find the predictions + if ($id eq $_) { + if ($predict{$id}) { die("Same ID ($id) found twice in concise file!"); } + $seq =~ s/,//g; + chomp($seq); + $predict{$id} = $seq; + } + } + + foreach (@prob) { + if ($id eq $_) { + if ($prob{$id}) { die("Same ID ($id) found twice in concise file!"); } + chomp($seq); + @_ = split(",", $seq); + foreach (@_) { + if ($_ >= 0.85) { $_ = 9; } + else { $_ = int(($_ * 10) + 0.5); } + } + my $temp = join("", @_); + $prob{$id} = $temp; + } + } + } + } +close(IN); + +############################################ +# +# Sets up the references for prediction +# methods and deffn methods +# + +my @predname = qw(jnetpred JNETALIGN JNETPSSM JNETHMM); +foreach (@predname) { + if (!$predict{$_}) { warn("\"$_\" data not present\n"); next; } + $predict{$_} =~ s/[TCYWXZ]/-/g; + } + +###################################################################### +# Print out the sequences to HTML +###################################################################### + +open(HTML, ">$out") or die($!); +print HTML "Jpred secondary structure prediction results

";
+
+#
+# Prints the sequence passed to it as a reference to an array, second
+# array contains the "conservation", which varies between 0 and 1 for
+# each position.
+#
+
+sub printseq {
+	my $seqref = shift @_;
+	my $scoreref = shift @_;
+
+	unless ($scoreref) {
+		foreach (0..$#{$seqref}) {
+			print HTML ${$seqref}[$_];
+		}
+		return;
+	}
+	
+#	foreach (0..$#{$seqref} - 1) {
+	foreach (0..$#{$seqref}) {
+		my $colour = "#";	# Work out the colour
+		if (${$scoreref}[$_] == 1) {
+			$colour .= "0000ff";
+		}
+		else {
+			$colour .= (sprintf "%-2.2lx", ${$scoreref}[$_] * 255).(sprintf "%-2.2lx", 0).(sprintf "%-2.2lx", 0);
+		}
+
+		print HTML "${$seqref}[$_]";
+	}
+}
+
+# Prints the gap at the begining of the line between the sequence ID
+# and the ":". First argument is the length of the ID and the second is
+# the maximimum length.
+
+sub printgap {
+	my ($strlen, $max_len) = @_;
+	foreach ($strlen .. $max_len) {
+		print HTML " ";
+		}
+	print HTML ": ";
+	}
+
+# Works out a consensus score according to Zvelebil et al. (1987)
+# J. Mol. Biol. 195, pp957, and does the smoothing
+
+sub cons {
+	my %conservation = (
+		I => [1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+		L => [1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+		V => [1, 0, 0, 0, 0, 1, 0, 1, 0, 0],
+		C => [1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
+		A => [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
+		G => [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
+		M => [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+		F => [1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
+		Y => [1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
+		W => [1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
+		H => [1, 1, 0, 1, 1, 0, 0, 0, 1, 0],
+		K => [1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
+		R => [0, 1, 0, 1, 1, 0, 0, 0, 0, 0],
+		E => [0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
+		Q => [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
+		D => [0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
+		N => [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
+		S => [0, 0, 0, 1, 0, 1, 1, 0, 0, 0],
+		T => [1, 0, 0, 1, 0, 1, 0, 0, 0, 0],
+		P => [0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
+		Z => [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
+		'-' => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+		'_' => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+		'.' => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+		X => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+		B => [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # Not mentioned, so use X
+		U => [1, 0, 0, 0, 0, 1, 0, 0, 0, 0], # Not mentioned, so use C
+		);
+
+	my $aref = shift @_;
+	my (@cons, @align);
+
+	# Get each sequence
+	foreach (@{$aref}) {
+		push @align, $_;
+	}
+
+	my $seq_len = length($align[0]);
+
+	# Foreach position
+	foreach my $pos (0..$seq_len - 1) {
+		my @temp;
+
+		# Foreach sequence, get the residue for this position
+		foreach (@align) { push @temp, substr($_, $pos, 1); } 
+
+		my ($p, $i, $j);
+		for ($i = 0; $i < $#temp; $i++) {
+			my $one = uc($temp[$i]);
+			for ($j = $i + 1; $j < $#temp; $j++) {
+				my $two = uc($temp[$j]);
+				foreach (0..9) {
+					# This debugging is in here
+					# incase a residue occurs that we
+					# haven't got in the hash above
+					unless (defined $conservation{$one}[$_]) {
+#						warn "first $one, $_\n";
+#						die "'$conservation{$one}[$_]'\n";
+						next;
+					}
+					unless (defined $conservation{$two}[$_]) {
+#						warn "second $two, $_\n";
+#						die "'$conservation{$two}[$_]'\n";
+						next;
+					}
+					if ($conservation{$one}[$_] != $conservation{$two}[$_]) { $p++; }
+				}
+			}
+		}
+
+		# This loop looks to be v. slow
+		if ($p) {
+			if (($p = 0.9 - 0.1 * $p) < 0) { $p = 0; }
+		}
+		else {
+			for ($i = 1; $i < $#temp; $i++) {
+				if ($temp[$i] ne $temp[0]) {
+					$p++;
+					last;
+				}
+			}
+			if ($p) { $p = 0.9; }
+			else { $p = 1; }
+		}
+
+		push @cons, $p;
+	}
+
+	# Average the result over 3 residues
+#	my (@final, $i);
+#	push @final, $cons[0];
+#	for ($i	= 1; $i < ($#cons); $i++) {
+#		push @final, ($cons[$i - 1] + $cons[$i] + $cons[$i + 1]) / 3;
+#		}
+#	push @final, $cons[$#cons + 1];
+
+	return @cons;
+}
+
+############################################################
+# Uncomment this to get the conservation scoring worked out
+############################################################
+#my @score = &cons(\@seq);
+
+#
+# Print the sequences
+#
+
+# Find the longest identifier and set the space at the start of the line
+
+my $max_len = 0;
+foreach (@align) {
+	if (length($_) > $max_len) { $max_len = length($_); }
+	}
+if ($max_len <= 6) { $max_len = 11; }
+
+foreach (0..$#align) {
+	# Begining of line
+	if ($_ == 0) {
+		print HTML "$align[$_]";
+		}
+	else {
+	   if ($format eq 'seq') {
+	      ## differentiate between Uniprot and Uniparc (UniRef90_UNI\w+) IDs as they require
+	      ## different SRS query strings
+	      if ($align[$_] =~ /UniRef90_(UPI\w+)/) {
+	         print HTML "$align[$_]";
+	      } else {
+	         print HTML "$align[$_]";
+	      }
+	   } else {
+	      print HTML "$align[$_]";
+	   }
+		}
+	&printgap(length($align[$_]) ,$max_len);
+
+	# Print the sequence
+	@_ = split("", $seq[$_] );
+#	&printseq([@_], [@score]);
+	&printseq([@_]);
+
+	# End of the line
+	if ($_ == 0) {
+		print HTML " : $align[$_] \n";
+		}
+	else {
+	   if ($format eq 'seq') {
+              ## differentiate between Uniprot and Uniparc IDs as they require
+              ## different SRS query strings
+              if ($align[$_] =~ /UniRef90_(UPI\w+)/) {
+                 print HTML " : $align[$_]\n";
+              } else {
+                 print HTML " : $align[$_]\n";
+              }
+	   } else {
+	      print HTML " : $align[$_]\n";
+	   }
+		}
+	}
+print HTML "\n";
+
+######################################################################
+
+#
+# Create a guide sequence of the original query
+# (of the form 1----------11--- etc.)
+#
+
+my ($guide, $i);
+my $seq_len = length($seq[0]);
+
+for ($i = 1; $i < $seq_len; $i += 10) {
+	my $dash;
+	my $no_dash = 9 - length($i);
+	foreach (0..$no_dash) { $dash .= "-"; }
+	$guide .= $i.$dash;
+	}
+
+while (length($guide) > $seq_len) {	# Trim spare guide away
+	chop($guide);
+	}
+
+printgap(0, $max_len);
+print HTML $guide." :\n";
+
+print HTML "OrigSeq";
+printgap(7, $max_len);
+print HTML $seq[0];
+print HTML " : OrigSeq\n\n";
+
+# Print out secondary structure predictions, goes through in order of
+# @print_pred so comes out right
+
+foreach (@print_pred) {
+	if (!$predict{$_}) { next; }
+	if ($_ eq "JNETALIGN") {	#  Add a line before JNETALIGN which will put jnetpred on its own
+		print HTML "\n";
+		}
+
+	print HTML $convert{$_};
+	printgap(length($convert{$_}), $max_len);
+	@_ = split("", $predict{$_});
+	foreach (@_) {
+		if ($_ eq "E") {
+			print HTML "$_";
+			}
+		elsif ($_ eq "H") {
+			print HTML "$_";
+			}
+		elsif ($_ eq "-") {
+			print HTML $_;
+			}
+		}
+	print HTML " : $convert{$_}\n";
+	}
+print HTML "\n";
+
+foreach (@coils) {
+	if (!$predict{$_}) { next; }
+	print HTML $convert{$_};
+	printgap(length($convert{$_}), $max_len); 
+	@_ = split("", $predict{$_});
+	foreach (@_) {
+		if ($_ eq "C") { print HTML "$_"; }
+		else { print HTML $_; }
+		}
+	print HTML " : $convert{$_}\n";
+	}
+print HTML "\n";
+
+foreach (@solvent) {
+	if (!$predict{$_}) { next; }
+	print HTML $convert{$_};
+	printgap(length($convert{$_}), $max_len);
+	@_ = split("", $predict{$_});
+	foreach (@_) {
+		if ($_ eq "B") {
+			print HTML "$_";
+			}
+		else { print HTML $_; }
+		}
+	print HTML " : $convert{$_}\n";
+	}
+
+printf HTML $convert{"JNETCONF"};
+printgap(8, $max_len);
+
+@_ = split("", $predict{"JNETCONF"});
+foreach (@_) {
+	if ($_ >= 7) { print HTML "$_"; }
+	else { print HTML "$_"; }
+	}
+print HTML " : ".$convert{"JNETCONF"}."\n";
+
+print HTML "

"; +print HTML "

"; + +# +# Key to data +# + +print HTML "

";
+print HTML "\nNotes
+Key:
+Colour code for alignment:
+Blue            - Complete identity at a position
+Shades of red   - The more red a position is, the higher the level of
+                  conservation of chemical properties of the amino acids
+Jnet            - Final secondary structure prediction for query
+jalign          - Jnet alignment prediction
+jhmm            - Jnet hmm profile prediction
+jpssm           - Jnet PSIBLAST pssm profile prediction
+
+Lupas           - Lupas Coil prediction (window size of 14, 21 and 28)
+
+Note on coiled coil predictions  - = less than 50% probability
+                                 c = between 50% and 90% probability
+                                 C = greater than 90% probability
+
+Jnet_25         - Jnet prediction of burial, less than 25% solvent accesibility
+Jnet_5          - Jnet prediction of burial, less than 5% exposure
+Jnet_0          - Jnet prediction of burial, 0% exposure
+Jnet Rel        - Jnet reliability of prediction accuracy, ranges from 0 to 9, bigger is better.
+";
+
+print HTML "

$JPREDFOOT"; +close(HTML);