X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=binaries%2Fsrc%2FViennaRNA%2FUtils%2Fcoloraln.pl;fp=binaries%2Fsrc%2FViennaRNA%2FUtils%2Fcoloraln.pl;h=4663e8319fbf06c104db4f315a996912023565e4;hb=7522ace91fc0804a9719dbac9f68bc8154da3132;hp=0000000000000000000000000000000000000000;hpb=8116c0444fe98e8eb21bcdd8ded06e1429085823;p=jabaws.git diff --git a/binaries/src/ViennaRNA/Utils/coloraln.pl b/binaries/src/ViennaRNA/Utils/coloraln.pl new file mode 100644 index 0000000..4663e83 --- /dev/null +++ b/binaries/src/ViennaRNA/Utils/coloraln.pl @@ -0,0 +1,564 @@ +#!/usr/bin/perl -w + +use Getopt::Long; +use Pod::Usage; +#use POSIX; +#use GD; +use strict; + +my $ssFile='alirna.ps'; +my $columnWidth=120; +my $formatT=0; +my $man=0; +my $ruler=0; +my $resnum=0; + +GetOptions ('-s:s' => \$ssFile, + '-c:i' => \$columnWidth, + '-t' => \$formatT, + '-r' => \$ruler, + '-n' => \$resnum, + "help"=> \$man, + "man" => \$man, + "h"=> sub {pod2usage(-verbose => 1)}, + ) || pod2usage(-verbose => 0); + +pod2usage(-verbose => 2) if $man; + +if (!-e $ssFile) { + pod2usage(-msg => "RNAalifold secondary structure file $ssFile not found (use option -s)", + -verbose => 0); + exit(1); +} + +my $aln=readClustal(); + +my @ss=(); +for my $i (1..length($aln->[0]->{seq})) { + push @ss,'.'; +} + +my $consStruc=''; + +open(ALIRNA,"<$ssFile"); +my $pairsFlag=0; +my $sequenceFlag=0; +while () { + $pairsFlag=1 if (/\/pairs/); + if ($pairsFlag and /\[(\d+) (\d+)\]/) { + $ss[$1-1]='('; + $ss[$2-1]=')'; + } + $pairsFlag=0 if ($pairsFlag and /def/); +} + +$consStruc=join('',@ss); + +for my $row (@$aln) { + $row->{seq}=uc($row->{seq}); + if ($formatT) { + $row->{seq}=~s/U/T/g; + } else { + $row->{seq}=~s/T/U/g; + } +} + + +my $consSeq=consensusSeq($aln); + +#print "$consSeq\n$consStruc\n"; + +plotAln($aln,$consSeq,$consStruc); + + + +###################################################################### +# +# consensusSeq(\@aln alnref) +# +# Returns consensus sequence of alignment +# +###################################################################### + +sub consensusSeq{ + + my @aln=@{$_[0]}; + my $out=''; + + for my $i (0..length($aln[0]->{seq})-1) { + + my %countHash=('A'=>0,'C'=>0,'G'=>0,'T'=>0,'U'=>0,'-'=>0); + #print %countHash,"\n"; + for my $j (0..$#aln) { + my $c=substr($aln[$j]->{seq},$i,1); + $countHash{$c}++; + } + #print %countHash,"\n"; + my $maxCount=0; + my $maxChar=''; + + for my $c ('A','C','G','T','U','-') { + if ($countHash{$c}>=$maxCount) { + $maxChar=$c; + $maxCount=$countHash{$c}; + } + } + $out.=$maxChar; + } + return $out; +} + + +###################################################################### +# +# readClustal(filehandle) +# +# Reads Clustal W formatted alignment file and returns it in list of +# hash references with keys "name" and "seq" for the name and the sequence, +# resp. +# +###################################################################### + +sub readClustal{ + # my $fh=shift; + my @out=(); + my (%order, $order, %alignments); + while (<>) { + next if ( /^\s+$/ ); + my ($seqname, $aln_line) = ('', ''); + if ( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) { + # clustal 1.4 format + ($seqname,$aln_line) = ("$1/$2-$3",$4); + } elsif ( /^(\S+)\s+([A-Z\-]+)\s*\d*$/ ) { + ($seqname,$aln_line) = ($1,$2); + } else { + next; + } + if ( !exists $order{$seqname} ) { + $order{$seqname} = $order++; + } + $alignments{$seqname} .= $aln_line; + } + + foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) { + if ( $name =~ /(\S+):(\d+)-(\d+)/ ) { + (my $sname,my $start, my $end) = ($1,$2,$3); + } else { + (my $sname, my $start) = ($name,1); + my $str = $alignments{$name}; + $str =~ s/[^A-Za-z]//g; + my $end = length($str); + } + my $seq=$alignments{$name}; + push @out, {name=>$name,seq=>$seq}; + } + return [@out]; +} + +###################################################################### +# +# getPairs(\@aln alnref, $ss string) +# +# Evalutates the pairing of an alignment according to a given +# consensus secondary structure +# +# Returns list of all base pairs which is a hash with the following +# keys: +# +# open ... column in the alignment of the first base in the pair "(" +# close ... column in the alignment of the second base in the pair ")" +# all ... list of all basepairs in the different sequences in the alignment +# pairing ... list of all different pairing basepairs +# nonpairing ... list of all incompatible basepairs +# +###################################################################### + +sub getPairs{ + + my @inputAln=@{$_[0]}; + my $ss=$_[1]; + + # return nothing if there are no pairs + if (!($ss=~tr/(/(/)) { + return (); + } + + my @aln=(); + foreach my $row (@inputAln) { + my $seq=$row->{seq}; + $seq=uc($seq); + $seq=~s/T/U/g; + my @tmp=split(//,$seq); + push @aln,\@tmp; + } + my @ss=split(//,$ss); + + my @pairs=(); + my @stack=(); + + foreach my $column (0..$#ss) { + + my $currChar=$ss[$column]; + + if ($currChar eq '(') { + push @stack,$column; + } + + if ($currChar eq ')') { + my $openedCol=pop @stack; + push @pairs,{open=>$openedCol,close=>$column}; + } + } + + @pairs=sort {$a->{open} <=> $b->{open}} @pairs; + + foreach my $i (0..$#pairs) { + #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n"; + + my @all=(); + my @pairing=(); + my @nonpairing=(); + + for my $j (0..$#aln) { + my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}]; + push @all,$currPair; + } + + for my $pair (@all) { + if (($pair eq 'AU') or + ($pair eq 'UA') or + ($pair eq 'GC') or + ($pair eq 'CG') or + ($pair eq 'UG') or + ($pair eq 'GU')) { + + push @pairing, $pair; + } elsif ($pair eq "--") { + # do nothing + } else { + push @nonpairing,$pair; + } + } + + undef my %saw; + my @uniquePairing = grep(!$saw{$_}++, @pairing); + + $pairs[$i]->{all}=[@all]; + $pairs[$i]->{pairing}=[@uniquePairing]; + $pairs[$i]->{nonpairing}=[@nonpairing]; + } + + return @pairs; + +} + + +###################################################################### +# +# plotAln(\@aln ref-to-alignment, $consensus string, $ss string) +# +# Creates a image of the alignment with various annotations +# +# \@aln ... alignment in list of hash format +# $consensus ... consensus sequence +# $ss ... consensus secondary structure in dot bracket notation +# +# Returns png as string. +# +###################################################################### + +sub plotAln{ + + my @aln=@{$_[0]}; + my $consensus=$_[1]; + my $ss=$_[2]; + + my $ps=''; + + # Get important measures + (my $fontWidth, my $fontHeight) = (6,6.5); + + my $length=length($aln[0]->{seq}); + my $maxName=0; + foreach my $row (@aln) { + $maxName=length($row->{name}) if (length($row->{name})>$maxName); + } + $maxName = 5 if $maxName<5 && $ruler; + + # Custom Sizes + my $lineStep=$fontHeight+2; # distance between lines + my $blockStep=3.5*$fontHeight; # distance between blocks + my $consStep=$fontHeight*0.5; # distance between alignment and conservation curve + my $ssStep=2; + my $nameStep=3*$fontWidth; # distance between names and sequences + my $maxConsBar=2.5*$fontHeight; # Height of conservation curve + my $startY=2; # "y origin" + my $namesX=$fontWidth; # "x origin" + my $seqsX=$namesX+$maxName*$fontWidth+$nameStep; # x-coord. where sequences start + + # Calculate width and height + + my $tmpColumns=$columnWidth; + + $tmpColumns=$length if ($length<$columnWidth); + + my $imageWidth = int(0.9+$namesX+($maxName+$tmpColumns)*$fontWidth+$nameStep+$fontWidth); + $imageWidth += int($fontWidth + length("$length")*$fontWidth) if $resnum; + my $numBlock = int(1+($length-1)/$columnWidth); + my $imageHeight = int($startY+$numBlock*((@aln+1+$ruler)*$lineStep+$blockStep+$consStep+$ssStep)); + + + my $white="1 1 1"; + my $black="0 0 0" ; + my $grey="1 1 1"; + + my $red="0.0 1"; + my $ocre="0.16 1"; + my $green="0.32 1"; + my $turq="0.48 1"; + my $blue="0.65 1"; + my $violet="0.81 1"; + + + my $red1="0.0 0.6"; + my $ocre1="0.16 0.6"; + my $green1="0.32 0.6"; + my $turq1="0.48 0.6"; + my $blue1="0.65 0.6"; + my $violet1="0.81 0.6"; + + my $red2="0.0 0.2"; + my $ocre2="0.16 0.2"; + my $green2="0.32 0.2"; + my $turq2="0.48 0.2"; + my $blue2="0.65 0.2"; + my $violet2="0.81 0.2"; + + + my @colorMatrix=([$red,$red1,$red2], + [$ocre,$ocre1,$ocre2], + [$green,$green1,$green2], + [$turq,$turq1,$turq2], + [$blue,$blue1,$blue2], + [$violet,$violet1,$violet2]); + + my @pairs=getPairs(\@aln,$ss); + + foreach my $pair (@pairs) { + + foreach my $column ($pair->{open},$pair->{close}) { + my $block = int(0.999+($column+1)/$columnWidth); + my $effCol=$column-($block-1)*$columnWidth; + my $x=$seqsX+$effCol*$fontWidth; + + foreach my $row (0..$#aln) { + + my $pairing=@{$pair->{pairing}}; + my $nonpairing=@{$pair->{nonpairing}}; + + my $color; + + if ($nonpairing <=2) { + $color=$colorMatrix[$pairing-1][$nonpairing]; + + if (($pair->{all}->[$row] eq 'AU') or + ($pair->{all}->[$row] eq 'UA') or + ($pair->{all}->[$row] eq 'GC') or + ($pair->{all}->[$row] eq 'CG') or + ($pair->{all}->[$row] eq 'GU') or + ($pair->{all}->[$row] eq 'UG')) { + + my $y=$startY+($block-1)*($lineStep*(@aln+1+$ruler)+$blockStep+$consStep)+$ssStep*($block)+($row+1)*$lineStep; + + my $xtmp=$x+$fontWidth; + my $ytmp1=$y-1; + my $ytmp2=$y+$fontHeight+1; + $ps.="$x $ytmp1 $xtmp $ytmp2 $color box\n"; + } + } + } + } + } + # Calculate conservation scores as (M+1)/(N+1), where M is the + # number of matches to the consensus and N is the number of + # sequences in the alignment + + my @conservation=(); # each column one score + + for my $column (0..$length-1) { + + my $consChar=substr($consensus,$column,1); + + # if consensus is gap, score=0 + if ($consChar eq "-" or $consChar eq "_") { + push @conservation, 0; + next; + } + + my $match=0; + for my $row (0..$#aln) { + my $currChar=substr($aln[$row]->{seq},$column,1); + $match++ if ($currChar eq $consChar); + } + + my $score=($match-1)/(@aln-1); + push @conservation,$score; + } + + # Draw the alignments in chunks + my $currY=$startY; + my $currPos=0; + + while ($currPos<$length) { + + # secondary structure in first line + #$out->string($font,$seqsX,$currY,substr($ss,$currPos,$columnWidth),$black); + my $tmpSeq=substr($ss,$currPos,$columnWidth); + $tmpSeq=~s/\(/\\\(/g; + $tmpSeq=~s/\)/\\\)/g; + $ps .= "0 setgray\n"; + $ps.="($tmpSeq) $seqsX $currY string\n"; + $currY+=$lineStep+$ssStep; + + # sequences labeled only with the organism-specifier + foreach my $row (@aln) { + #$out->string($font,$namesX,$currY,$row->{name},$black); + $ps.="($row->{name}) $namesX $currY string\n"; + #$out->string($font,$seqsX,$currY,substr($row->{seq},$currPos,$columnWidth),$black); + my $tmpSeq=substr($row->{seq},$currPos,$columnWidth); + $ps.="($tmpSeq) $seqsX $currY string\n"; + if ($resnum) { + my $numX=$seqsX+(length($tmpSeq)+1)*$fontWidth; # x-coord. where residue nums start + $tmpSeq = substr($row->{seq},0, $currPos+$columnWidth); + my $p = length($tmpSeq) - $tmpSeq =~ tr/_-//; # remove gaps + $ps .= sprintf("(%" . length($length) . "d) $numX $currY string\n", $p); + } + $currY+=$lineStep; + } + if ($ruler) { + $ps .= "(ruler) $namesX $currY string\n"; + my $p = 10 * (int(1.5+$currPos/10)); + my $r = "." x ($p-$currPos-length("$p")); + $r .= $p; $p+=10; + my $maxp = $currPos+$columnWidth; + $maxp = $length if $maxp>$length; + for (;$p<=$maxp; $p+=10) { + $r .= sprintf("%10d", $p); + } + $r =~ tr/ /./; + $r .= "." x ($maxp-$p+10) if $p>$maxp; + $ps .= "($r) $seqsX $currY string\n"; + $currY+=$lineStep; + } + + # conservation curve + $currY+=$consStep; + $ps .= "0.6 setgray\n"; + for my $col ($currPos..$currPos+$columnWidth-1) { + my $score=shift @conservation; + last if (!defined $score); + my $barHeight=$maxConsBar*$score; + $barHeight=1 if ($barHeight==0); + my $x=$seqsX+($col-($columnWidth*int($currPos/$columnWidth)))*$fontWidth; + + my $ytmp1=$currY+$maxConsBar-$barHeight; + my $xtmp2=$x+$fontWidth; + my $ytmp2=$currY+$maxConsBar; + $ps .= "$x $ytmp1 $xtmp2 $ytmp2 box2\n"; + } + $currY+=$blockStep; + $currPos+=$columnWidth; + } + + my $BB="2 2 $imageWidth $imageHeight"; + + while () { + s/!BB!/$BB/; + s/!HEIGHT!/$imageHeight/; + print; + } + print $ps; + print "showpage\n"; +} + +=head1 NAME + +coloraln.pl - colorize an alignment with consensus structure + +=head1 SYNOPSIS + + coloraln.pl [-r] [-n] [-c columns] [-s structure.ps] file.aln + +=head1 DESCRIPTION + +colorrna.pl reads an alignment in CLUSTAL format, and a consensus +secondary structure (which it extracts from a postscript secondary +structure plot). It produces a postscript figure of the alignment, in +which compensatory mutation supporting the consensus structure are +marked by color. The color scheme is the same employed by RNAalifold +and alidot: Red marks pairs with no sequence variation; ochre, green, +turquoise, blue, and violet mark pairs with 2,3,4,5,6 different tpyes +of pairs, respectively. + +=head1 OPTIONS + +=over 4 + +=item B<-s> I + +read I to extract the consensus structure (default: C) + +=item B<-c> I + +break alignments into blocks of at most I columns, (default: 120) + +=item B<-t> + +suppress conversion of C to C, i.e. do not convert DNA to +RNA, (default: convert to C) + +=item B<-r> + +add a "ruler" below the alignment showing sequence positions + +=item B<-n> + +write sequence position at the end of each sequence line + +=back + +=cut + + +__DATA__ +%!PS-Adobe-3.0 EPSF-3.0 +%%BoundingBox: !BB! +%%EndComments + +% draws Vienna RNA like colored boxes +/box { % x1 y1 x2 y2 hue saturation + gsave + dup 0.3 mul 1 exch sub sethsbcolor + exch 3 index sub exch 2 index sub rectfill + grestore +} def + +% draws a box in current color +/box2 { % x1 y1 x2 y2 + exch 3 index sub exch 2 index sub rectfill +} def + +/string { % (Text) x y + 6 add + moveto + show +} def + +0 !HEIGHT! translate +1 -1 scale +/Courier findfont +[10 0 0 -10 0 0] makefont setfont + +%100 100 106 110 0.5 0.5 1 box +%(A) 100 100 string +%(This is a string) 200 200 string