#!/usr/bin/perl -w use Getopt::Long; use Pod::Usage; #use POSIX; #use GD; use strict; my $ssFile='alirna.ps'; my $columnWidth=120; my $formatT=0; my $man=0; my $ruler=0; my $resnum=0; GetOptions ('-s:s' => \$ssFile, '-c:i' => \$columnWidth, '-t' => \$formatT, '-r' => \$ruler, '-n' => \$resnum, "help"=> \$man, "man" => \$man, "h"=> sub {pod2usage(-verbose => 1)}, ) || pod2usage(-verbose => 0); pod2usage(-verbose => 2) if $man; if (!-e $ssFile) { pod2usage(-msg => "RNAalifold secondary structure file $ssFile not found (use option -s)", -verbose => 0); exit(1); } my $aln=readClustal(); my @ss=(); for my $i (1..length($aln->[0]->{seq})) { push @ss,'.'; } my $consStruc=''; open(ALIRNA,"<$ssFile"); my $pairsFlag=0; my $sequenceFlag=0; while () { $pairsFlag=1 if (/\/pairs/); if ($pairsFlag and /\[(\d+) (\d+)\]/) { $ss[$1-1]='('; $ss[$2-1]=')'; } $pairsFlag=0 if ($pairsFlag and /def/); } $consStruc=join('',@ss); for my $row (@$aln) { $row->{seq}=uc($row->{seq}); if ($formatT) { $row->{seq}=~s/U/T/g; } else { $row->{seq}=~s/T/U/g; } } my $consSeq=consensusSeq($aln); #print "$consSeq\n$consStruc\n"; plotAln($aln,$consSeq,$consStruc); ###################################################################### # # consensusSeq(\@aln alnref) # # Returns consensus sequence of alignment # ###################################################################### sub consensusSeq{ my @aln=@{$_[0]}; my $out=''; for my $i (0..length($aln[0]->{seq})-1) { my %countHash=('A'=>0,'C'=>0,'G'=>0,'T'=>0,'U'=>0,'-'=>0); #print %countHash,"\n"; for my $j (0..$#aln) { my $c=substr($aln[$j]->{seq},$i,1); $countHash{$c}++; } #print %countHash,"\n"; my $maxCount=0; my $maxChar=''; for my $c ('A','C','G','T','U','-') { if ($countHash{$c}>=$maxCount) { $maxChar=$c; $maxCount=$countHash{$c}; } } $out.=$maxChar; } return $out; } ###################################################################### # # readClustal(filehandle) # # Reads Clustal W formatted alignment file and returns it in list of # hash references with keys "name" and "seq" for the name and the sequence, # resp. # ###################################################################### sub readClustal{ # my $fh=shift; my @out=(); my (%order, $order, %alignments); while (<>) { next if ( /^\s+$/ ); my ($seqname, $aln_line) = ('', ''); if ( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) { # clustal 1.4 format ($seqname,$aln_line) = ("$1/$2-$3",$4); } elsif ( /^(\S+)\s+([A-Z\-]+)\s*\d*$/ ) { ($seqname,$aln_line) = ($1,$2); } else { next; } if ( !exists $order{$seqname} ) { $order{$seqname} = $order++; } $alignments{$seqname} .= $aln_line; } foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) { if ( $name =~ /(\S+):(\d+)-(\d+)/ ) { (my $sname,my $start, my $end) = ($1,$2,$3); } else { (my $sname, my $start) = ($name,1); my $str = $alignments{$name}; $str =~ s/[^A-Za-z]//g; my $end = length($str); } my $seq=$alignments{$name}; push @out, {name=>$name,seq=>$seq}; } return [@out]; } ###################################################################### # # getPairs(\@aln alnref, $ss string) # # Evalutates the pairing of an alignment according to a given # consensus secondary structure # # Returns list of all base pairs which is a hash with the following # keys: # # open ... column in the alignment of the first base in the pair "(" # close ... column in the alignment of the second base in the pair ")" # all ... list of all basepairs in the different sequences in the alignment # pairing ... list of all different pairing basepairs # nonpairing ... list of all incompatible basepairs # ###################################################################### sub getPairs{ my @inputAln=@{$_[0]}; my $ss=$_[1]; # return nothing if there are no pairs if (!($ss=~tr/(/(/)) { return (); } my @aln=(); foreach my $row (@inputAln) { my $seq=$row->{seq}; $seq=uc($seq); $seq=~s/T/U/g; my @tmp=split(//,$seq); push @aln,\@tmp; } my @ss=split(//,$ss); my @pairs=(); my @stack=(); foreach my $column (0..$#ss) { my $currChar=$ss[$column]; if ($currChar eq '(') { push @stack,$column; } if ($currChar eq ')') { my $openedCol=pop @stack; push @pairs,{open=>$openedCol,close=>$column}; } } @pairs=sort {$a->{open} <=> $b->{open}} @pairs; foreach my $i (0..$#pairs) { #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n"; my @all=(); my @pairing=(); my @nonpairing=(); for my $j (0..$#aln) { my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}]; push @all,$currPair; } for my $pair (@all) { if (($pair eq 'AU') or ($pair eq 'UA') or ($pair eq 'GC') or ($pair eq 'CG') or ($pair eq 'UG') or ($pair eq 'GU')) { push @pairing, $pair; } elsif ($pair eq "--") { # do nothing } else { push @nonpairing,$pair; } } undef my %saw; my @uniquePairing = grep(!$saw{$_}++, @pairing); $pairs[$i]->{all}=[@all]; $pairs[$i]->{pairing}=[@uniquePairing]; $pairs[$i]->{nonpairing}=[@nonpairing]; } return @pairs; } ###################################################################### # # plotAln(\@aln ref-to-alignment, $consensus string, $ss string) # # Creates a image of the alignment with various annotations # # \@aln ... alignment in list of hash format # $consensus ... consensus sequence # $ss ... consensus secondary structure in dot bracket notation # # Returns png as string. # ###################################################################### sub plotAln{ my @aln=@{$_[0]}; my $consensus=$_[1]; my $ss=$_[2]; my $ps=''; # Get important measures (my $fontWidth, my $fontHeight) = (6,6.5); my $length=length($aln[0]->{seq}); my $maxName=0; foreach my $row (@aln) { $maxName=length($row->{name}) if (length($row->{name})>$maxName); } $maxName = 5 if $maxName<5 && $ruler; # Custom Sizes my $lineStep=$fontHeight+2; # distance between lines my $blockStep=3.5*$fontHeight; # distance between blocks my $consStep=$fontHeight*0.5; # distance between alignment and conservation curve my $ssStep=2; my $nameStep=3*$fontWidth; # distance between names and sequences my $maxConsBar=2.5*$fontHeight; # Height of conservation curve my $startY=2; # "y origin" my $namesX=$fontWidth; # "x origin" my $seqsX=$namesX+$maxName*$fontWidth+$nameStep; # x-coord. where sequences start # Calculate width and height my $tmpColumns=$columnWidth; $tmpColumns=$length if ($length<$columnWidth); my $imageWidth = int(0.9+$namesX+($maxName+$tmpColumns)*$fontWidth+$nameStep+$fontWidth); $imageWidth += int($fontWidth + length("$length")*$fontWidth) if $resnum; my $numBlock = int(1+($length-1)/$columnWidth); my $imageHeight = int($startY+$numBlock*((@aln+1+$ruler)*$lineStep+$blockStep+$consStep+$ssStep)); my $white="1 1 1"; my $black="0 0 0" ; my $grey="1 1 1"; my $red="0.0 1"; my $ocre="0.16 1"; my $green="0.32 1"; my $turq="0.48 1"; my $blue="0.65 1"; my $violet="0.81 1"; my $red1="0.0 0.6"; my $ocre1="0.16 0.6"; my $green1="0.32 0.6"; my $turq1="0.48 0.6"; my $blue1="0.65 0.6"; my $violet1="0.81 0.6"; my $red2="0.0 0.2"; my $ocre2="0.16 0.2"; my $green2="0.32 0.2"; my $turq2="0.48 0.2"; my $blue2="0.65 0.2"; my $violet2="0.81 0.2"; my @colorMatrix=([$red,$red1,$red2], [$ocre,$ocre1,$ocre2], [$green,$green1,$green2], [$turq,$turq1,$turq2], [$blue,$blue1,$blue2], [$violet,$violet1,$violet2]); my @pairs=getPairs(\@aln,$ss); foreach my $pair (@pairs) { foreach my $column ($pair->{open},$pair->{close}) { my $block = int(0.999+($column+1)/$columnWidth); my $effCol=$column-($block-1)*$columnWidth; my $x=$seqsX+$effCol*$fontWidth; foreach my $row (0..$#aln) { my $pairing=@{$pair->{pairing}}; my $nonpairing=@{$pair->{nonpairing}}; my $color; if ($nonpairing <=2) { $color=$colorMatrix[$pairing-1][$nonpairing]; if (($pair->{all}->[$row] eq 'AU') or ($pair->{all}->[$row] eq 'UA') or ($pair->{all}->[$row] eq 'GC') or ($pair->{all}->[$row] eq 'CG') or ($pair->{all}->[$row] eq 'GU') or ($pair->{all}->[$row] eq 'UG')) { my $y=$startY+($block-1)*($lineStep*(@aln+1+$ruler)+$blockStep+$consStep)+$ssStep*($block)+($row+1)*$lineStep; my $xtmp=$x+$fontWidth; my $ytmp1=$y-1; my $ytmp2=$y+$fontHeight+1; $ps.="$x $ytmp1 $xtmp $ytmp2 $color box\n"; } } } } } # Calculate conservation scores as (M+1)/(N+1), where M is the # number of matches to the consensus and N is the number of # sequences in the alignment my @conservation=(); # each column one score for my $column (0..$length-1) { my $consChar=substr($consensus,$column,1); # if consensus is gap, score=0 if ($consChar eq "-" or $consChar eq "_") { push @conservation, 0; next; } my $match=0; for my $row (0..$#aln) { my $currChar=substr($aln[$row]->{seq},$column,1); $match++ if ($currChar eq $consChar); } my $score=($match-1)/(@aln-1); push @conservation,$score; } # Draw the alignments in chunks my $currY=$startY; my $currPos=0; while ($currPos<$length) { # secondary structure in first line #$out->string($font,$seqsX,$currY,substr($ss,$currPos,$columnWidth),$black); my $tmpSeq=substr($ss,$currPos,$columnWidth); $tmpSeq=~s/\(/\\\(/g; $tmpSeq=~s/\)/\\\)/g; $ps .= "0 setgray\n"; $ps.="($tmpSeq) $seqsX $currY string\n"; $currY+=$lineStep+$ssStep; # sequences labeled only with the organism-specifier foreach my $row (@aln) { #$out->string($font,$namesX,$currY,$row->{name},$black); $ps.="($row->{name}) $namesX $currY string\n"; #$out->string($font,$seqsX,$currY,substr($row->{seq},$currPos,$columnWidth),$black); my $tmpSeq=substr($row->{seq},$currPos,$columnWidth); $ps.="($tmpSeq) $seqsX $currY string\n"; if ($resnum) { my $numX=$seqsX+(length($tmpSeq)+1)*$fontWidth; # x-coord. where residue nums start $tmpSeq = substr($row->{seq},0, $currPos+$columnWidth); my $p = length($tmpSeq) - $tmpSeq =~ tr/_-//; # remove gaps $ps .= sprintf("(%" . length($length) . "d) $numX $currY string\n", $p); } $currY+=$lineStep; } if ($ruler) { $ps .= "(ruler) $namesX $currY string\n"; my $p = 10 * (int(1.5+$currPos/10)); my $r = "." x ($p-$currPos-length("$p")); $r .= $p; $p+=10; my $maxp = $currPos+$columnWidth; $maxp = $length if $maxp>$length; for (;$p<=$maxp; $p+=10) { $r .= sprintf("%10d", $p); } $r =~ tr/ /./; $r .= "." x ($maxp-$p+10) if $p>$maxp; $ps .= "($r) $seqsX $currY string\n"; $currY+=$lineStep; } # conservation curve $currY+=$consStep; $ps .= "0.6 setgray\n"; for my $col ($currPos..$currPos+$columnWidth-1) { my $score=shift @conservation; last if (!defined $score); my $barHeight=$maxConsBar*$score; $barHeight=1 if ($barHeight==0); my $x=$seqsX+($col-($columnWidth*int($currPos/$columnWidth)))*$fontWidth; my $ytmp1=$currY+$maxConsBar-$barHeight; my $xtmp2=$x+$fontWidth; my $ytmp2=$currY+$maxConsBar; $ps .= "$x $ytmp1 $xtmp2 $ytmp2 box2\n"; } $currY+=$blockStep; $currPos+=$columnWidth; } my $BB="2 2 $imageWidth $imageHeight"; while () { s/!BB!/$BB/; s/!HEIGHT!/$imageHeight/; print; } print $ps; print "showpage\n"; } =head1 NAME coloraln.pl - colorize an alignment with consensus structure =head1 SYNOPSIS coloraln.pl [-r] [-n] [-c columns] [-s structure.ps] file.aln =head1 DESCRIPTION colorrna.pl reads an alignment in CLUSTAL format, and a consensus secondary structure (which it extracts from a postscript secondary structure plot). It produces a postscript figure of the alignment, in which compensatory mutation supporting the consensus structure are marked by color. The color scheme is the same employed by RNAalifold and alidot: Red marks pairs with no sequence variation; ochre, green, turquoise, blue, and violet mark pairs with 2,3,4,5,6 different tpyes of pairs, respectively. =head1 OPTIONS =over 4 =item B<-s> I read I to extract the consensus structure (default: C) =item B<-c> I break alignments into blocks of at most I columns, (default: 120) =item B<-t> suppress conversion of C to C, i.e. do not convert DNA to RNA, (default: convert to C) =item B<-r> add a "ruler" below the alignment showing sequence positions =item B<-n> write sequence position at the end of each sequence line =back =cut __DATA__ %!PS-Adobe-3.0 EPSF-3.0 %%BoundingBox: !BB! %%EndComments % draws Vienna RNA like colored boxes /box { % x1 y1 x2 y2 hue saturation gsave dup 0.3 mul 1 exch sub sethsbcolor exch 3 index sub exch 2 index sub rectfill grestore } def % draws a box in current color /box2 { % x1 y1 x2 y2 exch 3 index sub exch 2 index sub rectfill } def /string { % (Text) x y 6 add moveto show } def 0 !HEIGHT! translate 1 -1 scale /Courier findfont [10 0 0 -10 0 0] makefont setfont %100 100 106 110 0.5 0.5 1 box %(A) 100 100 string %(This is a string) 200 200 string