#!/usr/bin/perl -w # -*-Perl-*- # Last changed Time-stamp: <2006-08-15 22:04:27 ivo> # after predicting a conensus structure using RNAalifold, # refold the indivual sequences, using the consensus structure as constraint # read a clustal fromat alignment, plus either the standart output of # RNAalifold or a dot plot produced by "RNAalifold -p" # usage refold.pl [-t thresh] clustal.aln alidot.ps # or refold.pl clustal.aln clustal.alifold use Getopt::Long; #use POSIX; #use GD; use strict; my $thresh = 0.9; my $help = 0; GetOptions ('-t=f' => \$thresh, "help"=>\$help, "h"=>\$help, ); if ($help){ print "\ refold.pl [-t threshold] myseqs.aln alidot.ps | RNAfold -C\n"; print "or refold.pl myseqs.aln myseqs.alifold | RNAfold -C\n"; print " -t ... use only pairs with p>thresh (default: 0.9)\n\n"; exit(1); } my $aln = readClustal(); my $len = length($aln->[0]->{seq}); $_ = <>; #print STDERR "$_\n"; my $constraint = (/^%\!PS/) ? read_dot() : read_alifold(); #print STDERR "$constraint\n"; foreach my $a (@$aln) { print "> $a->{name}\n"; # remove gaps my $seq = $a->{seq}; my $cons = $constraint; my @pt = make_pair_table($cons); my %pair = ("AU" => 5, "GC" => 1, "CG" => 2, "UA" => 6, "GU" => 3, "GT" => 3, "TG" => 4, "UG" => 4, "AT" => 5, "TA" => 6); for my $p (0..length($seq)-1) { # open non-compatible pairs as well as pairs to a gap position my $c = substr($seq,$p,1); if ($c eq '-') { substr($cons,$p,1) = 'x'; # mark for removal substr($cons,$pt[$p],1) = '.' if $pt[$p]>0 && substr($cons,$pt[$p],1) ne 'x'; # open pair } elsif ($pt[$p]>$p) { substr($cons,$p,1) = substr($cons,$pt[$p],1) = '.' unless exists $pair{$c . substr($seq,$pt[$p],1)}; } } # print STDERR length($seq), length($cons), "\n"; $cons =~ s/x//g; $seq =~ s/-//g; # print STDERR length($seq), length($cons), "\n"; print "$seq\n$cons\n"; } sub make_pair_table { #indices start at 0 in this version! my $structure = shift; my (@olist, @table); my ($hx,$i) = (0,0); foreach my $c (split(//,$structure)) { if ($c eq '.') { $table[$i]= -1; } elsif ($c eq '(') { $olist[$hx++]=$i; } elsif ($c eq ')') { my $j = $olist[--$hx]; die ("unbalanced brackets in make_pair_table") if ($hx<0); $table[$i]=$j; $table[$j]=$i; } $i++; } die ("too few closed brackets in make_pair_table") if ($hx!=0); return @table; } sub read_alifold { while (<>) { return $1 if /^([(.)]+)/; } } sub read_dot { my $cons = '.' x $len; while(<>) { next if /^%/; next unless /lbox$/; my @F = split; next if $F[5] < $thresh; substr($cons,$F[3]-1,1) = '('; substr($cons,$F[4]-1,1) = ')'; } return $cons; } ###################################################################### # # readClustal(filehandle) # # Reads Clustal W formatted alignment file and returns it in list of # hash references with keys "name" and "seq" for the name and the sequence, # resp. # # Fixme: the code below assumes all uppercase sequences ###################################################################### sub readClustal{ # my $fh=shift; my @out=(); my (%order, $order, %alignments); while(<>) { last if eof; next if ( /^\s+$/ ); my ($seqname, $aln_line) = ('', ''); if( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) { # clustal 1.4 format ($seqname,$aln_line) = ("$1/$2-$3",$4); } elsif( /^(\S+)\s+([A-Z\-]+)\s*$/ ) { ($seqname,$aln_line) = ($1,$2); } else { next; } if( !exists $order{$seqname} ) { $order{$seqname} = $order++; } $alignments{$seqname} .= $aln_line; } foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) { if( $name =~ /(\S+):(\d+)-(\d+)/ ) { (my $sname,my $start, my $end) = ($1,$2,$3); } else { (my $sname, my $start) = ($name,1); my $str = $alignments{$name}; $str =~ s/[^A-Za-z]//g; my $end = length($str); } my $seq=$alignments{$name}; push @out, {name=>$name,seq=>$seq}; } return [@out]; } ###################################################################### # # getPairs(\@aln alnref, $ss string) # # Evalutates the pairing of an alignment according to a given # consensus secondary structure # # Returns list of all base pairs which is a hash with the following # keys: # # open ... column in the alignment of the first base in the pair "(" # close ... column in the alignment of the second base in the pair ")" # all ... list of all basepairs in the different sequences in the alignment # pairing ... list of all different pairing basepairs # nonpairing ... list of all incompatible basepairs # ###################################################################### sub getPairs{ my @inputAln=@{$_[0]}; my $ss=$_[1]; # return nothing if there are no pairs if (!($ss=~tr/(/(/)){ return (); } my @aln=(); foreach my $row (@inputAln){ my $seq=$row->{seq}; $seq=uc($seq); $seq=~s/T/U/g; my @tmp=split(//,$seq); push @aln,\@tmp; } my @ss=split(//,$ss); my @pairs=(); my @stack=(); foreach my $column (0..$#ss){ my $currChar=$ss[$column]; if ($currChar eq '('){ push @stack,$column; } if ($currChar eq ')'){ my $openedCol=pop @stack; push @pairs,{open=>$openedCol,close=>$column}; } } @pairs=sort {$a->{open} <=> $b->{open}} @pairs; foreach my $i (0..$#pairs){ #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n"; my @all=(); my @pairing=(); my @nonpairing=(); for my $j (0..$#aln){ my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}]; push @all,$currPair; } for my $pair (@all){ if (($pair eq 'AU') or ($pair eq 'UA') or ($pair eq 'GC') or ($pair eq 'CG') or ($pair eq 'UG') or ($pair eq 'GU')){ push @pairing, $pair; } elsif ($pair eq "--"){ # do nothing } else { push @nonpairing,$pair; } } undef my %saw; my @uniquePairing = grep(!$saw{$_}++, @pairing); $pairs[$i]->{all}=[@all]; $pairs[$i]->{pairing}=[@uniquePairing]; $pairs[$i]->{nonpairing}=[@nonpairing]; } return @pairs; } =head1 NAME refold.pl - refold using consensus structure as constraint =head1 SYNOPSIS refold.pl [-t thresh] file.aln alidot.ps refold.pl file.aln file.alifold =head1 DESCRIPTION refold.pl reads an alignment in CLUSTAL format, and a consensus secondary structure, either in the form of the standard output of C, or in the form of a dot plot as produced by C. For each sequence in the alignment it writes the name, sequence, and constraint structure to stdout in a format suitable for piping into C. The constraint string is produced by removing from the consensus structure all gaps, as well as all pairs not compatible with the particular sequence. If the structure is read from a dot plot file, only pairs with probability > then some threshold (default 0.9) are used. =head1 OPTIONS =over 4 =item B<-t> I use only pairs with p>thresh as constraint. Only applicable when reading consensus structure from a dot plot. =back =head1 EXAMPLE RNAalifold test.aln > test.alifold refold.pl test.aln test.alifold > test.cfold RNAfold -C < test.cfold =cut