3 # Last changed Time-stamp: <2006-08-15 22:04:27 ivo>
4 # after predicting a conensus structure using RNAalifold,
5 # refold the indivual sequences, using the consensus structure as constraint
7 # read a clustal fromat alignment, plus either the standart output of
8 # RNAalifold or a dot plot produced by "RNAalifold -p"
10 # usage refold.pl [-t thresh] clustal.aln alidot.ps
11 # or refold.pl clustal.aln clustal.alifold
21 GetOptions ('-t=f' => \$thresh,
27 print "\ refold.pl [-t threshold] myseqs.aln alidot.ps | RNAfold -C\n";
28 print "or refold.pl myseqs.aln myseqs.alifold | RNAfold -C\n";
29 print " -t ... use only pairs with p>thresh (default: 0.9)\n\n";
33 my $aln = readClustal();
34 my $len = length($aln->[0]->{seq});
38 my $constraint = (/^%\!PS/) ? read_dot() : read_alifold();
40 #print STDERR "$constraint\n";
42 foreach my $a (@$aln) {
43 print "> $a->{name}\n";
47 my $cons = $constraint;
48 my @pt = make_pair_table($cons);
49 my %pair = ("AU" => 5,
61 for my $p (0..length($seq)-1) {
62 # open non-compatible pairs as well as pairs to a gap position
63 my $c = substr($seq,$p,1);
65 substr($cons,$p,1) = 'x'; # mark for removal
66 substr($cons,$pt[$p],1) = '.'
67 if $pt[$p]>0 && substr($cons,$pt[$p],1) ne 'x'; # open pair
70 substr($cons,$p,1) = substr($cons,$pt[$p],1) = '.'
71 unless exists $pair{$c . substr($seq,$pt[$p],1)};
74 # print STDERR length($seq), length($cons), "\n";
77 # print STDERR length($seq), length($cons), "\n";
78 print "$seq\n$cons\n";
83 #indices start at 0 in this version!
84 my $structure = shift;
88 foreach my $c (split(//,$structure)) {
94 my $j = $olist[--$hx];
95 die ("unbalanced brackets in make_pair_table") if ($hx<0);
101 die ("too few closed brackets in make_pair_table") if ($hx!=0);
107 return $1 if /^([(.)]+)/;
112 my $cons = '.' x $len;
117 next if $F[5] < $thresh;
118 substr($cons,$F[3]-1,1) = '(';
119 substr($cons,$F[4]-1,1) = ')';
124 ######################################################################
126 # readClustal(filehandle)
128 # Reads Clustal W formatted alignment file and returns it in list of
129 # hash references with keys "name" and "seq" for the name and the sequence,
132 # Fixme: the code below assumes all uppercase sequences
133 ######################################################################
138 my (%order, $order, %alignments);
142 my ($seqname, $aln_line) = ('', '');
143 if( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) {
145 ($seqname,$aln_line) = ("$1/$2-$3",$4);
146 } elsif( /^(\S+)\s+([A-Z\-]+)\s*$/ ) {
147 ($seqname,$aln_line) = ($1,$2);
151 if( !exists $order{$seqname} ) {
152 $order{$seqname} = $order++;
154 $alignments{$seqname} .= $aln_line;
157 foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
158 if( $name =~ /(\S+):(\d+)-(\d+)/ ) {
159 (my $sname,my $start, my $end) = ($1,$2,$3);
161 (my $sname, my $start) = ($name,1);
162 my $str = $alignments{$name};
163 $str =~ s/[^A-Za-z]//g;
164 my $end = length($str);
166 my $seq=$alignments{$name};
167 push @out, {name=>$name,seq=>$seq};
172 ######################################################################
174 # getPairs(\@aln alnref, $ss string)
176 # Evalutates the pairing of an alignment according to a given
177 # consensus secondary structure
179 # Returns list of all base pairs which is a hash with the following
182 # open ... column in the alignment of the first base in the pair "("
183 # close ... column in the alignment of the second base in the pair ")"
184 # all ... list of all basepairs in the different sequences in the alignment
185 # pairing ... list of all different pairing basepairs
186 # nonpairing ... list of all incompatible basepairs
188 ######################################################################
192 my @inputAln=@{$_[0]};
195 # return nothing if there are no pairs
196 if (!($ss=~tr/(/(/)){
201 foreach my $row (@inputAln){
205 my @tmp=split(//,$seq);
208 my @ss=split(//,$ss);
213 foreach my $column (0..$#ss){
215 my $currChar=$ss[$column];
217 if ($currChar eq '('){
221 if ($currChar eq ')'){
222 my $openedCol=pop @stack;
223 push @pairs,{open=>$openedCol,close=>$column};
227 @pairs=sort {$a->{open} <=> $b->{open}} @pairs;
229 foreach my $i (0..$#pairs){
230 #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n";
236 for my $j (0..$#aln){
237 my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}];
242 if (($pair eq 'AU') or
249 push @pairing, $pair;
250 } elsif ($pair eq "--"){
253 push @nonpairing,$pair;
258 my @uniquePairing = grep(!$saw{$_}++, @pairing);
260 $pairs[$i]->{all}=[@all];
261 $pairs[$i]->{pairing}=[@uniquePairing];
262 $pairs[$i]->{nonpairing}=[@nonpairing];
272 refold.pl - refold using consensus structure as constraint
276 refold.pl [-t thresh] file.aln alidot.ps
277 refold.pl file.aln file.alifold
281 refold.pl reads an alignment in CLUSTAL format, and a consensus
282 secondary structure, either in the form of the standard output of
283 C<RNAalifold>, or in the form of a dot plot as produced by
284 C<RNAalifold -p>. For each sequence in the alignment it writes the
285 name, sequence, and constraint structure to stdout in a format
286 suitable for piping into C<RNAfold -C>.
288 The constraint string is produced by removing from the consensus
289 structure all gaps, as well as all pairs not compatible with the
290 particular sequence. If the structure is read from a dot plot file,
291 only pairs with probability > then some threshold (default 0.9) are
298 =item B<-t> I<thresh>
300 use only pairs with p>thresh as constraint. Only applicable when
301 reading consensus structure from a dot plot.
307 RNAalifold test.aln > test.alifold
308 refold.pl test.aln test.alifold > test.cfold
309 RNAfold -C < test.cfold