binaries/src/ViennaRNA/Utils/refold.pl

   1 #!/usr/bin/perl -w
   2 # -*-Perl-*-
   3 # Last changed Time-stamp: <2006-08-15 22:04:27 ivo>
   4 # after predicting a conensus structure using RNAalifold,
   5 # refold the indivual sequences, using the consensus structure as constraint
   6
   7 # read a clustal fromat alignment, plus either the standart output of
   8 # RNAalifold or a dot plot produced by "RNAalifold -p"
   9
  10 # usage refold.pl [-t thresh] clustal.aln alidot.ps
  11 # or    refold.pl clustal.aln clustal.alifold
  12
  13 use Getopt::Long;
  14 #use POSIX;
  15 #use GD;
  16 use strict;
  17
  18 my $thresh = 0.9;
  19 my $help = 0;
  20
  21 GetOptions ('-t=f' => \$thresh,
  22             "help"=>\$help,
  23             "h"=>\$help,
  24             );
  25
  26 if ($help){
  27   print "\  refold.pl [-t threshold] myseqs.aln alidot.ps | RNAfold -C\n";
  28   print "or refold.pl myseqs.aln myseqs.alifold | RNAfold -C\n";
  29   print " -t ... use only pairs with p>thresh (default: 0.9)\n\n";
  30   exit(1);
  31 }
  32
  33 my $aln = readClustal();
  34 my $len = length($aln->[0]->{seq});
  35
  36 $_ = <>;
  37 #print STDERR "$_\n";
  38 my $constraint = (/^%\!PS/) ? read_dot() : read_alifold();
  39
  40 #print STDERR "$constraint\n";
  41
  42 foreach my $a (@$aln) {
  43     print "> $a->{name}\n";
  44
  45     # remove gaps
  46     my $seq = $a->{seq};
  47     my $cons = $constraint;
  48     my @pt = make_pair_table($cons);
  49     my %pair = ("AU" => 5,
  50                 "GC" => 1,
  51                 "CG" => 2,
  52                 "UA" => 6,
  53                 "GU" => 3,
  54                 "GT" => 3,
  55                 "TG" => 4,
  56                 "UG" => 4,
  57                 "AT" => 5,
  58                 "TA" => 6);
  59
  60
  61     for my $p (0..length($seq)-1) {
  62         # open non-compatible pairs as well as pairs to a gap position
  63         my $c = substr($seq,$p,1);
  64         if ($c eq '-') {
  65             substr($cons,$p,1) = 'x'; # mark for removal
  66             substr($cons,$pt[$p],1) = '.'
  67                 if $pt[$p]>0  && substr($cons,$pt[$p],1) ne 'x';   # open pair
  68         }
  69         elsif ($pt[$p]>$p) {
  70             substr($cons,$p,1) = substr($cons,$pt[$p],1) = '.'
  71                 unless exists $pair{$c . substr($seq,$pt[$p],1)};
  72         }
  73     }
  74 #    print STDERR length($seq), length($cons), "\n";
  75     $cons =~ s/x//g;
  76     $seq  =~ s/-//g;
  77 #    print STDERR length($seq), length($cons), "\n";
  78     print "$seq\n$cons\n";
  79 }
  80
  81
  82 sub make_pair_table {
  83    #indices start at 0 in this version!
  84    my $structure = shift;
  85    my (@olist, @table);
  86    my ($hx,$i) = (0,0);
  87
  88    foreach my $c (split(//,$structure)) {
  89        if ($c eq '.') {
  90           $table[$i]= -1;
  91      } elsif ($c eq '(') {
  92           $olist[$hx++]=$i;
  93      } elsif ($c eq ')') {
  94          my $j = $olist[--$hx];
  95          die ("unbalanced brackets in make_pair_table") if ($hx<0);
  96          $table[$i]=$j;
  97          $table[$j]=$i;
  98      }
  99       $i++;
 100    }
 101    die ("too few closed brackets in make_pair_table") if ($hx!=0);
 102    return @table;
 103 }
 104
 105 sub read_alifold {
 106     while (<>) {
 107         return $1 if /^([(.)]+)/;
 108     }
 109 }
 110
 111 sub read_dot {
 112     my $cons = '.' x $len;
 113     while(<>) {
 114         next if /^%/;
 115         next unless /lbox$/;
 116         my @F = split;
 117         next if $F[5] < $thresh;
 118         substr($cons,$F[3]-1,1) = '(';
 119         substr($cons,$F[4]-1,1) = ')';
 120     }
 121     return $cons;
 122 }
 123
 124 ######################################################################
 125 #
 126 # readClustal(filehandle)
 127 #
 128 # Reads Clustal W formatted alignment file and returns it in list of
 129 # hash references with keys "name" and "seq" for the name and the sequence,
 130 # resp.
 131 #
 132 # Fixme: the code below assumes all uppercase sequences
 133 ######################################################################
 134
 135 sub readClustal{
 136 #  my $fh=shift;
 137   my @out=();
 138   my (%order, $order, %alignments);
 139   while(<>) {
 140         last if eof;
 141         next if ( /^\s+$/ );
 142         my ($seqname, $aln_line) = ('', '');
 143         if( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) {
 144           # clustal 1.4 format
 145           ($seqname,$aln_line) = ("$1/$2-$3",$4);
 146         } elsif( /^(\S+)\s+([A-Z\-]+)\s*$/ ) {
 147           ($seqname,$aln_line) = ($1,$2);
 148         } else {
 149           next;
 150   }
 151         if( !exists $order{$seqname} ) {
 152           $order{$seqname} = $order++;
 153         }
 154         $alignments{$seqname} .= $aln_line;
 155   }
 156
 157   foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
 158         if( $name =~ /(\S+):(\d+)-(\d+)/ ) {
 159           (my $sname,my $start, my $end) = ($1,$2,$3);
 160         } else {
 161           (my $sname, my $start) = ($name,1);
 162           my $str  = $alignments{$name};
 163           $str =~ s/[^A-Za-z]//g;
 164           my $end = length($str);
 165         }
 166         my $seq=$alignments{$name};
 167         push @out, {name=>$name,seq=>$seq};
 168   }
 169   return [@out];
 170 }
 171
 172 ######################################################################
 173 #
 174 # getPairs(\@aln alnref, $ss string)
 175 #
 176 # Evalutates the pairing of an alignment according to a given
 177 # consensus secondary structure
 178 #
 179 # Returns list of all base pairs which is a hash with the following
 180 # keys:
 181 #
 182 #  open ... column in the alignment of the first base in the pair "("
 183 #  close ... column in the alignment of the second base in the pair ")"
 184 #  all ... list of all basepairs in the different sequences in the alignment
 185 #  pairing ... list of all different pairing basepairs
 186 #  nonpairing ... list of all incompatible basepairs
 187 #
 188 ######################################################################
 189
 190 sub getPairs{
 191
 192   my @inputAln=@{$_[0]};
 193   my $ss=$_[1];
 194
 195   # return nothing if there are no pairs
 196   if (!($ss=~tr/(/(/)){
 197         return ();
 198   }
 199
 200   my @aln=();
 201   foreach my $row (@inputAln){
 202         my $seq=$row->{seq};
 203         $seq=uc($seq);
 204         $seq=~s/T/U/g;
 205         my @tmp=split(//,$seq);
 206         push @aln,\@tmp;
 207   }
 208   my @ss=split(//,$ss);
 209
 210   my @pairs=();
 211   my @stack=();
 212
 213   foreach my $column (0..$#ss){
 214
 215         my $currChar=$ss[$column];
 216
 217         if ($currChar eq '('){
 218           push @stack,$column;
 219         }
 220
 221         if ($currChar eq ')'){
 222           my $openedCol=pop @stack;
 223           push @pairs,{open=>$openedCol,close=>$column};
 224         }
 225   }
 226
 227   @pairs=sort {$a->{open} <=> $b->{open}} @pairs;
 228
 229   foreach my $i (0..$#pairs){
 230         #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n";
 231
 232         my @all=();
 233         my @pairing=();
 234         my @nonpairing=();
 235
 236         for my $j (0..$#aln){
 237           my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}];
 238           push @all,$currPair;
 239         }
 240
 241         for my $pair (@all){
 242           if (($pair eq 'AU') or
 243                   ($pair eq 'UA') or
 244                   ($pair eq 'GC') or
 245                   ($pair eq 'CG') or
 246                   ($pair eq 'UG') or
 247                   ($pair eq 'GU')){
 248
 249                 push @pairing, $pair;
 250           } elsif ($pair eq "--"){
 251                 # do nothing
 252           } else {
 253                 push @nonpairing,$pair;
 254           }
 255         }
 256
 257         undef my %saw;
 258     my @uniquePairing = grep(!$saw{$_}++, @pairing);
 259
 260         $pairs[$i]->{all}=[@all];
 261         $pairs[$i]->{pairing}=[@uniquePairing];
 262         $pairs[$i]->{nonpairing}=[@nonpairing];
 263   }
 264
 265   return @pairs;
 266
 267 }
 268
 269
 270 =head1 NAME
 271
 272 refold.pl - refold using consensus structure as constraint
 273
 274 =head1 SYNOPSIS
 275
 276   refold.pl [-t thresh] file.aln alidot.ps
 277   refold.pl file.aln file.alifold
 278
 279 =head1 DESCRIPTION
 280
 281 refold.pl reads an alignment in CLUSTAL format, and a consensus
 282 secondary structure, either in the form of the standard output of
 283 C<RNAalifold>, or in the form of a dot plot as produced by
 284 C<RNAalifold -p>. For each sequence in the alignment it writes the
 285 name, sequence, and constraint structure to stdout in a format
 286 suitable for piping into C<RNAfold -C>.
 287
 288 The constraint string is produced by removing from the consensus
 289 structure all gaps, as well as all pairs not compatible with the
 290 particular sequence. If the structure is read from a dot plot file,
 291 only pairs with probability > then some threshold (default 0.9) are
 292 used.
 293
 294 =head1 OPTIONS
 295
 296 =over 4
 297
 298 =item B<-t> I<thresh>
 299
 300 use only pairs with p>thresh as constraint. Only applicable when
 301 reading consensus structure from a dot plot.
 302
 303 =back
 304
 305 =head1 EXAMPLE
 306
 307   RNAalifold test.aln > test.alifold
 308   refold.pl test.aln test.alifold > test.cfold
 309   RNAfold -C < test.cfold
 310
 311 =cut