#!/usr/bin/perl -w
# -*-Perl-*-
# Last changed Time-stamp: <2011-05-27 17:10:13 stef>
# tool for the design of bistable RNA molecules


# additional paths were perl looks for RNA.pm and RNA/barriers.pm
# use lib '/home/RNA/barriers';

use RNA;
use Getopt::Long;
use strict;
use vars qw/$opt_debug $opt_v $ParamFile/;
my $sE;
my %pair = ("AU", 5, "GC", 1, "CG", 2, "GU", 3, "UG", 4, "UA", 6);
my %mut = ('A' => 'G', 'G' => 'A', 'U' => 'C', 'C' => 'U');
my %wc = ('A' => 'U', 'G' => 'C', 'U' => 'A', 'C' => 'G');

my ($fist, $sest, $startseq, $cons);
my $noo = 1;
my $bar;
my $nom = 2000000;
my $border = 30000;
my $small = 0.3;
my $Temperature1 = 37;
my $Temperature2;
my ($optseq, $optcost, $e);
my @fibo = (0,1);
my %cos;
my $cpnt= -1;
my $dg = 0;

 Getopt::Long::config('no_ignore_case');
&usage() unless GetOptions("T=f" => \$Temperature1,
                           "T2=f" => \$Temperature2,
                           "4" => sub {$RNA::tetra_loop = 0},
                           "d|d0" => sub {$RNA::dangles=0},
                           "d2" => sub {$RNA::dangles=2},,
                           "debug",
                           "noGU" => \$RNA::noGU,
                           "noCloseGU" => \$RNA::no_closingGU,
                           "noLP" => \$RNA::noLonelyPairs,
                           "P=s" => \$ParamFile,
                           "n=i" => \$noo,
                           "trial=i" => \$nom,
                           "bar=f" => \$bar,
                           "g=f" => \$small,
                           "dG=f" => \$dg,
                           "v");

$RNA::temperature = $Temperature1;
$Temperature2 = $Temperature1 unless defined $Temperature2;

$noo = ($noo >= 1) ? $noo : 1;
$nom = ($nom >= 1) ? $nom : 1000;
$border = ($border >=1) ? $border : 100;
my $interactiv = init_ia(0);
my $ia = 1 if -t STDIN && -t STDOUT && $#ARGV < 0;
 RNA::read_parameter_file($ParamFile) if ($ParamFile);
 RNA::init_rand();
# don't call update at this stage since no sequence is known yet
#if ($cpnt != -1) { RNA::update_cofold_params() } else {  RNA::update_fold_params() }
 srand();

for(;;) { # main loop
   $interactiv->() if $ia;
   last if !process_input();

   my @fist = make_pair_table ($fist);
   my @sest = make_pair_table ($sest);

   fibo(length($fist)+2);

   my @clist = make_cycle_table (\@fist, \@sest);
   $border = estimate_border(\@clist);
   print_lol->(\@clist) if $opt_debug;
   my $nocs = calc_number_of_seq(\@clist);
   print "$nocs possible sequences\n" if ($opt_v);
   # make optimization
   for (1..$noo) {
      $startseq = make_compatible(@clist, ' ' x length($fist)); # unless $startseq;
      $optseq   = do_opti($nom, $startseq, \@clist);
      $optcost  = cost_function($optseq);
      substr($optseq, $cpnt-1, 0, '&') if ($cpnt != -1);
      $cos{$optseq}[1]++;
      $cos{$optseq}[0] = $optcost;
     printf "%s %8.4f\n", $optseq, $optcost;
   }

   print "the resulting sequences are:\n";
   foreach $e (sort {$cos{$b}[0]<=>$cos{$a}[0]} keys %cos) {
      printf "%s %8.4f",  $e, $cos{$e}[0];
      printf " %2d", $cos{$e}[1] if ($cos{$e}[1]>1);
      print "\n";
   }
   #print_statistics();
}

sub estimate_border {
   my @clist = @{$_[0]};
   my $max_cyc_len = 0;
   my $border = 0;
   for (@clist) {
      $max_cyc_len = $#{$_}+1 if  $#{$_}+1 > $max_cyc_len;
      $border += calc_number_of_seq([$_])-1;
   }
   printf "max cyclen is %4d border is %5d\n", $max_cyc_len, $border
       if $opt_debug;
   return $border;
}

sub print_lol {
   my @lol = @{$_[0]};
   for (@lol) {
      $, = ' ';
      print @{$_}, "\n";
      $, = '';
   }
}

sub fibo {
   my $length = shift;
   while ($#fibo < $length) {
      push @fibo, $fibo[-1] + $fibo[-2];
   }
   return $fibo[$length];
}


sub process_input {
   $_ = <>;
   return 0 if !defined $_;
   chomp;
   $fist = $_;
   return 0 if $fist eq '@';
   chomp($_ = <>);
   $sest = $_;
   $cpnt = index($fist, "&")+1;
 die "ERROR: Different Cut-Points set!\n"
     if ($cpnt != index($sest, "&")+1);
   if ($cpnt) {
       $fist =~ s/&//g;
       $sest =~ s/&//g;
       $RNA::cut_point = $cpnt;
   } else { $cpnt = -1 }
   die "structures have unequal length"
       if (length($fist) != length($sest));
   chomp($_ = <>);
   $_ .= 'N' x (length($fist)-length); # pad with N
#   print "$_\n";
   $cons = uc $_;
   # all init functions are deprecated since ViennaRNA 2.0 
   #if ($cpnt != -1) { &RNA::init_co_pf_fold(length($fist)); } else { &RNA::init_pf_fold(length($fist)); }
   return 1;
}

sub init_ia {
   my $times = undef;
   return sub {
      if (!defined($times)) {
         print "Input structure1 structure2 & start string\n";
         print "    @ to quit, enter for random start string\n";
      }
      print "T1= $Temperature1,  T2= $Temperature2";
      $bar ? print "  Barrier=$bar\n": print "\n";
      for (1..8) { print '....,....', $_;}
      print "\n";
      $times=1;
   }
}


sub make_pair_table {
   use integer;
   # let table[i]=j if (i,j) is pair, -1 if i unpaired
   # indices start at 0 in this version!
   my($str) = shift(@_);
   my($i,$j,$hx,$c,@olist,@table);
   $hx=$i=0;
   foreach $c (split(//,$str)) {
      if ($c eq '.') {
         $table[$i]= -1;
      } elsif ($c eq '(') {
         $olist[$hx++]=$i;
      } elsif ($c eq ')') {
         $j = $olist[--$hx];
         die ("unbalanced brackets in make_pair_table") if ($hx<0);
         $table[$i]=$j;
         $table[$j]=$i;
      }
      $i++;
   }
   die ("too few closed brackets in make_pair_table") if ($hx!=0);
   return @table;
}

sub make_cycle_table {
   use integer;
   my @fist = @{$_[0]};
   my @sest = @{$_[1]};
   my (@clist,@seen);
   for my $i (0..$#fist) {
      my @cycle1 = ();
      next if ($seen[$i]);
      push @cycle1, $i;
      $seen[$i]=1;
      my $j = $fist[$i];
      while ($j>=0 && !$seen[$j]) {
         push @cycle1, $j;
         $seen[$j] = 1;
         $j = ($#cycle1 % 2) ? $sest[$j] : $fist[$j];
      }
      $j = $sest[$i];
      my @cycle2 = ();
      while ($j>=0 && !$seen[$j]) {
         unshift @cycle2, $j;
         $seen[$j] = 1;
         $j = ($#cycle2 % 2) ? $sest[$j] : $fist[$j];
      }
      # duplicate first element if closed cycle
      unshift @cycle2, $j if ($j!=-1);
      push @clist, [@cycle2, @cycle1];
   }
   return @clist;
}

sub make_compatible {
   # make random sequence or randomly mutate sequence
   # by replacing all @components by random paths
   my $startseq = pop;
   my @components = @_;
   foreach (@components) {
      my @comp = @{$_};
      my $l = @comp;
      if ($comp[$[] eq $comp[-1] && (@comp>1)) { # it's a cycle
         $l--; $l=-$l;
      }
      my @seq = make_pathseq ($l);
      for ($[..$#seq) {
         substr($startseq, $comp[$_], 1, $seq[$_]);
      }
   }
   return $startseq;
}

sub make_pathseq {
   # with one argument: return random cycle of length $l
   # two arguments: return $rand-th possible cycle sequence
   # if $l<0 assume closed cycle else path
   my ($l, $rand) = @_;
   my $ll = $l;
   if ($l<0) { # its a closed cycle
      $l = -$l;
      $ll = $l-1;
      die "cycles must have even length" if $l%2;
   }
   my $n = 2*($fibo[$l+1]+$fibo[$ll]);       # total number of seqs
   $rand = rand($n) unless (defined($rand));

   my @seq = ();
   # set first base in sequence
   if ($rand < $fibo[$ll]) {
      push @seq, 'A', 'U';
   } elsif ($rand < 2*$fibo[$ll]) {
      push @seq, 'C', 'G'; $rand -= $fibo[$ll];
   } else {
      $rand -= 2*$fibo[$ll]; $ll=$l;
      push @seq, ($rand>=$fibo[$l+1])?'U':'G';
      $rand -= $fibo[$l+1] if $rand >= $fibo[$l+1];
   }

   # grow sequence to length $l
   # if we have a cycle starting with A or C $ll=$l-1, else $ll=$l
   while (@seq < $l) {
      if ($rand < $fibo[$ll-@seq]) {
         push @seq, 'C','G' if ($seq[-1] eq 'G');
         push @seq, 'A','U' if ($seq[-1] eq 'U');
      } else {
         $rand -= $fibo[$ll-@seq];
         push @seq, ($seq[-1] eq 'G') ? 'U' : 'G';
      }
   }
   pop @seq if (@seq > $l); # in case we've added one base too many
   return @seq;
}

sub invert_cyclelist {
# foreach position store its cycle in @pos_list
   my @clist = @{$_[0]};
   my @pos_list;
   foreach my $lp (@clist) {
      my @l = @{$lp};
      for my $p (0..$#l) {
         $pos_list[$l[$p]] = [$lp, $p];
      }
   }
   return @pos_list;
}


sub do_opti {
   my $nom = shift;
   my $refseq = shift;
   my $clistp = shift;
   my $refcost = cost_function($refseq);
   my ($mutseq,$newcost);
   my $reject = 0;
   my %seen = ();
   my @poslist = invert_cyclelist($clistp);
   for my $d (1..$nom) {
      $mutseq = mutate_seq($refseq, $clistp, \@poslist);
      if (!exists $seen{$mutseq}) {
         $seen{$mutseq}=1;
         $newcost = cost_function($mutseq,$refcost);
         if ($opt_v) {
            printf '%4d %s %6.3f', $d, $mutseq, $newcost;
            (defined $sE) ? printf("%6.2f\n", $sE) : print "\n";
         }
         if ($newcost < $refcost) {
            $refseq = $mutseq;
            $refcost = $newcost;
            $reject = 0;
            print "New refseq!\n" if ($opt_v);
         }
      }
      else {
         $seen{$mutseq}++;
         $reject++;
         if ($reject >= $border) {
            last;
         }
      }
   }
   return $refseq;
}

sub mutate_seq {
   my $refseq = shift;
   my ($mutseq, $closed);
   my @clist = @{$_[0]};
   my @poslist = @{$_[1]};
   # my @cyc = @{$clist[int rand(@clist)]};
   my $pos = int rand(length $refseq);

   my ($cp, $cpos) = @{$poslist[$pos]};
   my @cyc = @{$cp};
   if ($#cyc>0 && $cyc[0] == $cyc[-1]) { #close cycle
      shift @cyc;
      $cpos--;
      $closed=1;
   }
   if (rand() < 1/@cyc) {
#      my $cp = $clist[int rand(@clist)];
      do { # prevent mutations to the same cyclesequence
         $mutseq = make_compatible($cp, $refseq);
      } while (substr($mutseq,$pos,1) eq substr($refseq,$pos,1));
   } else {
      $mutseq = $refseq;
      my $c = $mut{substr($mutseq, $pos, 1)};
      substr($mutseq, $pos, 1, $c);
      if ($cpos>0 || ($closed)) { # pos>0 or closed cyc
         my $cc = substr($mutseq, $cyc[$cpos-1], 1);
         substr($mutseq, $cyc[$cpos-1], 1, $wc{$c})
             if (!exists $pair{"$c$cc"});
      }
      if ($cpos+1<=$#cyc || $closed) {
         my $cp = ($cpos+1) % @cyc;
         my $cc = substr($mutseq, $cyc[$cp], 1);
         substr($mutseq, $cyc[$cp], 1, $wc{$c})
             if (!exists $pair{"$c$cc"});
      }
   }
   return $mutseq;
}

sub cost_function {
   my $seq = shift;
   my $refcost = shift;
   $RNA::temperature = $Temperature1;
   my $f1;
   if ($cpnt != -1) {
       $f1 = RNA::co_pf_fold($seq, undef);
   } else {
       $f1 = RNA::pf_fold($seq, undef);
   }
   my $e1 = RNA::energy_of_struct($seq, $fist);
   my $e1s = RNA::energy_of_struct($seq, $sest);
   my $cost;

   if ($Temperature1 != $Temperature2) {
      $RNA::temperature = $Temperature2;
      my $f2;
      if ($cpnt != -1)  {
          $f2 = RNA::co_pf_fold($seq, undef);
      } else {
          $f2 = RNA::pf_fold($seq, undef);
      }
      my $e2 = RNA::energy_of_struct($seq, $sest);
      my $e2s = RNA::energy_of_struct($seq, $fist);
      $cost = ($e1-$f1)+($e2-$f2) +
          $small*(($e1-$e1s+$dg) + ($e2 - $e2s+$dg));
   } else {
       $cost = $e1+$e1s-2*$f1+$small*($e1-$e1s+$dg)*($e1-$e1s+$dg);
       if ($bar && ((!defined $refcost) || $cost<$refcost)) {
#           eval {
#               require RNA::barrier;
#           }; die $@ if $@;
#           $sE = (RNA::barrier::find_saddle($seq, $fist, $sest, 20))[0];
           $sE = RNA::find_saddle($seq, $fist, $sest, 20)/100.;
           $sE -= ($e1+$e1s)/2;
           printf "sE = %6.2f %6.2f\n", $sE,(0.1*$small*($sE - $bar)*($sE - $bar)) ;
           $cost += (0.1*$small*($sE - $bar)*($sE - $bar));
       }
   }

   my $d = class_dist($seq, $cons);
   $cost += $d*50*$small;

   return $cost;
}

sub calc_number_of_seq {
   my @allcyc = @{$_[0]};
   my $nos = 1;
   foreach (@allcyc) {
      my @cyc = @{$_};
      my $l = @cyc;
      my $nc;
      if ($cyc[0] eq $cyc[-1] && $#cyc>0) { # closed cycle
         $l--;
         $nos *= 2*(fibo($l+1)+fibo($l-1));
      }
      else { # open path
         $nos *= 2*(fibo($l+1)+fibo($l));
      }
   }
   return $nos;
}

sub class_dist {
   my %classes = ('A' => 'A',
                  'C' => 'C',
                  'G' => 'G',
                  'T' => 'U',
                  'U' => 'U',
                  'R' => 'AG',  # purine
                  'Y' => 'CT',  # pyrimidine
                  'S' => 'CG',
                  'M' => 'AC',
                  'W' => 'AU',
                  'K' => 'GU',
                  'V' => 'ACG', # not T
                  'H' => 'ACU', # not G
                  'D' => 'AGU', # not C
                  'B' => 'CGU', # not A
                  'N' => 'ACGU');

   my @seq = split(//, uc $_[0]);
   my @template = split(//, uc $_[1]);

   my $d=0;
   foreach (@seq) {
      my $c = $classes{shift @template} || 'N';
      $d++ unless /[$c]/;
   }
   return $d;
}

sub usage {
   print <<EOF;
usage: $0 [options] [infile]
program specific options:
 -n <int>     run $0 <int> times (default: $noo)
 -trial <int> max number of sequences tested per run (default: $nom)
 -g <float>   small positivie parameter for costfunction (default: $small)
 -bar <float> barrier hight, seperating the two states (default: $bar)
 -T2 <float>  for temperature sensitive switches, temperature at which
              2nd structure is ground-state (default: undef)
standard Vienna RNA options:
 -T <float>   sets temperature to <float> (default 37.0C)
 -4           no extrastable tetraloops
 -d           no dangling ends
 -d2          use double dangles
 -noGU        no GU or UG basepairs allowed
 -noClosingGU no closing GU or UG basepairs allowed
 -noLP        no lonely basepairs allowed
 -P <string>  read fold-parameters from file <string>
infile format:
1st-line structure one
2nd-line structure two
3rd-line sequence constraints or empty line

EOF
    exit;
}

=head1 NAME

switch.pl - design bistable RNA sequences

=head1 SYNOPSIS

  switch.pl [options] [infile]

=head1 DESCRIPTION

switch.pl designs RNA sequences that exhibit two secondary
structures of almost equal stability. For any two given structures
there always exist many sequences compatible with both
structures. If both structures are reasonable stable, we can find
sequences where both target structures have almost equal energy, and
all other structures have much higher energies.

For details of the algorithm see:  Flamm et al.,
"Design of Multi-Stable RNA Molecules", RNA 7:254-265 (2001)

Input consists of three lines, the first two containing the target
structures in dot bracket notations. The third line may be used to
define sequence constraints: It contains a sequence string using
IUPAC codes for nucleotide classes (i.e. C<Y> for pyrimidine, C<R> for
purine, C<N> for anything...). If the line is empty or shorter than the
structures it is padded with C<N>s.

Sequence constraints are not strictly enforced, instead a constraint
violation term is added to the cost function. Thus it is legal to
specify sequence constraints that cannot be fulfilled.

switch.pl uses the Vienna RNA package for energy evaluation, the
Vienna RNA package and corresponding Perl module therefore have to be
installed. 

=head1 OPTIONS

=over 4

=item B<-n> <int>

number of independent optimization runs

=item B<-trial> <int>

maximum number of sequences to test per run, up to a million or so.

=item B<-g> <int>

Parameter of the cost function that weights the importance of
equal energies, and desired energy barriers.

The cost function primarily optimizes product of Boltzman
probabilities of the two structures C<p(S1)*P(S2)>, in addition it
contains a penalty proportional to C<[E(S1)-E(S2)]^2> that
enforces equal energies for both structures. With the --bar it
also tries design for a given energy barrier. The -g parameter
defines the weight of these additional cost function terms.

=item B<-T> <float>

Temperature in C for all energy calculations. Default 37C.

=item B<-T2> <float>

For temperature sensitive switches, use -T and -T2 to define the
temperatures at which structures S1 and S2 should be prefered.

=item B<-bar> <float>

Size of the desired energy barrier between the two structures in
kcal/mol. A fast heuristic that looks at shortest refolding paths is
used to estimate the barrier. Requires a recent version of the Vienna
RNA package that includes the find_saddle() function for estimating
refolding paths.

=item ViennaRNA standard options

the -noLP, -P <paramfile>, -d, -d2, -4, -noGU, -noClosingGU,
should work as usual. See the RNAfold man page for details.

=back

=head1 AUTHORS

Ivo L. Hofacker, Christoph Flamm, Peter Stadler, Sebastian
Maurer-Stroh, Martin Zehl.
Send comments to <ivo@tbi.univie.ac.at>.

=cut

#  End of file