binaries/src/ViennaRNA/H/alifold.h

   1 #ifndef __VIENNA_RNA_PACKAGE_ALIFOLD_H__
   2 #define __VIENNA_RNA_PACKAGE_ALIFOLD_H__
   3
   4 #include "data_structures.h"
   5
   6 /**
   7  *  \addtogroup consensus_fold
   8  *  \brief compute various properties (consensus MFE structures, partition function,
   9  *  Boltzmann distributed stochastic samples, ...) for RNA sequence alignments
  10  *
  11  *  Consensus structures can be predicted by a modified version of the
  12  *  fold() algorithm that takes a set of aligned sequences instead
  13  *  of a single sequence. The energy function consists of the mean energy
  14  *  averaged over the sequences, plus a covariance term that favors pairs
  15  *  with consistent and compensatory mutations and penalizes pairs that
  16  *  cannot be formed by all structures. For details see \cite hofacker:2002 and
  17  *  \cite bernhart:2008.
  18  *  @{
  19  *    \file alifold.h
  20  *    \brief compute various properties (consensus MFE structures, partition function, Boltzmann
  21  *    distributed stochastic samples, ...) for RNA sequence alignments
  22  *
  23  *  @}
  24  */
  25
  26
  27 /**
  28  *  \addtogroup consensus_mfe_fold
  29  *  \ingroup consensus_fold
  30  *  @{
  31  *
  32  *  @}
  33  */
  34
  35 /**
  36  *  \brief This variable controls the weight of the covariance term in the
  37  *  energy function of alignment folding algorithms
  38  *
  39  *  \ingroup consensus_fold
  40  *
  41  *  Default is 1.
  42  */
  43 extern  double  cv_fact;
  44 /**
  45  *  \brief This variable controls the magnitude of the penalty for non-compatible sequences in
  46  *  the covariance term of alignment folding algorithms.
  47  *
  48  *  \ingroup consensus_fold
  49  *
  50  *  Default is 1.
  51  */
  52 extern  double  nc_fact;
  53
  54 /*
  55 ##############################################
  56 # MFE VARIANTS OF THE ALIFOLD IMPLEMENTATION #
  57 ##############################################
  58 */
  59
  60 /**
  61  *  \brief Update the energy parameters for alifold function
  62  *
  63  *  Call this to recalculate the pair matrix and energy parameters after a
  64  *  change in folding parameters like #temperature
  65  */
  66 void update_alifold_params(void);
  67
  68
  69 /**
  70  *  \brief Compute MFE and according consensus structure of an alignment of sequences
  71  *
  72  *  This function predicts the consensus structure for the aligned 'sequences'
  73  *  and returns the minimum free energy; the mfe structure in bracket
  74  *  notation is returned in 'structure'.
  75  *
  76  *  Sufficient space must be allocated for 'structure' before calling
  77  *  alifold().
  78  *
  79  *  \ingroup consensus_mfe_fold
  80  *
  81  *  \param strings    A pointer to a NULL terminated array of character arrays
  82  *  \param structure  A pointer to a character array that may contain a constraining consensus structure
  83  *                    (will be overwritten by a consensus structure that exhibits the MFE)
  84  *  \return           The free energy score in kcal/mol
  85  */
  86 float  alifold( const char **strings,
  87                 char *structure);
  88
  89
  90 /**
  91  *  \brief Compute MFE and according structure of an alignment of sequences assuming the sequences are circular instead of linear
  92  *
  93  *  \ingroup consensus_mfe_fold
  94  *
  95  *  \param strings    A pointer to a NULL terminated array of character arrays
  96  *  \param structure  A pointer to a character array that may contain a constraining consensus structure
  97  *                    (will be overwritten by a consensus structure that exhibits the MFE)
  98  *  \return           The free energy score in kcal/mol
  99  */
 100 float  circalifold( const char **strings,
 101                     char *structure);
 102
 103 /**
 104  *  \brief Free the memory occupied by MFE alifold functions
 105  *
 106  *  \ingroup consensus_mfe_fold
 107  *
 108  */
 109 void    free_alifold_arrays(void);
 110
 111 /**
 112  *  \brief Get the mean pairwise identity in steps from ?to?(ident)
 113  *
 114  *  \ingroup consensus_fold
 115  *
 116  *  \param Alseq
 117  *  \param n_seq  The number of sequences in the alignment
 118  *  \param length The length of the alignment
 119  *  \param mini
 120  *  \return       The mean pairwise identity
 121  */
 122 int get_mpi(char *Alseq[],
 123             int n_seq,
 124             int length,
 125             int *mini);
 126
 127 /**
 128  *  \brief Read a ribosum or other user-defined scoring matrix
 129  *
 130  *  \ingroup consensus_fold
 131  *
 132  */
 133 float   **readribosum(char *name);
 134
 135 /**
 136  *  \brief Calculate the free energy of a consensus structure given a set of aligned sequences
 137  *
 138  *  \ingroup consensus_fold
 139  *
 140  *  \param  sequences   The NULL terminated array of sequences
 141  *  \param  structure   The consensus structure
 142  *  \param  n_seq       The number of sequences in the alignment
 143  *  \param  energy      A pointer to an array of at least two floats that will hold the free energies
 144  *                      (energy[0] will contain the free energy, energy[1] will be filled with the covariance energy term)
 145  *  \returns free energy in kcal/mol
 146  *
 147  */
 148 float   energy_of_alistruct(const char **sequences,
 149                             const char *structure,
 150                             int n_seq,
 151                             float *energy);
 152
 153 float   energy_of_ali_gquad_structure(const char **sequences,
 154                                       const char *structure,
 155                                       int n_seq,
 156                                       float *energy);
 157
 158 /*
 159 #############################################################
 160 # some helper functions that might be useful in the library #
 161 #############################################################
 162 */
 163
 164 /**
 165  *  \brief Get arrays with encoded sequence of the alignment
 166  *
 167  *  this function assumes that in S, S5, s3, ss and as enough
 168  *  space is already allocated (size must be at least sequence length+2)
 169  *
 170  *  \ingroup consensus_fold
 171  *
 172  *  \param sequence The gapped sequence from the alignment
 173  *  \param S        pointer to an array that holds encoded sequence
 174  *  \param s5      pointer to an array that holds the next base 5' of alignment position i
 175  *  \param s3      pointer to an array that holds the next base 3' of alignment position i
 176  *  \param ss
 177  *  \param as
 178  *  \param circ    assume the molecules to be circular instead of linear (circ=0)
 179  */
 180 void encode_ali_sequence( const char *sequence,
 181                           short *S,
 182                           short *s5,
 183                           short *s3,
 184                           char *ss,
 185                           unsigned short *as,
 186                           int circ);
 187
 188 /**
 189  *  \brief Allocate memory for sequence array used to deal with aligned sequences
 190  *
 191  *  Note that these arrays will also be initialized according to the sequence alignment given
 192  *
 193  *  \ingroup consensus_fold
 194  *
 195  *  \see free_sequence_arrays()
 196  *
 197  *  \param sequences  The aligned sequences
 198  *  \param S          A pointer to the array of encoded sequences
 199  *  \param S5         A pointer to the array that contains the next 5' nucleotide of a sequence position
 200  *  \param S3         A pointer to the array that contains the next 3' nucleotide of a sequence position
 201  *  \param a2s        A pointer to the array that contains the alignment to sequence position mapping
 202  *  \param Ss         A pointer to the array that contains the ungapped sequence
 203  *  \param circ       assume the molecules to be circular instead of linear (circ=0)
 204  */
 205 void  alloc_sequence_arrays(const char **sequences,
 206                             short ***S,
 207                             short ***S5,
 208                             short ***S3,
 209                             unsigned short ***a2s,
 210                             char ***Ss,
 211                             int circ);
 212
 213 /**
 214  *  \brief Free the memory of the sequence arrays used to deal with aligned sequences
 215  *
 216  *  This function frees the memory previously allocated with alloc_sequence_arrays()
 217  *
 218  *  \ingroup consensus_fold
 219  *
 220  *  \see alloc_sequence_arrays()
 221  *
 222  *  \param n_seq      The number of aligned sequences
 223  *  \param S          A pointer to the array of encoded sequences
 224  *  \param S5         A pointer to the array that contains the next 5' nucleotide of a sequence position
 225  *  \param S3         A pointer to the array that contains the next 3' nucleotide of a sequence position
 226  *  \param a2s        A pointer to the array that contains the alignment to sequence position mapping
 227  *  \param Ss         A pointer to the array that contains the ungapped sequence
 228  */
 229 void  free_sequence_arrays( unsigned int n_seq,
 230                             short ***S,
 231                             short ***S5,
 232                             short ***S3,
 233                             unsigned short ***a2s,
 234                             char ***Ss);
 235
 236 /*
 237 #############################################################
 238 # PARTITION FUNCTION VARIANTS OF THE ALIFOLD IMPLEMENTATION #
 239 #############################################################
 240 */
 241
 242
 243 /**
 244  *  \addtogroup consensus_pf_fold
 245  *  \ingroup consensus_fold
 246  *  @{
 247  *
 248  *  @}
 249  */
 250
 251 /**
 252  *  \brief
 253  *
 254  *  \ingroup consensus_pf_fold
 255  *
 256  *  \param  sequences
 257  *  \param  structure
 258  *  \param  pl
 259  *  \param  parameters
 260  *  \param  calculate_bppm
 261  *  \param  is_constrained
 262  *  \param  is_circular
 263  *  \return
 264  */
 265 float alipf_fold_par( const char **sequences,
 266                       char *structure,
 267                       plist **pl,
 268                       pf_paramT *parameters,
 269                       int calculate_bppm,
 270                       int is_constrained,
 271                       int is_circular);
 272
 273 /**
 274  *  \brief
 275  *
 276  *  The partition function version of alifold() works in analogy to
 277  *  pf_fold(). Pair probabilities and information about sequence
 278  *  covariations are returned via the 'pi' variable as a list of
 279  *  #pair_info structs. The list is terminated by the first entry with
 280  *  pi.i = 0.
 281  *
 282  *  \ingroup consensus_pf_fold
 283  *
 284  *  \param sequences
 285  *  \param structure
 286  *  \param pl
 287  *  \return
 288  */
 289 float alipf_fold( const char **sequences,
 290                   char *structure,
 291                   plist **pl);
 292
 293 /**
 294  *  \brief
 295  *
 296  *  \ingroup consensus_pf_fold
 297  *
 298  *  \param sequences
 299  *  \param structure
 300  *  \param pl
 301  *  \return
 302  */
 303 float alipf_circ_fold(const char **sequences,
 304                       char *structure,
 305                       plist **pl);
 306
 307
 308 /**
 309  *  \brief Get a pointer to the base pair probability array
 310  *
 311  *  Accessing the base pair probabilities for a pair (i,j) is achieved by
 312  *  \verbatim FLT_OR_DBL *pr = export_bppm(); pr_ij = pr[iindx[i]-j]; \endverbatim
 313  *
 314  *  \ingroup consensus_pf_fold
 315  *
 316  *  \see get_iindx()
 317  *  \return A pointer to the base pair probability array
 318  */
 319 FLT_OR_DBL *export_ali_bppm(void);
 320
 321 /**
 322  *  \brief
 323  *
 324  *  \ingroup consensus_pf_fold
 325  *
 326  */
 327 void  free_alipf_arrays(void);
 328
 329 /**
 330  *  \addtogroup consensus_stochbt
 331  *  @{
 332  *
 333  *  @}
 334  */
 335
 336 /**
 337  *  \brief Sample a consensus secondary structure from the Boltzmann ensemble according its probability\n
 338  *
 339  *  \ingroup consensus_stochbt
 340  *
 341  *  \param  prob  to be described (berni)
 342  *  \return       A sampled consensus secondary structure in dot-bracket notation
 343  */
 344 char  *alipbacktrack(double *prob);
 345
 346
 347 /**
 348  *  \brief Get pointers to (almost) all relavant arrays used in alifold's partition function computation
 349  *
 350  *  \ingroup consensus_fold
 351  *
 352  *  \note To obtain meaningful pointers, call alipf_fold first!
 353  *
 354  *  \see pf_alifold(), alipf_circ_fold()
 355  *
 356  *  \param S_p      A pointer to the 'S' array (integer representation of nucleotides)
 357  *  \param S5_p     A pointer to the 'S5' array
 358  *  \param S3_p     A pointer to the 'S3' array
 359  *  \param a2s_p    A pointer to the pair type matrix
 360  *  \param Ss_p     A pointer to the 'Ss' array
 361  *  \param qb_p     A pointer to the Q<sup>B</sup> matrix
 362  *  \param qm_p     A pointer to the Q<sup>M</sup> matrix
 363  *  \param q1k_p    A pointer to the 5' slice of the Q matrix (\f$q1k(k) = Q(1, k)\f$)
 364  *  \param qln_p    A pointer to the 3' slice of the Q matrix (\f$qln(l) = Q(l, n)\f$)
 365  *  \return         Non Zero if everything went fine, 0 otherwise
 366  */
 367 int get_alipf_arrays(short ***S_p,
 368                      short ***S5_p,
 369                      short ***S3_p,
 370                      unsigned short ***a2s_p,
 371                      char ***Ss_p,
 372                      FLT_OR_DBL **qb_p,
 373                      FLT_OR_DBL **qm_p,
 374                      FLT_OR_DBL **q1k_p,
 375                      FLT_OR_DBL **qln_p,
 376                      short **pscore);
 377
 378 #endif