1 #ifndef __VIENNA_RNA_PACKAGE_PART_FUNC_H__
2 #define __VIENNA_RNA_PACKAGE_PART_FUNC_H__
4 #include "data_structures.h"
7 #define DEPRECATED(func) func __attribute__ ((deprecated))
9 #define DEPRECATED(func) func
15 * \brief This section provides information about all functions and variables related to
16 * the calculation of the partition function and base pair probabilities.
18 * Instead of the minimum free energy structure the partition function of all possible structures
19 * and from that the pairing probability for every possible pair can be calculated, using a dynamic
20 * programming algorithm as described in \cite mccaskill:1990.
24 * \brief Partition function of single RNA sequences
26 * This file includes (almost) all function declarations within the <b>RNAlib</b> that are related to
27 * Partion function folding...
32 * \brief Flag indicating that auxilary arrays are needed throughout the computations. This is essential for stochastic backtracking
34 * Set this variable to 1 prior to a call of pf_fold() to ensure that all matrices needed for stochastic backtracking
35 * are filled in the forward recursions
37 * \ingroup subopt_stochbt
39 * \see pbacktrack(), pbacktrack_circ
44 #################################################
45 # PARTITION FUNCTION COMPUTATION #
46 #################################################
50 * \brief Compute the partition function \f$Q\f$ for a given RNA sequence
52 * If \a structure is not a NULL pointer on input, it contains on
53 * return a string consisting of the letters " . , | { } ( ) " denoting
54 * bases that are essentially unpaired, weakly paired, strongly paired without
55 * preference, weakly upstream (downstream) paired, or strongly up-
56 * (down-)stream paired bases, respectively.
57 * If #fold_constrained is not 0, the \a structure string is
58 * interpreted on input as a list of constraints for the folding. The
59 * character "x" marks bases that must be unpaired, matching brackets " ( ) "
60 * denote base pairs, all other characters are ignored. Any pairs
61 * conflicting with the constraint will be forbidden. This is usually sufficient
62 * to ensure the constraints are honored.
63 * If tha parameter calculate_bppm is set to 0 base pairing probabilities will not
64 * be computed (saving CPU time), otherwise after calculations took place #pr will
65 * contain the probability that bases \a i and \a j pair.
69 * \note The global array #pr is deprecated and the user who wants the calculated
70 * base pair probabilities for further computations is advised to use the function
72 * \post After successful run the hidden folding matrices are filled with the appropriate Boltzmann factors.
73 * Depending on whether the global variable #do_backtrack was set the base pair probabilities are already
74 * computed and may be accessed for further usage via the export_bppm() function.
75 * A call of free_pf_arrays() will free all memory allocated by this function.
76 * Successive calls will first free previously allocated memory before starting the computation.
77 * \see pf_fold(), pf_circ_fold(), bppm_to_structure(), export_bppm(), get_boltzmann_factors(), free_pf_arrays()
78 * \param[in] sequence The RNA sequence input
79 * \param[in,out] structure A pointer to a char array where a base pair probability information can be stored in a
80 * pseudo-dot-bracket notation (may be NULL, too)
81 * \param[in] parameters Data structure containing the precalculated Boltzmann factors
82 * \param[in] calculate_bppm Switch to Base pair probability calculations on/off (0==off)
83 * \param[in] is_constrained Switch to indicate that a structure contraint is passed via the structure argument (0==off)
84 * \param[in] is_circular Switch to (de-)activate postprocessing steps in case RNA sequence is circular (0==off)
85 * \return The Gibbs free energy of the ensemble (\f$G = -RT \cdot \log(Q) \f$) in kcal/mol
87 float pf_fold_par( const char *sequence,
89 pf_paramT *parameters,
95 * \brief Compute the partition function \f$Q\f$ of an RNA sequence
97 * If \a structure is not a NULL pointer on input, it contains on
98 * return a string consisting of the letters " . , | { } ( ) " denoting
99 * bases that are essentially unpaired, weakly paired, strongly paired without
100 * preference, weakly upstream (downstream) paired, or strongly up-
101 * (down-)stream paired bases, respectively.
102 * If #fold_constrained is not 0, the \a structure string is
103 * interpreted on input as a list of constraints for the folding. The
104 * character "x" marks bases that must be unpaired, matching brackets " ( ) "
105 * denote base pairs, all other characters are ignored. Any pairs
106 * conflicting with the constraint will be forbidden. This is usually sufficient
107 * to ensure the constraints are honored.
108 * If #do_backtrack has been set to 0 base pairing probabilities will not
109 * be computed (saving CPU time), otherwise #pr will contain the probability
110 * that bases \a i and \a j pair.
114 * \note The global array #pr is deprecated and the user who wants the calculated
115 * base pair probabilities for further computations is advised to use the function
118 * This function is not entirely threadsafe. While the recursions are working on their
119 * own copies of data the model details for the recursions are determined from the global
120 * settings just before entering the recursions. Consider using pf_fold_par() for a
121 * really threadsafe implementation.
122 * \pre This function takes its model details from the global variables provided in \e RNAlib
123 * \post After successful run the hidden folding matrices are filled with the appropriate Boltzmann factors.
124 * Depending on whether the global variable #do_backtrack was set the base pair probabilities are already
125 * computed and may be accessed for further usage via the export_bppm() function.
126 * A call of free_pf_arrays() will free all memory allocated by this function.
127 * Successive calls will first free previously allocated memory before starting the computation.
128 * \see pf_fold_par(), pf_circ_fold(), bppm_to_structure(), export_bppm()
129 * \param sequence The RNA sequence input
130 * \param structure A pointer to a char array where a base pair probability information can be stored in a pseudo-dot-bracket notation (may be NULL, too)
131 * \return The Gibbs free energy of the ensemble (\f$G = -RT \cdot \log(Q) \f$) in kcal/mol
133 float pf_fold(const char *sequence,
137 * \brief Compute the partition function of a circular RNA sequence
141 * \note The global array #pr is deprecated and the user who wants the calculated
142 * base pair probabilities for further computations is advised to use the function
145 * This function is not entirely threadsafe. While the recursions are working on their
146 * own copies of data the model details for the recursions are determined from the global
147 * settings just before entering the recursions. Consider using pf_fold_par() for a
148 * really threadsafe implementation.
149 * \pre This function takes its model details from the global variables provided in \e RNAlib
150 * \post After successful run the hidden folding matrices are filled with the appropriate Boltzmann factors.
151 * Depending on whether the global variable #do_backtrack was set the base pair probabilities are already
152 * computed and may be accessed for further usage via the export_bppm() function.
153 * A call of free_pf_arrays() will free all memory allocated by this function.
154 * Successive calls will first free previously allocated memory before starting the computation.
155 * \see pf_fold_par(), pf_fold()
156 * \param[in] sequence The RNA sequence input
157 * \param[in,out] structure A pointer to a char array where a base pair probability information can be
158 * stored in a pseudo-dot-bracket notation (may be NULL, too)
159 * \return The Gibbs free energy of the ensemble (\f$G = -RT \cdot \log(Q) \f$) in kcal/mol
161 float pf_circ_fold( const char *sequence,
165 * \brief Sample a secondary structure from the Boltzmann ensemble according its probability\n
167 * \ingroup subopt_stochbt
168 * \pre pf_fold_par() or pf_fold() have to be called first to fill the partition function matrices
170 * \param sequence The RNA sequence
171 * \return A sampled secondary structure in dot-bracket notation
173 char *pbacktrack(char *sequence);
176 * \brief Sample a secondary structure of a circular RNA from the Boltzmann ensemble according its probability
178 * This function does the same as \ref pbacktrack() but assumes the RNA molecule to be circular
180 * \ingroup subopt_stochbt
181 * \pre pf_fold_par() or pf_fold_circ() have to be called first to fill the partition function matrices
183 * \param sequence The RNA sequence
184 * \return A sampled secondary structure in dot-bracket notation
186 char *pbacktrack_circ(char *sequence);
189 * \brief Free arrays for the partition function recursions
191 * Call this function if you want to free all allocated memory associated with
192 * the partition function forward recursion.
193 * \note Successive calls of pf_fold(), pf_circ_fold() already check if they should free
194 * any memory from a previous run.
195 * \note <b>OpenMP notice:</b><br>
196 * This function should be called before leaving a thread in order to avoid leaking memory
200 * \post All memory allocated by pf_fold_par(), pf_fold() or pf_circ_fold() will be free'd
201 * \see pf_fold_par(), pf_fold(), pf_circ_fold()
203 void free_pf_arrays(void);
206 * \brief Recalculate energy parameters
208 * Call this function to recalculate the pair matrix and energy parameters
209 * after a change in folding parameters like #temperature
214 void update_pf_params(int length);
217 * \brief Recalculate energy parameters
222 void update_pf_params_par(int length, pf_paramT *parameters);
225 * \brief Get a pointer to the base pair probability array
228 * Accessing the base pair probabilities for a pair (i,j) is achieved by
230 * FLT_OR_DBL *pr = export_bppm();
231 * pr_ij = pr[iindx[i]-j];
234 * \pre Call pf_fold_par(), pf_fold() or pf_circ_fold() first to fill the base pair probability array
236 * \see pf_fold(), pf_circ_fold(), get_iindx()
238 * \return A pointer to the base pair probability array
240 FLT_OR_DBL *export_bppm(void);
243 #################################################
244 # OTHER PARTITION FUNCTION RELATED DECLARATIONS #
245 #################################################
249 * \brief Create a plist from a probability matrix
251 * The probability matrix given is parsed and all pair probabilities above
252 * the given threshold are used to create an entry in the plist
254 * The end of the plist is marked by sequence positions i as well as j
255 * equal to 0. This condition should be used to stop looping over its
258 * \note This function is threadsafe
260 * \param[out] pl A pointer to the plist that is to be created
261 * \param[in] probs The probability matrix used for creting the plist
262 * \param[in] length The length of the RNA sequence
263 * \param[in] cutoff The cutoff value
265 void assign_plist_from_pr( plist **pl,
270 /* this doesn't work if free_pf_arrays() is called before */
271 void assign_plist_gquad_from_pr(plist **pl,
275 char *get_centroid_struct_gquad_pr(int length,
279 * \brief Get the pointers to (almost) all relavant computation arrays used in partition function computation
282 * \pre In order to assign meaningful pointers, you have to call pf_fold_par() or pf_fold() first!
283 * \see pf_fold_par(), pf_fold(), pf_circ_fold()
284 * \param[out] S_p A pointer to the 'S' array (integer representation of nucleotides)
285 * \param[out] S1_p A pointer to the 'S1' array (2nd integer representation of nucleotides)
286 * \param[out] ptype_p A pointer to the pair type matrix
287 * \param[out] qb_p A pointer to the Q<sup>B</sup> matrix
288 * \param[out] qm_p A pointer to the Q<sup>M</sup> matrix
289 * \param[out] q1k_p A pointer to the 5' slice of the Q matrix (\f$q1k(k) = Q(1, k)\f$)
290 * \param[out] qln_p A pointer to the 3' slice of the Q matrix (\f$qln(l) = Q(l, n)\f$)
291 * \return Non Zero if everything went fine, 0 otherwise
293 int get_pf_arrays(short **S_p,
302 * \brief Get the free energy of a subsequence from the q[] array
304 double get_subseq_F(int i, int j);
307 * \brief Get the centroid structure of the ensemble
309 * This function is a threadsafe replacement for \ref centroid() with a 'plist' input
311 * The centroid is the structure with the minimal average distance to all other structures
312 * \n \f$ <d(S)> = \sum_{(i,j) \in S} (1-p_{ij}) + \sum_{(i,j) \notin S} p_{ij} \f$ \n
313 * Thus, the centroid is simply the structure containing all pairs with \f$p_ij>0.5\f$
314 * The distance of the centroid to the ensemble is written to the memory adressed by \a dist.
316 * \ingroup centroid_fold
317 * \param[in] length The length of the sequence
318 * \param[out] dist A pointer to the distance variable where the centroid distance will be written to
319 * \param[in] pl A pair list containing base pair probability information about the ensemble
320 * \return The centroid structure of the ensemble in dot-bracket notation
322 char *get_centroid_struct_pl(int length,
327 * \brief Get the centroid structure of the ensemble
329 * This function is a threadsafe replacement for \ref centroid() with a probability array input
331 * The centroid is the structure with the minimal average distance to all other structures
332 * \n \f$ <d(S)> = \sum_{(i,j) \in S} (1-p_{ij}) + \sum_{(i,j) \notin S} p_{ij} \f$ \n
333 * Thus, the centroid is simply the structure containing all pairs with \f$p_ij>0.5\f$
334 * The distance of the centroid to the ensemble is written to the memory adressed by \a dist.
336 * \ingroup centroid_fold
337 * \param[in] length The length of the sequence
338 * \param[out] dist A pointer to the distance variable where the centroid distance will be written to
339 * \param[in] pr A upper triangular matrix containing base pair probabilities (access via iindx \ref get_iindx() )
340 * \return The centroid structure of the ensemble in dot-bracket notation
342 char *get_centroid_struct_pr(int length,
347 * \brief Get the mean base pair distance of the last partition function computation
349 * \note To ensure thread-safety, use the function mean_bp_distance_pr() instead!
353 * \see mean_bp_distance_pr()
356 * \return mean base pair distance in thermodynamic ensemble
358 double mean_bp_distance(int length);
361 * \brief Get the mean base pair distance in the thermodynamic ensemble
363 * This is a threadsafe implementation of \ref mean_bp_dist() !
365 * \f$<d> = \sum_{a,b} p_a p_b d(S_a,S_b)\f$\n
366 * this can be computed from the pair probs \f$p_ij\f$ as\n
367 * \f$<d> = \sum_{ij} p_{ij}(1-p_{ij})\f$
369 * \note This function is threadsafe
373 * \param length The length of the sequence
374 * \param pr The matrix containing the base pair probabilities
375 * \return The mean pair distance of the structure ensemble
377 double mean_bp_distance_pr(int length,
381 * \brief Create a dot-bracket like structure string from base pair probability matrix
383 void bppm_to_structure(char *structure,
385 unsigned int length);
387 plist *stackProb(double cutoff);
390 * \brief Get a pseudo dot bracket notation for a given probability information
392 char bppm_symbol(const float *x);
396 #################################################
397 # DEPRECATED FUNCTIONS #
398 #################################################
402 * \brief Allocate space for pf_fold()
404 * \deprecated This function is obsolete and will be removed soon!
406 DEPRECATED(void init_pf_fold(int length));
409 * \deprecated This function is deprecated and should not be used anymore as it is not threadsafe!
410 * \see get_centroid_struct_pl(), get_centroid_struct_pr()
412 DEPRECATED(char *centroid(int length,
413 double *dist)); /* mean pair distance of ensemble */
416 * get the mean pair distance of ensemble
418 * \deprecated This function is not threadsafe and should not be used anymore. Use \ref mean_bp_distance() instead!
420 DEPRECATED(double mean_bp_dist(int length));
423 * \deprecated Use \ref exp_E_IntLoop() from loop_energies.h instead
425 DEPRECATED(double expLoopEnergy(int u1,
435 * \deprecated Use exp_E_Hairpin() from loop_energies.h instead
437 DEPRECATED(double expHairpinEnergy( int u,
441 const char *string));