#ifndef __VIENNA_RNA_PACKAGE_ALIFOLD_H__ #define __VIENNA_RNA_PACKAGE_ALIFOLD_H__ #include "data_structures.h" /** * \addtogroup consensus_fold * \brief compute various properties (consensus MFE structures, partition function, * Boltzmann distributed stochastic samples, ...) for RNA sequence alignments * * Consensus structures can be predicted by a modified version of the * fold() algorithm that takes a set of aligned sequences instead * of a single sequence. The energy function consists of the mean energy * averaged over the sequences, plus a covariance term that favors pairs * with consistent and compensatory mutations and penalizes pairs that * cannot be formed by all structures. For details see \cite hofacker:2002 and * \cite bernhart:2008. * @{ * \file alifold.h * \brief compute various properties (consensus MFE structures, partition function, Boltzmann * distributed stochastic samples, ...) for RNA sequence alignments * * @} */ /** * \addtogroup consensus_mfe_fold * \ingroup consensus_fold * @{ * * @} */ /** * \brief This variable controls the weight of the covariance term in the * energy function of alignment folding algorithms * * \ingroup consensus_fold * * Default is 1. */ extern double cv_fact; /** * \brief This variable controls the magnitude of the penalty for non-compatible sequences in * the covariance term of alignment folding algorithms. * * \ingroup consensus_fold * * Default is 1. */ extern double nc_fact; /* ############################################## # MFE VARIANTS OF THE ALIFOLD IMPLEMENTATION # ############################################## */ /** * \brief Update the energy parameters for alifold function * * Call this to recalculate the pair matrix and energy parameters after a * change in folding parameters like #temperature */ void update_alifold_params(void); /** * \brief Compute MFE and according consensus structure of an alignment of sequences * * This function predicts the consensus structure for the aligned 'sequences' * and returns the minimum free energy; the mfe structure in bracket * notation is returned in 'structure'. * * Sufficient space must be allocated for 'structure' before calling * alifold(). * * \ingroup consensus_mfe_fold * * \param strings A pointer to a NULL terminated array of character arrays * \param structure A pointer to a character array that may contain a constraining consensus structure * (will be overwritten by a consensus structure that exhibits the MFE) * \return The free energy score in kcal/mol */ float alifold( const char **strings, char *structure); /** * \brief Compute MFE and according structure of an alignment of sequences assuming the sequences are circular instead of linear * * \ingroup consensus_mfe_fold * * \param strings A pointer to a NULL terminated array of character arrays * \param structure A pointer to a character array that may contain a constraining consensus structure * (will be overwritten by a consensus structure that exhibits the MFE) * \return The free energy score in kcal/mol */ float circalifold( const char **strings, char *structure); /** * \brief Free the memory occupied by MFE alifold functions * * \ingroup consensus_mfe_fold * */ void free_alifold_arrays(void); /** * \brief Get the mean pairwise identity in steps from ?to?(ident) * * \ingroup consensus_fold * * \param Alseq * \param n_seq The number of sequences in the alignment * \param length The length of the alignment * \param mini * \return The mean pairwise identity */ int get_mpi(char *Alseq[], int n_seq, int length, int *mini); /** * \brief Read a ribosum or other user-defined scoring matrix * * \ingroup consensus_fold * */ float **readribosum(char *name); /** * \brief Calculate the free energy of a consensus structure given a set of aligned sequences * * \ingroup consensus_fold * * \param sequences The NULL terminated array of sequences * \param structure The consensus structure * \param n_seq The number of sequences in the alignment * \param energy A pointer to an array of at least two floats that will hold the free energies * (energy[0] will contain the free energy, energy[1] will be filled with the covariance energy term) * \returns free energy in kcal/mol * */ float energy_of_alistruct(const char **sequences, const char *structure, int n_seq, float *energy); float energy_of_ali_gquad_structure(const char **sequences, const char *structure, int n_seq, float *energy); /* ############################################################# # some helper functions that might be useful in the library # ############################################################# */ /** * \brief Get arrays with encoded sequence of the alignment * * this function assumes that in S, S5, s3, ss and as enough * space is already allocated (size must be at least sequence length+2) * * \ingroup consensus_fold * * \param sequence The gapped sequence from the alignment * \param S pointer to an array that holds encoded sequence * \param s5 pointer to an array that holds the next base 5' of alignment position i * \param s3 pointer to an array that holds the next base 3' of alignment position i * \param ss * \param as * \param circ assume the molecules to be circular instead of linear (circ=0) */ void encode_ali_sequence( const char *sequence, short *S, short *s5, short *s3, char *ss, unsigned short *as, int circ); /** * \brief Allocate memory for sequence array used to deal with aligned sequences * * Note that these arrays will also be initialized according to the sequence alignment given * * \ingroup consensus_fold * * \see free_sequence_arrays() * * \param sequences The aligned sequences * \param S A pointer to the array of encoded sequences * \param S5 A pointer to the array that contains the next 5' nucleotide of a sequence position * \param S3 A pointer to the array that contains the next 3' nucleotide of a sequence position * \param a2s A pointer to the array that contains the alignment to sequence position mapping * \param Ss A pointer to the array that contains the ungapped sequence * \param circ assume the molecules to be circular instead of linear (circ=0) */ void alloc_sequence_arrays(const char **sequences, short ***S, short ***S5, short ***S3, unsigned short ***a2s, char ***Ss, int circ); /** * \brief Free the memory of the sequence arrays used to deal with aligned sequences * * This function frees the memory previously allocated with alloc_sequence_arrays() * * \ingroup consensus_fold * * \see alloc_sequence_arrays() * * \param n_seq The number of aligned sequences * \param S A pointer to the array of encoded sequences * \param S5 A pointer to the array that contains the next 5' nucleotide of a sequence position * \param S3 A pointer to the array that contains the next 3' nucleotide of a sequence position * \param a2s A pointer to the array that contains the alignment to sequence position mapping * \param Ss A pointer to the array that contains the ungapped sequence */ void free_sequence_arrays( unsigned int n_seq, short ***S, short ***S5, short ***S3, unsigned short ***a2s, char ***Ss); /* ############################################################# # PARTITION FUNCTION VARIANTS OF THE ALIFOLD IMPLEMENTATION # ############################################################# */ /** * \addtogroup consensus_pf_fold * \ingroup consensus_fold * @{ * * @} */ /** * \brief * * \ingroup consensus_pf_fold * * \param sequences * \param structure * \param pl * \param parameters * \param calculate_bppm * \param is_constrained * \param is_circular * \return */ float alipf_fold_par( const char **sequences, char *structure, plist **pl, pf_paramT *parameters, int calculate_bppm, int is_constrained, int is_circular); /** * \brief * * The partition function version of alifold() works in analogy to * pf_fold(). Pair probabilities and information about sequence * covariations are returned via the 'pi' variable as a list of * #pair_info structs. The list is terminated by the first entry with * pi.i = 0. * * \ingroup consensus_pf_fold * * \param sequences * \param structure * \param pl * \return */ float alipf_fold( const char **sequences, char *structure, plist **pl); /** * \brief * * \ingroup consensus_pf_fold * * \param sequences * \param structure * \param pl * \return */ float alipf_circ_fold(const char **sequences, char *structure, plist **pl); /** * \brief Get a pointer to the base pair probability array * * Accessing the base pair probabilities for a pair (i,j) is achieved by * \verbatim FLT_OR_DBL *pr = export_bppm(); pr_ij = pr[iindx[i]-j]; \endverbatim * * \ingroup consensus_pf_fold * * \see get_iindx() * \return A pointer to the base pair probability array */ FLT_OR_DBL *export_ali_bppm(void); /** * \brief * * \ingroup consensus_pf_fold * */ void free_alipf_arrays(void); /** * \addtogroup consensus_stochbt * @{ * * @} */ /** * \brief Sample a consensus secondary structure from the Boltzmann ensemble according its probability\n * * \ingroup consensus_stochbt * * \param prob to be described (berni) * \return A sampled consensus secondary structure in dot-bracket notation */ char *alipbacktrack(double *prob); /** * \brief Get pointers to (almost) all relavant arrays used in alifold's partition function computation * * \ingroup consensus_fold * * \note To obtain meaningful pointers, call alipf_fold first! * * \see pf_alifold(), alipf_circ_fold() * * \param S_p A pointer to the 'S' array (integer representation of nucleotides) * \param S5_p A pointer to the 'S5' array * \param S3_p A pointer to the 'S3' array * \param a2s_p A pointer to the pair type matrix * \param Ss_p A pointer to the 'Ss' array * \param qb_p A pointer to the QB matrix * \param qm_p A pointer to the QM matrix * \param q1k_p A pointer to the 5' slice of the Q matrix (\f$q1k(k) = Q(1, k)\f$) * \param qln_p A pointer to the 3' slice of the Q matrix (\f$qln(l) = Q(l, n)\f$) * \return Non Zero if everything went fine, 0 otherwise */ int get_alipf_arrays(short ***S_p, short ***S5_p, short ***S3_p, unsigned short ***a2s_p, char ***Ss_p, FLT_OR_DBL **qb_p, FLT_OR_DBL **qm_p, FLT_OR_DBL **q1k_p, FLT_OR_DBL **qln_p, short **pscore); #endif