1 #ifndef __VIENNA_RNA_PACKAGE_UTILS_H__
2 #define __VIENNA_RNA_PACKAGE_UTILS_H__
6 * \brief Various utility- and helper-functions used throughout the Vienna RNA package
10 * Output flag of \ref get_input_line(): "An ERROR has occured, maybe EOF"
12 #define VRNA_INPUT_ERROR 1U
14 * Output flag of \ref get_input_line(): "the user requested quitting the program"
16 #define VRNA_INPUT_QUIT 2U
18 * Output flag of \ref get_input_line(): "something was read"
20 #define VRNA_INPUT_MISC 4U
22 /** Input/Output flag of \ref get_input_line():\n
23 * if used as input option this tells get_input_line() that the data to be read should comply
24 * with the FASTA format
26 * the function will return this flag if a fasta header was read
28 #define VRNA_INPUT_FASTA_HEADER 8U
30 /** Input flag for get_input_line():\n
31 * Tell get_input_line() that we assume to read a nucleotide sequence
34 #define VRNA_INPUT_SEQUENCE 16U
36 /** Input flag for get_input_line():\n
37 * Tell get_input_line() that we assume to read a structure constraint
40 #define VRNA_INPUT_CONSTRAINT 32U
43 * Input switch for \ref get_input_line():
44 * "do not trunkate the line by eliminating white spaces at end of line"
46 #define VRNA_INPUT_NO_TRUNCATION 256U
49 * Input switch for read_record(): "do fill rest array"
51 #define VRNA_INPUT_NO_REST 512U
54 * Input switch for read_record(): "never allow data to span more than one line"
56 #define VRNA_INPUT_NO_SPAN 1024U
59 * Input switch for read_record(): "do not skip empty lines"
61 #define VRNA_INPUT_NOSKIP_BLANK_LINES 2048U
64 * Output flag for read_record(): "read an empty line"
66 #define VRNA_INPUT_BLANK_LINE 4096U
69 * Input switch for \ref get_input_line(): "do not skip comment lines"
71 #define VRNA_INPUT_NOSKIP_COMMENTS 128U
74 * Output flag for read_record(): "read a comment"
76 #define VRNA_INPUT_COMMENT 8192U
82 * pipe sign '|' switch for structure constraints (paired with another base)
84 #define VRNA_CONSTRAINT_PIPE 1U
86 * dot '.' switch for structure constraints (no constraint at all)
88 #define VRNA_CONSTRAINT_DOT 2U
90 * 'x' switch for structure constraint (base must not pair)
92 #define VRNA_CONSTRAINT_X 4U
94 * angle brackets '<', '>' switch for structure constraint (paired downstream/upstream)
96 #define VRNA_CONSTRAINT_ANG_BRACK 8U
98 * round brackets '(',')' switch for structure constraint (base i pairs base j)
100 #define VRNA_CONSTRAINT_RND_BRACK 16U
102 * constraint may span over several lines
104 #define VRNA_CONSTRAINT_MULTILINE 32U
106 * do not print the header information line
108 #define VRNA_CONSTRAINT_NO_HEADER 64U
110 * placeholder for all constraining characters
112 #define VRNA_CONSTRAINT_ALL 128U
114 * '+' switch for structure constraint (base is involved in a gquad)
116 #define VRNA_CONSTRAINT_G 256U
121 * Tell a function that an input is assumed to span several lines if used as input-option
122 * A function might also be returning this state telling that it has read data from
125 * \see extract_record_rest_structure(), read_record(), getConstraint()
128 #define VRNA_OPTION_MULTILINE 32U
132 * Get the minimum of two comparable values
134 #define MIN2(A, B) ((A) < (B) ? (A) : (B))
136 * Get the maximum of two comparable values
138 #define MAX2(A, B) ((A) > (B) ? (A) : (B))
140 * Get the minimum of three comparable values
142 #define MIN3(A, B, C) (MIN2( (MIN2((A),(B))) ,(C)))
144 * Get the maximum of three comparable values
146 #define MAX3(A, B, C) (MAX2( (MAX2((A),(B))) ,(C)))
150 * Stringify a macro after expansion
152 #define XSTR(s) STR(s)
154 * Stringify a macro argument
158 #ifndef FILENAME_MAX_LENGTH
160 * \brief Maximum length of filenames that are generated by our programs
162 * This definition should be used throughout the complete ViennaRNA package
163 * wherever a static array holding filenames of output files is declared.
165 #define FILENAME_MAX_LENGTH 80
167 * \brief Maximum length of id taken from fasta header for filename generation
169 * this has to be smaller than FILENAME_MAX_LENGTH since in most cases,
170 * some suffix will be appended to the ID
172 #define FILENAME_ID_LENGTH 42
179 char *strdup(const char *s);
183 /* use dmalloc library to check for memory management bugs */
185 #define space(S) calloc(1,(S))
189 * \brief Allocate space safely
191 * \param size The size of the memory to be allocated in bytes
192 * \return A pointer to the allocated memory
194 /*@only@*/ /*@notnull@*/
195 void *space(unsigned size) /*@ensures MaxSet(result) == (size-1);@*/;
198 * \brief Reallocate space safely
200 * \param p A pointer to the memory region to be reallocated
201 * \param size The size of the memory to be allocated in bytes
202 * \return A pointer to the newly allocated memory
204 /*@only@*/ /*@notnull@*/
205 void *xrealloc(/*@null@*/ /*@only@*/ /*@out@*/ /*@returned@*/ void *p,
206 unsigned size) /*@modifies *p @*/ /*@ensures MaxSet(result) == (size-1) @*/;
210 * \brief Die with an error message
213 * \param message The error message to be printed before exiting with 'FAILURE'
216 void nrerror(const char message[]);
219 * \brief Print a warning message
221 * Print a warning message to \e stderr
223 * \param message The warning message
225 void warn_user(const char message[]);
228 * \brief Make random number seeds
230 void init_rand(void);
233 * \brief Current 48 bit random number
235 * This variable is used by urn(). These should be set to some
236 * random number seeds before the first call to urn().
240 extern unsigned short xsubi[3];
243 * \brief get a random number from [0..1]
245 * \note Usually implemented by calling \e erand48().
246 * \return A random number in range [0..1]
251 * \brief Generates a pseudo random integer in a specified range
253 * \param from The first number in range
254 * \param to The last number in range
255 * \return A pseudo random number in range [from, to]
257 int int_urn(int from, int to);
259 void filecopy(FILE *from, FILE *to); /* inefficient `cp' */
262 * \brief Get a timestamp
264 * Returns a string containing the current date in the format
265 * \verbatim Fri Mar 19 21:10:57 1993\endverbatim
267 * \return A string containing the timestamp
270 char *time_stamp(void);
273 * \brief Create a random string using characters from a specified symbol set
275 * \param l The length of the sequence
276 * \param symbols The symbol set
277 * \return A random string of length 'l' containing characters from the symbolset
279 /*@only@*/ /*@notnull@*/
280 char *random_string(int l, const char symbols[]);
283 * \brief Calculate hamming distance between two sequences
285 * Calculate the number of positions in which
286 * \param s1 The first sequence
287 * \param s2 The second sequence
288 * \return The hamming distance between s1 and s2
290 int hamming(const char *s1, const char *s2);
293 * \brief Calculate hamming distance between two sequences up to a specified length
295 * This function is similar to hamming() but instead of comparing both sequences
296 * up to their actual length only the first 'n' characters are taken into account
297 * \param s1 The first sequence
298 * \param s2 The second sequence
299 * \return The hamming distance between s1 and s2
301 int hamming_bound(const char *s1, const char *s2, int n);
304 * \brief Read a line of arbitrary length from a stream
306 * Returns a pointer to the resulting string. The necessary memory is
307 * allocated and should be released using \e free() when the string is
310 * \param fp A file pointer to the stream where the function should read from
311 * \return A pointer to the resulting string
313 /*@only@*/ /*@null@*/
314 char *get_line(FILE *fp);
316 int skip_comment_lines(char **line);
319 * Retrieve a line from 'stdin' savely while skipping comment characters and
321 * This function returns the type of input it has read if recognized.
322 * An option argument allows to switch between different reading modes.\n
323 * Currently available options are:\n
324 * #VRNA_INPUT_NOPRINT_COMMENTS, #VRNA_INPUT_NOSKIP_COMMENTS, #VRNA_INPUT_NOELIM_WS_SUFFIX
326 * pass a collection of options as one value like this:
327 * \verbatim get_input_line(string, option_1 | option_2 | option_n) \endverbatim
329 * If the function recognizes the type of input, it will report it in the return
330 * value. It also reports if a user defined 'quit' command (@-sign on 'stdin')
331 * was given. Possible return values are:\n
332 * #VRNA_INPUT_FASTA_HEADER, #VRNA_INPUT_ERROR, #VRNA_INPUT_MISC, #VRNA_INPUT_QUIT
334 * \param string A pointer to the character array that contains the line read
335 * \param options A collection of options for switching the functions behavior
336 * \return A flag with information about what has been read
338 unsigned int get_input_line(char **string,
339 unsigned int options);
341 unsigned int get_multi_input_line(char **string,
342 unsigned int options);
345 * \brief Get a data record from stdin
347 * This function may be used to obtain complete datasets from stdin. A dataset is always
348 * defined to contain at least a sequence. If data on stdin starts with a fasta header,
350 * \verbatim >some header info \endverbatim
351 * then read_record() will assume that the sequence that follows the header may span
352 * over several lines. To disable this behavior and to assign a single line to the argument
353 * 'sequence' one can pass VRNA_INPUT_NO_SPAN in the 'options' argument.
354 * If no fasta header is read in the beginning of a data block, a sequence must not span over
356 * Unless the options #VRNA_INPUT_NOSKIP_COMMENTS or #VRNA_INPUT_NOSKIP_BLANK_LINES are passed,
357 * a sequence may be interrupted by lines starting with a comment character or empty lines.\n
358 * A sequence is regarded as completely read if it was either assumed to not span over multiple
359 * lines, a secondary structure or structure constraint follows the sequence on the next line
360 * or a new header marks the beginning of a new sequence...\n
361 * All lines following the sequence (this includes comments) and not initiating a new dataset are
362 * available through the line-array 'rest'. Here one can usually find the structure constraint or
363 * other information belonging to the current dataset. Filling of 'rest' may be prevented by
364 * passing #VRNA_INPUT_NO_REST to the options argument.\n
366 * \note This function will exit any program with an error message if no sequence could be read!
368 * The main purpose of this function is to be able to easily parse blocks of data from stdin
369 * in the header of a loop where all calculations for the appropriate data is done inside the
370 * loop. The loop may be then left on certain return values, e.g.:
372 char *id, *seq, **rest;
374 while(!(read_record(&id, &seq, &rest, 0) & (VRNA_INPUT_ERROR | VRNA_INPUT_QUIT))){
375 if(id) printf("%s\n", id);
379 printf("%s\n", rest[i]);
382 * In the example above, the while loop will be terminated when read_record() returns either an
383 * error or a user initiated quit request.\n
384 * As long as data is read from stdin, the id is printed if it is available for the current block
385 * of data. The sequence will be printed in any case and if some more lines belong to the current
386 * block of data each line will be printed as well.
388 * \note Do not forget to free the memory occupied by header, sequence and rest!
390 * \param header A pointer which will be set such that it points to the header of the record
391 * \param sequence A pointer which will be set such that it points to the sequence of the record
392 * \param rest A pointer which will be set such that it points to an array of lines which also belong to the record
393 * \param options Some options which may be passed to alter the behavior of the function, use 0 for no options
394 * \return A flag with information about what the function actually did read
396 unsigned int read_record( char **header,
399 unsigned int options);
402 /* \brief Extract a dot-bracket structure string from (multiline)character array
404 * This function extracts a dot-bracket structure string from the 'rest' array as
405 * returned by read_record() and returns it. All occurences of comments within the
406 * 'lines' array will be skipped as long as they do not break the structure string.
407 * If no structure could be read, this function returns NULL.
411 * \param lines The (multiline) character array to be parsed
412 * \param length The assumed length of the dot-bracket string (passing a value < 1 results in no length limit)
413 * \param option Some options which may be passed to alter the behavior of the function, use 0 for no options
414 * \return The dot-bracket string read from lines or NULL
416 char *extract_record_rest_structure(const char **lines,
418 unsigned int option);
421 * \brief Pack secondary secondary structure, 5:1 compression using base 3 encoding
423 * Returns a binary string encoding of the secondary structure using
424 * a 5:1 compression scheme. The string is NULL terminated and can
425 * therefore be used with standard string functions such as strcmp().
426 * Useful for programs that need to keep many structures in memory.
428 * \param struc The secondary structure in dot-bracket notation
429 * \return The binary encoded structure
431 char *pack_structure(const char *struc);
434 * \brief Unpack secondary structure previously packed with pack_structure()
436 * Translate a compressed binary string produced by pack_structure() back into
437 * the familiar dot-bracket notation.
439 * \param packed The binary encoded packed secondary structure
440 * \return The unpacked secondary structure in dot-bracket notation
442 char *unpack_structure(const char *packed);
445 * \brief Create a pair table of a secondary structure
447 * Returns a newly allocated table, such that table[i]=j if (i.j) pair
448 * or 0 if i is unpaired, table[0] contains the length of the structure.
450 * \param structure The secondary structure in dot-bracket notation
451 * \return A pointer to the created pair_table
453 short *make_pair_table(const char *structure);
455 short *make_pair_table_pk(const char *structure);
458 * \brief Get an exact copy of a pair table
460 * \param pt The pair table to be copied
461 * \return A pointer to the copy of 'pt'
463 short *copy_pair_table(const short *pt);
466 ***Pair table for snoop align
470 short *alimake_pair_table(const char *structure);
473 *** returns a newly allocated table, such that: table[i]=j if (i.j) pair or
474 *** 0 if i is unpaired, table[0] contains the length of the structure.
475 *** The special pseudoknotted H/ACA-mRNA structure is taken into account.
477 short *make_pair_table_snoop(const char *structure);
480 * \brief Compute the "base pair" distance between two secondary structures s1 and s2.
482 * The sequences should have the same length.
483 * dist = number of base pairs in one structure but not in the other
484 * same as edit distance with open-pair close-pair as move-set
486 * \param str1 First structure in dot-bracket notation
487 * \param str2 Second structure in dot-bracket notation
488 * \return The base pair distance between str1 and str2
491 int *make_loop_index_pt(short *pt);
494 int bp_distance(const char *str1,
498 * \brief Print a line to \e stdout that asks for an input sequence
500 * There will also be a ruler (scale line) printed that helps orientation of the sequence positions
502 void print_tty_input_seq(void);
505 * \brief Print a line with a user defined string and a ruler to stdout.
507 * (usually this is used to ask for user input)
508 * There will also be a ruler (scale line) printed that helps orientation of the sequence positions
510 * \param s A user defined string that will be printed to stdout
512 void print_tty_input_seq_str(const char *s);
515 * \brief Print structure constraint characters to stdout
516 * (full constraint support)
519 void print_tty_constraint_full(void);
522 * \brief Print structure constraint characters to stdout.
523 * (constraint support is specified by option parameter)
525 * Currently available options are:\n
526 * #VRNA_CONSTRAINT_PIPE (paired with another base)\n
527 * #VRNA_CONSTRAINT_DOT (no constraint at all)\n
528 * #VRNA_CONSTRAINT_X (base must not pair)\n
529 * #VRNA_CONSTRAINT_ANG_BRACK (paired downstream/upstream)\n
530 * #VRNA_CONSTRAINT_RND_BRACK (base i pairs base j)\n
532 * pass a collection of options as one value like this:
533 * \verbatim print_tty_constraint(option_1 | option_2 | option_n) \endverbatim
535 * \param option Option switch that tells which constraint help will be printed
537 void print_tty_constraint(unsigned int option);
540 * \brief Convert a DNA input sequence to RNA alphabet
542 * This function substitudes <i>T</i> and <i>t</i> with <i>U</i> and <i>u</i>, respectively
544 * \param sequence The sequence to be converted
546 void str_DNA2RNA(char *sequence);
549 * \brief Convert an input sequence to uppercase
551 * \param sequence The sequence to be converted
553 void str_uppercase(char *sequence);
556 * \brief Get an index mapper array (iindx) for accessing the energy matrices, e.g. in partition function related functions.
558 * Access of a position "(i,j)" is then accomplished by using \verbatim (i,j) ~ iindx[i]-j \endverbatim
559 * This function is necessary as most of the two-dimensional energy matrices are actually one-dimensional arrays throughout
560 * the ViennaRNAPackage
562 * Consult the implemented code to find out about the mapping formula ;)
565 * \param length The length of the RNA sequence
566 * \return The mapper array
568 int *get_iindx(unsigned int length);
571 * \brief Get an index mapper array (indx) for accessing the energy matrices, e.g. in MFE related functions.
573 * Access of a position "(i,j)" is then accomplished by using \verbatim (i,j) ~ indx[j]+i \endverbatim
574 * This function is necessary as most of the two-dimensional energy matrices are actually one-dimensional arrays throughout
575 * the ViennaRNAPackage
577 * Consult the implemented code to find out about the mapping formula ;)
580 * \param length The length of the RNA sequence
581 * \return The mapper array
584 int *get_indx(unsigned int length);
586 void getConstraint( char **cstruc,
588 unsigned int option);
591 * \brief Insert constraining pair types according to constraint structure string
593 * \see get_indx(), get_iindx()
595 * \param constraint The structure constraint string
596 * \param length The actual length of the sequence (constraint may be shorter)
597 * \param ptype A pointer to the basepair type array
598 * \param min_loop_size The minimal loop size (usually \ref TURN )
599 * \param idx_type Define the access type for base pair type array (0 = indx, 1 = iindx)
601 void constrain_ptypes(const char *constraint,
606 unsigned int idx_type);
608 unsigned int *make_referenceBP_array(short *reference_pt,
611 unsigned int *compute_BPdifferences( short *pt1,