1 /* Last changed Time-stamp: <2009-02-24 14:49:24 ivo> */
3 Access to alifold Routines
16 #include "part_func.h"
17 #include "fold_vars.h"
23 #include "read_epars.h"
26 #include "RNAalifold_cmdl.h"
29 static const char rcsid[] = "$Id: RNAalifold.c,v 1.23 2009/02/24 14:21:26 ivo Exp $";
31 #define MAX_NUM_NAMES 500
33 PRIVATE char **annote(const char *structure, const char *AS[]);
34 PRIVATE void print_pi(const pair_info pi, FILE *file);
35 PRIVATE void print_aliout(char **AS, plist *pl, int n_seq, char * mfe, FILE *aliout);
36 PRIVATE void mark_endgaps(char *seq, char egap);
37 PRIVATE cpair *make_color_pinfo(char **sequences, plist *pl, int n_seq, plist *mfel);
39 /*--------------------------------------------------------------------------*/
40 int main(int argc, char *argv[]){
41 struct RNAalifold_args_info args_info;
42 unsigned int input_type;
43 char ffname[FILENAME_MAX_LENGTH], gfname[FILENAME_MAX_LENGTH], fname[FILENAME_MAX_LENGTH];
44 char *input_string, *string, *structure, *cstruc, *ParamFile, *ns_bases, *c;
45 int n_seq, i, length, sym, r, noPS;
46 int endgaps, mis, circular, doAlnPS, doColor, doMEA, n_back, eval_energy, pf, istty;
47 double min_en, real_en, sfact, MEAgamma, bppmThreshold, betaScale;
48 char *AS[MAX_NUM_NAMES]; /* aligned sequences */
49 char *names[MAX_NUM_NAMES]; /* sequence names */
50 FILE *clust_file = stdin;
51 pf_paramT *pf_parameters;
54 fname[0] = ffname[0] = gfname[0] = '\0';
55 string = structure = cstruc = ParamFile = ns_bases = NULL;
57 endgaps = mis = pf = circular = doAlnPS = doColor = n_back = eval_energy = oldAliEn = doMEA = ribo = noPS = 0;
66 set_model_details(&md);
69 #############################################
70 # check the command line prameters
71 #############################################
73 if(RNAalifold_cmdline_parser (argc, argv, &args_info) != 0) exit(1);
75 if(args_info.temp_given) temperature = args_info.temp_arg;
76 /* structure constraint */
77 if(args_info.constraint_given) fold_constrained=1;
78 /* do not take special tetra loop energies into account */
79 if(args_info.noTetra_given) md.special_hp = tetra_loop=0;
80 /* set dangle model */
81 if(args_info.dangles_given){
82 if((args_info.dangles_arg != 0) && (args_info.dangles_arg != 2))
83 warn_user("required dangle model not implemented, falling back to default dangles=2");
85 md.dangles = dangles=args_info.dangles_arg;
87 /* do not allow weak pairs */
88 if(args_info.noLP_given) md.noLP = noLonelyPairs = 1;
89 /* do not allow wobble pairs (GU) */
90 if(args_info.noGU_given) md.noGU = noGU = 1;
91 /* do not allow weak closing pairs (AU,GU) */
92 if(args_info.noClosingGU_given) md.noGUclosure = no_closingGU = 1;
93 /* gquadruplex support */
94 if(args_info.gquad_given) md.gquad = gquad = 1;
95 /* do not convert DNA nucleotide "T" to appropriate RNA "U" */
96 /* set energy model */
97 if(args_info.energyModel_given) energy_set = args_info.energyModel_arg;
98 /* take another energy parameter set */
99 if(args_info.paramFile_given) ParamFile = strdup(args_info.paramFile_arg);
100 /* Allow other pairs in addition to the usual AU,GC,and GU pairs */
101 if(args_info.nsp_given) ns_bases = strdup(args_info.nsp_arg);
102 /* set pf scaling factor */
103 if(args_info.pfScale_given) sfact = args_info.pfScale_arg;
104 /* assume RNA sequence to be circular */
105 if(args_info.circ_given) circular=1;
106 /* do not produce postscript output */
107 if(args_info.noPS_given) noPS = 1;
108 /* partition function settings */
109 if(args_info.partfunc_given){
111 if(args_info.partfunc_arg != -1)
112 do_backtrack = args_info.partfunc_arg;
114 /* MEA (maximum expected accuracy) settings */
115 if(args_info.MEA_given){
117 if(args_info.MEA_arg != -1)
118 MEAgamma = args_info.MEA_arg;
120 if(args_info.betaScale_given) betaScale = args_info.betaScale_arg;
121 /* set the bppm threshold for the dotplot */
122 if(args_info.bppmThreshold_given)
123 bppmThreshold = MIN2(1., MAX2(0.,args_info.bppmThreshold_arg));
125 if(args_info.cfactor_given) cv_fact = args_info.cfactor_arg;
127 if(args_info.nfactor_given) nc_fact = args_info.nfactor_arg;
128 if(args_info.endgaps_given) endgaps = 1;
129 if(args_info.mis_given) mis = 1;
130 if(args_info.color_given) doColor=1;
131 if(args_info.aln_given) doAlnPS=1;
132 if(args_info.old_given) oldAliEn = 1;
133 if(args_info.stochBT_given){
134 n_back = args_info.stochBT_arg;
139 if(args_info.stochBT_en_given){
140 n_back = args_info.stochBT_en_arg;
146 if(args_info.ribosum_file_given){
147 RibosumFile = strdup(args_info.ribosum_file_arg);
150 if(args_info.ribosum_scoring_given){
154 /* alignment file name given as unnamed option? */
155 if(args_info.inputs_num == 1){
156 clust_file = fopen(args_info.inputs[0], "r");
157 if (clust_file == NULL) {
158 fprintf(stderr, "can't open %s\n", args_info.inputs[0]);
162 /* free allocated memory of command line data structure */
163 RNAalifold_cmdline_parser_free (&args_info);
166 #############################################
168 #############################################
170 if(circular && gquad){
171 nrerror("G-Quadruplex support is currently not available for circular RNA structures");
176 if (circular && noLonelyPairs)
177 warn_user("depending on the origin of the circular sequence, "
178 "some structures may be missed when using -noLP\n"
179 "Try rotating your sequence a few times\n");
181 if (ParamFile != NULL) read_parameter_file(ParamFile);
183 if (ns_bases != NULL) {
184 nonstandards = space(33);
192 nonstandards[i++]=*c++;
193 nonstandards[i++]=*c;
194 if ((sym)&&(*c!=*(c-1))) {
195 nonstandards[i++]=*c;
196 nonstandards[i++]=*(c-1);
203 istty = isatty(fileno(stdout))&&isatty(fileno(stdin));
206 ########################################################
207 # handle user input from 'stdin' if necessary
208 ########################################################
210 if(fold_constrained){
212 print_tty_constraint_full();
213 print_tty_input_seq_str("");
215 input_type = get_input_line(&input_string, VRNA_INPUT_NOSKIP_COMMENTS);
216 if(input_type & VRNA_INPUT_QUIT){ return 0;}
217 else if((input_type & VRNA_INPUT_MISC) && (strlen(input_string) > 0)){
218 cstruc = strdup(input_string);
221 else warn_user("constraints missing");
224 if (istty && (clust_file == stdin))
225 print_tty_input_seq_str("Input aligned sequences in clustalw or stockholm format\n(enter a line starting with \"//\" to indicate the end of your input)");
227 n_seq = read_clustal(clust_file, AS, names);
228 if (n_seq==0) nrerror("no sequences found");
230 if (clust_file != stdin) fclose(clust_file);
232 ########################################################
233 # done with 'stdin' handling, now init everything properly
234 ########################################################
237 length = (int) strlen(AS[0]);
238 structure = (char *)space((unsigned) length+1);
240 if(fold_constrained && cstruc != NULL)
241 strncpy(structure, cstruc, length);
244 for (i=0; i<n_seq; i++) mark_endgaps(AS[i], '~');
247 ########################################################
248 # begin actual calculations
249 ########################################################
255 min_en = circalifold((const char **)AS, structure);
256 for (i=0; AS[i]!=NULL; i++)
257 s += energy_of_circ_structure(AS[i], structure, -1);
260 float *ens = (float *)space(2*sizeof(float));
261 min_en = alifold((const char **)AS, structure);
263 energy_of_ali_gquad_structure((const char **)AS, structure, n_seq, ens);
265 energy_of_alistruct((const char **)AS, structure, n_seq, ens);
271 string = (mis) ? consens_mis((const char **) AS) : consensus((const char **) AS);
272 printf("%s\n%s", string, structure);
275 printf("\n minimum free energy = %6.2f kcal/mol (%6.2f + %6.2f)\n",
276 min_en, real_en, min_en - real_en);
278 printf(" (%6.2f = %6.2f + %6.2f) \n", min_en, real_en, min_en-real_en );
280 strcpy(ffname, "alirna.ps");
281 strcpy(gfname, "alirna.g");
285 A = annote(structure, (const char**) AS);
289 (void) PS_rna_plot_a_gquad(string, structure, ffname, A[0], A[1]);
291 (void) PS_rna_plot_a_gquad(string, structure, ffname, NULL, A[1]);
294 (void) PS_rna_plot_a(string, structure, ffname, A[0], A[1]);
296 (void) PS_rna_plot_a(string, structure, ffname, NULL, A[1]);
298 free(A[0]); free(A[1]); free(A);
301 PS_color_aln(structure, "aln.ps", (const char const **) AS, (const char const **) names);
303 /* free mfe arrays */
304 free_alifold_arrays();
310 mfe_struc = strdup(structure);
312 kT = (betaScale*((temperature+K0)*GASCONST))/1000.; /* in Kcal */
313 pf_scale = exp(-(sfact*min_en)/kT/length);
314 if (length>2000) fprintf(stderr, "scaling factor %f\n", pf_scale);
317 if (cstruc!=NULL) strncpy(structure, cstruc, length+1);
319 pf_parameters = get_boltzmann_factors_ali(n_seq, temperature, betaScale, md, pf_scale);
320 energy = alipf_fold_par((const char **)AS, structure, NULL, pf_parameters, do_backtrack, fold_constrained, circular);
323 /*stochastic sampling*/
324 for (i=0; i<n_back; i++) {
327 s = alipbacktrack(&prob);
329 if (eval_energy ) printf("%6g %.2f ",prob, -1*(kT*log(prob)-energy));
336 printf("%s", structure);
337 if (!istty) printf(" [%6.2f]\n", energy);
340 if ((istty)||(!do_backtrack))
341 printf(" free energy of ensemble = %6.2f kcal/mol\n", energy);
342 printf(" frequency of mfe structure in ensemble %g\n",
343 exp((energy-min_en)/kT));
350 FLT_OR_DBL *probs = export_ali_bppm();
353 assign_plist_from_pr(&pl, probs, length, bppmThreshold);
354 assign_plist_from_db(&mfel, mfe_struc, 0.95*0.95);
358 cent = get_centroid_struct_pr(length, &dist, probs);
359 ens=(float *)space(2*sizeof(float));
360 energy_of_alistruct((const char **)AS, cent, n_seq, ens);
361 /*cent_en = energy_of_struct(string, cent);*/ /*ali*/
362 printf("%s %6.2f {%6.2f + %6.2f}\n",cent,ens[0]-ens[1],ens[0],(-1)*ens[1]);
369 assign_plist_from_pr(&pl2, probs, length, 1e-4/(1+MEAgamma));
370 mea = MEA(pl2, structure, MEAgamma);
371 ens = (float *)space(2*sizeof(float));
373 energy_of_alistruct((const char **)AS, structure, n_seq, ens);
375 ens[0] = energy_of_structure(string, structure, 0);
376 printf("%s {%6.2f MEA=%.2f}\n", structure, ens[0], mea);
381 if (fname[0]!='\0') {
382 strcpy(ffname, fname);
383 strcat(ffname, "_ali.out");
384 } else strcpy(ffname, "alifold.out");
385 aliout = fopen(ffname, "w");
387 fprintf(stderr, "can't open %s skipping output\n", ffname);
389 print_aliout(AS, pl, n_seq, mfe_struc, aliout);
392 if (fname[0]!='\0') {
393 strcpy(ffname, fname);
394 strcat(ffname, "_dp.ps");
395 } else strcpy(ffname, "alidot.ps");
396 cp = make_color_pinfo(AS,pl, n_seq, mfel);
397 (void) PS_color_dot_plot(string, cp, ffname);
406 if (cstruc!=NULL) free(cstruc);
407 (void) fflush(stdout);
410 for (i=0; AS[i]; i++) {
411 free(AS[i]); free(names[i]);
416 PRIVATE void mark_endgaps(char *seq, char egap) {
419 for (i=0; i<n && (seq[i]=='-'); i++) {
422 for (i=n-1; i>0 && (seq[i]=='-'); i--) {
427 PRIVATE void print_pi(const pair_info pi, FILE *file) {
428 const char *pname[8] = {"","CG","GC","GU","UG","AU","UA", "--"};
431 /* numbering starts with 1 in output */
432 fprintf(file, "%5d %5d %2d %5.1f%% %7.3f",
433 pi.i, pi.j, pi.bp[0], 100.*pi.p, pi.ent);
435 if (pi.bp[i]) fprintf(file, " %s:%-4d", pname[i], pi.bp[i]);
436 if (!pi.comp) fprintf(file, " +");
440 /*-------------------------------------------------------------------------*/
442 PRIVATE char **annote(const char *structure, const char *AS[]) {
443 /* produce annotation for colored drawings from PS_rna_plot_a() */
444 char *ps, *colorps, **A;
445 int i, n, s, pairings, maxl;
447 char * colorMatrix[6][3] = {
448 {"0.0 1", "0.0 0.6", "0.0 0.2"}, /* red */
449 {"0.16 1","0.16 0.6", "0.16 0.2"}, /* ochre */
450 {"0.32 1","0.32 0.6", "0.32 0.2"}, /* turquoise */
451 {"0.48 1","0.48 0.6", "0.48 0.2"}, /* green */
452 {"0.65 1","0.65 0.6", "0.65 0.2"}, /* blue */
453 {"0.81 1","0.81 0.6", "0.81 0.2"} /* violet */
459 A = (char **) space(sizeof(char *)*2);
460 ps = (char *) space(maxl);
461 colorps = (char *) space(maxl);
462 ptable = make_pair_table(structure);
463 for (i=1; i<=n; i++) {
464 char pps[64], ci='\0', cj='\0';
465 int j, type, pfreq[8] = {0,0,0,0,0,0,0,0}, vi=0, vj=0;
466 if ((j=ptable[i])<i) continue;
467 for (s=0; AS[s]!=NULL; s++) {
468 type = pair[encode_char(AS[s][i-1])][encode_char(AS[s][j-1])];
471 if (AS[s][i-1] != ci) { ci = AS[s][i-1]; vi++;}
472 if (AS[s][j-1] != cj) { cj = AS[s][j-1]; vj++;}
475 for (pairings=0,s=1; s<=7; s++) {
476 if (pfreq[s]) pairings++;
479 if ((maxl - strlen(ps) < 192) || ((maxl - strlen(colorps)) < 64)) {
481 ps = realloc(ps, maxl);
482 colorps = realloc(colorps, maxl);
483 if ((ps==NULL) || (colorps == NULL))
484 nrerror("out of memory in realloc");
487 if (pfreq[0]<=2 && pairings>0) {
488 snprintf(pps, 64, "%d %d %s colorpair\n",
489 i,j, colorMatrix[pairings-1][pfreq[0]]);
490 strcat(colorps, pps);
494 snprintf(pps, 64, "%d %d %d gmark\n", i, j, pfreq[0]);
498 snprintf(pps, 64, "%d cmark\n", i);
502 snprintf(pps, 64, "%d cmark\n", j);
512 /*-------------------------------------------------------------------------*/
515 PRIVATE int compare_pair_info(const void *pi1, const void *pi2) {
518 p1 = (pair_info *)pi1; p2 = (pair_info *)pi2;
519 for (nc1=nc2=0, i=1; i<=6; i++) {
520 if (p1->bp[i]>0) nc1++;
521 if (p2->bp[i]>0) nc2++;
523 /* sort mostly by probability, add
524 epsilon * comp_mutations/(non-compatible+1) to break ties */
525 return (p1->p + 0.01*nc1/(p1->bp[0]+1.)) <
526 (p2->p + 0.01*nc2/(p2->bp[0]+1.)) ? 1 : -1;
529 PRIVATE void print_aliout(char **AS, plist *pl, int n_seq, char * mfe, FILE *aliout) {
530 int i, j, k, n, num_p=0, max_p = 64;
534 for (n=0; pl[n].i>0; n++);
536 max_p = 64; pi = space(max_p*sizeof(pair_info));
537 duck = (double *) space((strlen(mfe)+1)*sizeof(double));
538 ptable = make_pair_table(mfe);
540 for (k=0; k<n; k++) {
542 p = pl[k].p; i=pl[k].i; j = pl[k].j;
543 duck[i] -= p * log(p);
544 duck[j] -= p * log(p);
546 if (p<PMIN) continue;
551 pi[num_p].ent = duck[i]+duck[j]-p*log(p);
552 for (type=0; type<8; type++) pi[num_p].bp[type]=0;
553 for (s=0; s<n_seq; s++) {
555 a=encode_char(toupper(AS[s][i-1]));
556 b=encode_char(toupper(AS[s][j-1]));
558 if ((AS[s][i-1] == '-')||(AS[s][j-1] == '-')) type = 7;
559 if ((AS[s][i-1] == '~')||(AS[s][j-1] == '~')) type = 7;
560 pi[num_p].bp[type]++;
561 pi[num_p].comp = (ptable[i] == j) ? 1:0;
566 pi = xrealloc(pi, max_p * sizeof(pair_info));
571 qsort(pi, num_p, sizeof(pair_info), compare_pair_info );
574 fprintf(aliout, "%d sequence; length of alignment %d\n",
575 n_seq, (int) strlen(AS[0]));
576 fprintf(aliout, "alifold output\n");
578 for (k=0; pi[k].i>0; k++) {
579 pi[k].comp = (ptable[pi[k].i] == pi[k].j) ? 1:0;
580 print_pi(pi[k], aliout);
582 fprintf(aliout, "%s\n", mfe);
588 PRIVATE cpair *make_color_pinfo(char **sequences, plist *pl, int n_seq, plist *mfel) {
589 /* produce info for PS_color_dot_plot */
591 int i, n,s, a, b,z,t,j, c;
593 for (n=0; pl[n].i>0; n++);
595 cp = (cpair *) space(sizeof(cpair)*(n+1));
596 for (i=0; i<n; i++) {
602 for (z=0; z<7; z++) pfreq[z]=0;
603 for (s=0; s<n_seq; s++) {
604 a=encode_char(toupper(sequences[s][cp[c].i-1]));
605 b=encode_char(toupper(sequences[s][cp[c].j-1]));
606 if ((sequences[s][cp[c].j-1]=='~')||(sequences[s][cp[c].i-1] == '~')) continue;
609 for (z=1; z<7; z++) {
613 cp[c].hue = (ncomp-1.0)/6.2; /* hue<6/6.9 (hue=1 == hue=0) */
614 cp[c].sat = 1 - MIN2( 1.0, (float) (pfreq[0]*2. /*pi[i].bp[0]*/ /(n_seq)));
618 for (t=0; mfel[t].i > 0; t++) {
620 for (j=0; j<c; j++) {
621 if ((cp[j].i==mfel[t].i)&&(cp[j].j==mfel[t].j)) {
628 fprintf(stderr,"mfe base pair with very low prob in pf: %d %d\n",mfel[t].i,mfel[t].j);
629 cp = (cpair *) realloc(cp,sizeof(cpair)*(c+1));