1 /***********************************************
2 * # Copyright 2009-2010. Liu Yongchao
3 * # Contact: Liu Yongchao, School of Computer Engineering,
4 * # Nanyang Technological University.
5 * # Emails: liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
7 * # GPL version 3.0 applies.
9 * ************************************************/
10 #include "SafeVector.h"
19 #define TRACE 0 // 0: NOTRACE 1: TRACE
21 #define endgaps 1 // 1: engap penaties enabled 0: disabled
22 #define PART_FULL_MEMORY 0 //0: LOW MEM OPTION
23 #define REVPART_FULL_MEMORY 0 //0: LOW MEM OPTION
27 #define OS_HUGE_VALL HUGE_VAL
29 #define OS_HUGE_VALL HUGE_VALL
38 char opt; //can be 'P' or 'M'
43 typedef struct sequence {
49 typedef struct alignment {
55 ////////////////////////////////////////////////////////
56 //externs related to scoring matrix and input arguments
57 ///////////////////////////////////////////////////////////
58 extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
59 extern char aminos[26], matrixtype[20], bases[26];
61 extern double sub_matrix[26][26];
62 extern int subst_index[26];
64 extern float TEMPERATURE;
65 extern int MATRIXTYPE;
69 extern argument_decl argument;
71 //////////////////////////////////////////////////////////////////////////////
72 //calculates reverse partition function values based on z matrices
73 //and also simulaneously calculates the propability of each basepair
74 //or aminoacid residue pair i,j
75 //////////////////////////////////////////////////////////////////////////////
77 VF *revers_partf(fasta sequences[2], const double termgapopen,
78 const double termgapextend, long double **Zfm, const double d,
80 // printf("revpart\n");
81 //rest of the declarations
83 long double **Zm = NULL;
84 long double **Ze = NULL;
85 long double **Zf = NULL;
90 double endgapopen, endgapextend;
93 //Init lengths of sequences
94 len0 = strlen(sequences[0].text);
95 len1 = strlen(sequences[1].text);
97 //Safe vector declared
98 VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1));
99 VF & posterior = *posteriorPtr;
100 VF::iterator ptr = posterior.begin();
102 if (TRACE) //open the trace file
103 fo = fopen("revpartdump", "a");
106 endgapopen = termgapopen;
107 endgapextend = termgapextend;
109 //instantiate the z matrix
110 if (REVPART_FULL_MEMORY) {
112 Ze = new long double *[sequences[1].length + 1];
113 Zf = new long double *[sequences[1].length + 1];
114 Zm = new long double *[sequences[1].length + 1];
117 printf("\n\n %e %e\n", d, e);
119 //DYNAMICALLY GROW 2D Zm Zf Ze MARICES (long double)
120 for (i = 0; i <= sequences[1].length; i++) {
121 Ze[i] = new long double[sequences[0].length + 1];
122 Zf[i] = new long double[sequences[0].length + 1];
123 Zm[i] = new long double[sequences[0].length + 1];
126 Zm = new long double *[2];
127 Ze = new long double *[2];
128 Zf = new long double *[2];
129 for (i = 0; i <= 1; i++) {
130 Zm[i] = new long double[sequences[0].length + 1];
131 Ze[i] = new long double[sequences[0].length + 1];
132 Zf[i] = new long double[sequences[0].length + 1];
138 printf("in rev partf---");
142 if (REVPART_FULL_MEMORY) {
143 for (i = 0; i <= len1; i++)
144 for (j = 0; j <= len0; j++) {
151 for (j = 0; j <= len0; j++) {
161 //fill the probability matrix with 0s
162 for (i = 0; i <= len1; i++)
163 for (j = 0; j <= len0; j++)
164 ptr[j * (len1 + 1) + i] = 0;
168 Ze[len1][len0] = Zf[len1][len0] = 0;
169 Zf[len1 - 1][len0] = Zm[len1][len0] * d;
170 Ze[len1][len0 - 1] = Zm[len1][len0] * d;
173 if (REVPART_FULL_MEMORY) {
174 for (i = len1 - 2; i >= 0; i--) {
175 Zf[i][len0] = Zf[i + 1][len0] * e;
180 if (REVPART_FULL_MEMORY) {
181 for (j = len0 - 2; j >= 0; j--) {
182 Ze[len1][j] = Ze[len1][j + 1] * e;
185 for (j = len0 - 2; j >= 0; j--) {
186 Ze[0][j] = Ze[0][j + 1] * e;
191 if (REVPART_FULL_MEMORY) {
194 Ze[len1][len0] = Zf[len1][len0] = 0;
195 Zf[len1 - 1][len0] = Zm[len1][len0] * endgapopen;
196 Ze[len1][len0 - 1] = Zm[len1][len0] * endgapopen;
199 for (i = len1 - 2; i >= 0; i--) {
200 Zf[i][len0] = Zf[i + 1][len0] * endgapextend;
206 for (j = len0 - 2; j >= 0; j--) {
207 Ze[len1][j] = Ze[len1][j + 1] * endgapextend;
213 // Zm(0) be the current row being filled/computed
214 // Zm(1) be the previous row
217 Ze[0][len0] = Zf[0][len0] = 0;
218 Zf[1][len0] = Zm[1][len0] * endgapopen;
219 Ze[0][len0 - 1] = Zm[1][len0] * endgapopen;
222 for (j = len0 - 2; j >= 0; j--) {
223 Ze[0][j] = Ze[0][j + 1] * endgapextend;
228 } //END FULL MEMORY and GAP enablement IF STATEMENT
230 double scorez, zz = 0;
232 for (i = len1 - 1; i >= 0; i--) {
234 for (j = len0 - 1; j >= 0; j--) {
235 Si = subst_index[sequences[1].text[i] - 'A'];
236 Tj = subst_index[sequences[0].text[j] - 'A'];
237 scorez = sub_matrix[Si][Tj];
239 //endgaps modification aug 10
240 double open0, extend0, open1, extend1;
243 extend0 = extend1 = e;
247 //check to see if one of the 2 sequences or both reach the end
251 extend0 = endgapextend;
257 extend1 = endgapextend;
262 if (REVPART_FULL_MEMORY) {
265 Ze[i][j] = Zm[i][j + 1] * open0 + Ze[i][j + 1] * extend0;
266 Zf[i][j] = Zm[i + 1][j] * open1 + Zf[i + 1][j] * extend1;
267 Zm[i][j] = (Zm[i + 1][j + 1] + Zf[i + 1][j + 1]
268 + Ze[i + 1][j + 1]) * scorez;
269 zz = Zm[i][j] + Zf[i][j] + Ze[i][j];
273 //2 ROW zE zF ALGORITHM GOES...:
274 //Ze[1][j] =Zm[i][j + 1] * exp(beta * open0) + Ze[1][j + 1] *exp(beta * extend0);
275 //Zf[1][j] = Zm[i + 1][j] * exp(beta * open1) + Zf[0][j] * exp(beta * extend1);
276 //Zm[i][j] = (Zm[i + 1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) * exp(beta * scorez);
277 //zz = Zm[0][j] + Zf[1][j] + Ze[1][j];
279 //lowmem code for merging probability calculating module
280 //Here we make use of Zm as a 2 row matrix
282 Zf[1][j] = Zm[1][j] * open1 + Zf[0][j] * extend1;
283 Ze[1][j] = Zm[0][j + 1] * open0 + Ze[1][j + 1] * extend0;
284 Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1])
287 tempvar = Zfm[i + 1][j + 1] * Zm[0][j];
288 //divide P(i,j) i.e. pairwise probability by denominator
289 tempvar /= (scorez * Zfm[0][0]);
290 probability = (float) tempvar;
292 //store only noticable probabilities
293 if (probability <= 1 && probability >= 0.001) {
295 //validprob[i + 1][j + 1] = probability;
296 ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability;
298 //lowmem code ends here
304 if (REVPART_FULL_MEMORY == 0) {
305 for (int t = 0; t <= sequences[0].length; t++) {
323 printf("\n\nrM:....\n\n");
324 if (REVPART_FULL_MEMORY) {
325 for (i = 0; i <= len1; i++) {
326 for (j = 0; j <= len0; j++)
327 printf("%.2Le ", Zm[i][j]);
331 printf("\n\nrE:....\n\n");
332 for (i = 0; i <= len1; i++) {
333 for (j = 0; j <= len0; j++)
334 printf("%.2Le ", Ze[i][j]);
339 printf("\n\nrF:....\n\n");
340 for (i = 0; i <= len1; i++) {
341 for (j = 0; j <= len0; j++)
342 printf("%.2Le ", Zf[i][j]);
356 //delete unused memory
358 if (REVPART_FULL_MEMORY) {
359 for (i = 0; i <= len1; i++) {
374 for (i = 0; i <= len1; i++) {
391 return (posteriorPtr);
395 //////////////////////////////////////////////////////////////
396 //forward partition function
397 /////////////////////////////////////////////////////////////
399 long double **partf(fasta sequences[2], const double termgapopen,
400 const double termgapextend, const double d, const double e) {
402 int i, j, len1, len0;
403 long double **Zm = NULL, **Zf = NULL, **Ze = NULL, zz = 0;
404 double endgapopen, endgapextend;
407 endgapopen = termgapopen;
408 endgapextend = termgapextend;
410 //the flag endgaps is set at the #define section
411 if (PART_FULL_MEMORY) {
413 Zf = new long double *[sequences[1].length + 1];
414 Ze = new long double *[sequences[1].length + 1];
415 Zm = new long double *[sequences[1].length + 1];
419 printf("\nPARTF:====\n");
421 //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES
422 for (i = 0; i <= sequences[1].length; i++) {
423 Zf[i] = new long double[sequences[0].length + 1];
424 Ze[i] = new long double[sequences[0].length + 1];
425 Zm[i] = new long double[sequences[0].length + 1];
428 Zm = new long double *[sequences[1].length + 1];
429 Ze = new long double *[2];
430 Zf = new long double *[2];
431 for (i = 0; i <= sequences[1].length; i++) {
432 Zm[i] = new long double[sequences[0].length + 1];
434 Ze[0] = new long double[sequences[0].length + 1];
435 Zf[0] = new long double[sequences[0].length + 1];
436 Ze[1] = new long double[sequences[0].length + 1];
437 Zf[1] = new long double[sequences[0].length + 1];
440 len0 = strlen(sequences[0].text);
441 len1 = strlen(sequences[1].text);
443 if (PART_FULL_MEMORY) {
444 for (i = 0; i <= sequences[1].length; i++)
445 for (j = 0; j <= sequences[0].length; j++) {
451 for (i = 0; i <= len1; i++) {
452 for (j = 0; j <= len0; j++) {
456 for (j = 0; j <= len0; j++) {
469 Zf[0][0] = Ze[0][0] = 0;
470 Zf[1][0] = Zm[0][0] * d;
471 Ze[0][1] = Zm[0][0] * d;
474 if (PART_FULL_MEMORY) {
475 for (i = 2; i <= sequences[1].length; i++) {
476 Zf[i][0] = Zf[i - 1][0] * e;
481 for (j = 2; j <= sequences[0].length; j++) {
482 Ze[0][j] = Ze[0][j - 1] * e;
487 Zf[0][0] = Ze[0][0] = 0;
488 Zf[1][0] = Zm[0][0] * endgapopen;
489 Ze[0][1] = Zm[0][0] * endgapopen;
492 if (PART_FULL_MEMORY) {
493 for (i = 2; i <= sequences[1].length; i++) {
494 Zf[i][0] = Zf[i - 1][0] * endgapextend;
499 for (j = 2; j <= sequences[0].length; j++) {
500 Ze[0][j] = Ze[0][j - 1] * endgapextend;
509 for (i = 1; i <= sequences[1].length; i++) {
511 for (j = 1; j <= sequences[0].length; j++) {
513 Si = subst_index[sequences[1].text[i - 1] - 'A'];
514 Tj = subst_index[sequences[0].text[j - 1] - 'A'];
516 score = sub_matrix[Si][Tj];
518 double open0, extend0, open1, extend1;
521 extend0 = extend1 = e;
524 //check to see if one of the 2 sequences or both reach the end
526 if (i == sequences[1].length) {
528 extend0 = endgapextend;
532 if (j == sequences[0].length) {
534 extend1 = endgapextend;
539 //z computation using open and extend temp vars
540 //open0 is gap open in seq0 and open1 is gap open in seq1
541 //entend0 is gap extend in seq0 and extend1 is gap extend in seq1
543 if (PART_FULL_MEMORY) {
544 Ze[i][j] = Zm[i][j - 1] * open0 + Ze[i][j - 1] * extend0;
546 if (Ze[i][j] >= OS_HUGE_VALL) {
547 printf("ERROR: huge val error for Ze\n");
551 Zf[i][j] = Zm[i - 1][j] * open1 + Zf[i - 1][j] * extend1;
553 if (Zf[i][j] >= OS_HUGE_VALL) {
554 printf("ERROR: huge val error for Zf\n");
558 Zm[i][j] = (Zm[i - 1][j - 1] + Ze[i - 1][j - 1]
559 + Zf[i - 1][j - 1]) * score;
561 if (Zm[i][j] >= OS_HUGE_VALL) {
562 printf("ERROR: huge val error for Zm\n");
566 zz = Zm[i][j] + Ze[i][j] + Zf[i][j];
568 Ze[1][j] = Zm[i][j - 1] * open0 + Ze[1][j - 1] * extend0;
570 if (Ze[1][j] >= OS_HUGE_VALL) {
571 printf("ERROR: huge val error for zE\n");
575 Zf[1][j] = Zm[i - 1][j] * open1 + Zf[0][j] * extend1;
577 if (Zf[1][j] >= OS_HUGE_VALL) {
578 printf("ERROR: huge val error for zF\n");
582 Zm[i][j] = (Zm[i - 1][j - 1] + Ze[0][j - 1] + Zf[0][j - 1])
585 if (Zm[i][j] >= OS_HUGE_VALL) {
586 printf("ERROR: huge val error for zM\n");
590 zz = Zm[i][j] + Ze[1][j] + Zf[1][j];
595 if (!PART_FULL_MEMORY) {
596 for (int t = 0; t <= sequences[0].length; t++) {
610 //store the sum of zm zf ze (m,n)s in zm's 0,0 th position
615 //print the 3 Z matrices namely Zm Zf and Ze
617 printf("\n\nFINAL Zm:\n");
618 for (i = 0; i <= sequences[1].length; i++) {
619 for (j = 0; j <= sequences[0].length; j++)
620 printf("%.2Le ", Zm[i][j]);
624 printf("FINAL Zf \n");
625 for (i = 0; i <= sequences[1].length; i++) {
626 for (j = 0; j <= sequences[0].length; j++)
627 printf("%.2Le ", Zf[i][j]);
631 printf("FINAL Ze \n");
632 for (i = 0; i <= sequences[1].length; i++) {
633 for (j = 0; j <= sequences[0].length; j++)
634 printf("%.2Le ", Ze[i][j]);
638 //end debug dump code
642 if (PART_FULL_MEMORY) {
643 for (i = 0; i <= sequences[1].length; i++) {
659 } //end of forward partition function
661 /////////////////////////////////////////////////////////////////////////////////////////
662 //entry point (was the main function) , returns the posterior probability safe vector
663 ////////////////////////////////////////////////////////////////////////////////////////
664 VF *ComputePostProbs(int a, int b, string seq1, string seq2) {
665 //printf("probamod\n");
666 double gap_open = -22, gap_ext = -1, beta = 0.2;//T = 5, beta = 1/T = 0.2, by default
669 double termgapopen = 1.0f; //exp(0)
670 double termgapextend = 1.0f; //exp(0)
672 //initialize the sequence structure
675 sequences[0].length = strlen((char *) seq1.c_str());
676 sequences[0].text = (char *) seq1.c_str();
677 sequences[0].title = new char[10];
678 strcpy(sequences[0].title, "seq0");
679 sequences[1].length = strlen((char *) seq2.c_str());
680 sequences[1].text = (char *) seq2.c_str();
681 sequences[1].title = new char[10];
682 strcpy(sequences[1].title, "seq1");
687 printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
688 sequences[0].text, b, sequences[1].length, sequences[1].text);
689 printf("after init\n");
691 FILE *dump1 = fopen("dump1", "a");
692 fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
693 sequences[0].text, b, sequences[1].length, sequences[1].text);
697 gap_open = argument.gapopen;
698 gap_ext = argument.gapext;
699 beta = argument.beta;
701 stock_loop = argument.N;
702 le = argument.matrix;
704 //compute the values of exp(beta * ?)
705 termgapopen = exp(beta * 0.0);
706 termgapextend = exp(beta * 0.0);
707 gap_open = exp(beta * gap_open);
708 gap_ext = exp(beta * gap_ext);
711 printf("%f %f %f %d\n", gap_open, gap_ext, beta, le);
713 //call for calculating the posterior probabilities
714 // 1. call partition function partf
715 // 2. calculate revpartition using revers_parf
716 // 3. calculate probabilities
717 /// MODIFICATION... POPULATE SAFE VECTOR
721 MAT1 = partf(sequences, termgapopen, termgapextend, gap_open, gap_ext);
723 return revers_partf(sequences, termgapopen, termgapextend, MAT1, gap_open,
728 //end of posterior probability module