Add GLprobs and MSAprobs to binaries
[jabaws.git] / binaries / src / GLProbs-1.0 / MSAPartProbs.cpp
1 /***********************************************
2  * # Copyright 2009-2010. Liu Yongchao
3  * # Contact: Liu Yongchao, School of Computer Engineering,
4  * #                     Nanyang Technological University.
5  * # Emails:     liuy0039@ntu.edu.sg; nkcslyc@hotmail.com
6  * #
7  * # GPL version 3.0 applies.
8  * #
9  * ************************************************/
10 #include "SafeVector.h"
11 #include <iostream>
12 #include <string.h>
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <math.h>
16 #include <time.h>
17 #include <ctype.h>
18 #include <assert.h>
19 #include "MultiSequence.h"
20 #include "ScoreType.h"
21
22 #define  TRACE 0                // 0: NOTRACE 1: TRACE
23 //proba like settings
24 #define  endgaps 1              // 1: engap penaties enabled 0: disabled
25 #define  PART_FULL_MEMORY 0     //0: LOW MEM OPTION
26 #define  REVPART_FULL_MEMORY 0  //0: LOW MEM OPTION
27 using namespace std;
28
29 #ifdef _WIN32
30 #define OS_HUGE_VALL    HUGE_VAL        
31 #else
32 #define OS_HUGE_VALL    HUGE_VALL
33 #endif
34
35 typedef struct {
36         char input[30];
37         int matrix;
38         int N;
39         float T;
40         float beta;
41         char opt;                       //can be 'P' or 'M'
42         float gapopen;
43         float gapext;
44 } argument_decl;
45
46 typedef struct sequence {
47         char *title;
48         char *text;
49         int length;
50 } fasta;
51
52 typedef struct alignment {
53         char *title;
54         char *text;
55         int length;
56 } align;
57
58 ////////////////////////////////////////////////////////
59 //externs related to scoring matrix and input arguments
60 ///////////////////////////////////////////////////////////
61 extern float g_gap_open1, g_gap_open2, g_gap_ext1, g_gap_ext2;
62 extern char aminos[26], matrixtype[20], bases[26];
63
64 extern double sub_matrix[26][26];
65 extern double normalized_matrix[26][26]; // add by YE Yongtao
66 extern int subst_index[26];
67
68 extern float TEMPERATURE;
69 extern int MATRIXTYPE;
70
71 extern float GAPOPEN;
72 extern float GAPEXT;
73 extern argument_decl argument;
74
75 //////////////////////////////////////////////////////////////////////////////
76 //calculates reverse partition function values based on z matrices
77 //and also simulaneously calculates the propability of each basepair
78 //or aminoacid residue pair i,j
79 //////////////////////////////////////////////////////////////////////////////
80
81 VF *revers_partf(fasta sequences[2], const double termgapopen,
82                 const double termgapextend, long double **Zfm, const double d,
83                 const double e) {
84         // printf("revpart\n");
85         //rest of the declarations
86         int i, j;
87         long double **Zm = NULL;
88         long double **Ze = NULL;
89         long double **Zf = NULL;
90         int len0, len1;
91         float probability;
92         long double tempvar;
93         int Si, Tj;
94         double endgapopen, endgapextend;
95         FILE *fo;
96
97         //Init lengths of sequences
98         len0 = strlen(sequences[0].text);
99         len1 = strlen(sequences[1].text);
100
101         //Safe vector declared
102         VF *posteriorPtr = new VF((len0 + 1) * (len1 + 1));
103         VF & posterior = *posteriorPtr;
104         VF::iterator ptr = posterior.begin();
105
106         if (TRACE)                      //open the trace file
107                 fo = fopen("revpartdump", "a");
108
109         //default:
110         endgapopen = termgapopen;
111         endgapextend = termgapextend;
112
113         //instantiate the z matrix
114         if (REVPART_FULL_MEMORY) {
115
116                 Ze = new long double *[sequences[1].length + 1];
117                 Zf = new long double *[sequences[1].length + 1];
118                 Zm = new long double *[sequences[1].length + 1];
119
120                 if (TRACE)
121                         printf("\n\n %e %e\n", d, e);
122
123                 //DYNAMICALLY GROW 2D Zm Zf Ze MARICES (long double)
124                 for (i = 0; i <= sequences[1].length; i++) {
125                         Ze[i] = new long double[sequences[0].length + 1];
126                         Zf[i] = new long double[sequences[0].length + 1];
127                         Zm[i] = new long double[sequences[0].length + 1];
128                 }
129         } else {
130                 Zm = new long double *[2];
131                 Ze = new long double *[2];
132                 Zf = new long double *[2];
133                 for (i = 0; i <= 1; i++) {
134                         Zm[i] = new long double[sequences[0].length + 1];
135                         Ze[i] = new long double[sequences[0].length + 1];
136                         Zf[i] = new long double[sequences[0].length + 1];
137                 }
138
139         }
140
141         if (TRACE) {
142                 printf("in rev partf---");
143                 printf("\n\n");
144         }
145
146         if (REVPART_FULL_MEMORY) {
147                 for (i = 0; i <= len1; i++)
148                         for (j = 0; j <= len0; j++) {
149                                 Zm[i][j] = 0.0;
150                                 Zf[i][j] = 0.0;
151                                 Ze[i][j] = 0.0;
152                         }
153         } else {
154
155                 for (j = 0; j <= len0; j++) {
156                         Zm[0][j] = 0;
157                         Zf[0][j] = 0;
158                         Ze[0][j] = 0;
159                         Zf[1][j] = 0;
160                         Ze[1][j] = 0;
161                         Zm[1][j] = 0;
162                 }
163         }
164
165         //fill the probability matrix with 0s
166         for (i = 0; i <= len1; i++)
167                 for (j = 0; j <= len0; j++)
168                         ptr[j * (len1 + 1) + i] = 0;
169
170         if (endgaps == 0) {
171                 Zm[len1][len0] = 1;
172                 Ze[len1][len0] = Zf[len1][len0] = 0;
173                 Zf[len1 - 1][len0] = Zm[len1][len0] * d;
174                 Ze[len1][len0 - 1] = Zm[len1][len0] * d;
175
176                 //>=2ND ROW INIT
177                 if (REVPART_FULL_MEMORY) {
178                         for (i = len1 - 2; i >= 0; i--) {
179                                 Zf[i][len0] = Zf[i + 1][len0] * e;
180                         }
181                 }
182
183                 //>=2ND COL INIT
184                 if (REVPART_FULL_MEMORY) {
185                         for (j = len0 - 2; j >= 0; j--) {
186                                 Ze[len1][j] = Ze[len1][j + 1] * e;
187                         }
188                 } else {
189                         for (j = len0 - 2; j >= 0; j--) {
190                                 Ze[0][j] = Ze[0][j + 1] * e;
191                         }
192                 }
193         } else {
194
195                 if (REVPART_FULL_MEMORY) {
196
197                         Zm[len1][len0] = 1;
198                         Ze[len1][len0] = Zf[len1][len0] = 0;
199                         Zf[len1 - 1][len0] = Zm[len1][len0] * endgapopen;
200                         Ze[len1][len0 - 1] = Zm[len1][len0] * endgapopen;
201
202                         //>=2ND ROW INIT
203                         for (i = len1 - 2; i >= 0; i--) {
204                                 Zf[i][len0] = Zf[i + 1][len0] * endgapextend;
205                         }
206
207                         //M Iy= d+j*e
208
209                         //>=2ND COL INIT
210                         for (j = len0 - 2; j >= 0; j--) {
211                                 Ze[len1][j] = Ze[len1][j + 1] * endgapextend;
212                         }
213
214                 } else {
215                         //in Zm
216                         //let:
217                         //  Zm(0) be the current row being filled/computed
218                         //  Zm(1) be the previous row
219
220                         Zm[1][len0] = 1;
221                         Ze[0][len0] = Zf[0][len0] = 0;
222                         Zf[1][len0] = Zm[1][len0] * endgapopen;
223                         Ze[0][len0 - 1] = Zm[1][len0] * endgapopen;
224
225                         //>=2ND COL INIT
226                         for (j = len0 - 2; j >= 0; j--) {
227                                 Ze[0][j] = Ze[0][j + 1] * endgapextend;
228                         }
229
230                 }                       //END ELSE
231
232         }                               //END FULL MEMORY and GAP enablement IF STATEMENT
233
234         double scorez, zz = 0;
235
236         for (i = len1 - 1; i >= 0; i--) {
237
238                 for (j = len0 - 1; j >= 0; j--) {
239                         Si = subst_index[sequences[1].text[i] - 'A'];
240                         Tj = subst_index[sequences[0].text[j] - 'A'];
241                         scorez = sub_matrix[Si][Tj];
242
243                         //endgaps modification aug 10
244                         double open0, extend0, open1, extend1;
245
246                         open0 = open1 = d;
247                         extend0 = extend1 = e;
248
249                         if (endgaps == 1) {
250
251                                 //check to see if one of the 2 sequences or both reach the end
252
253                                 if (i == 0) {
254                                         open0 = endgapopen;
255                                         extend0 = endgapextend;
256
257                                 }
258
259                                 if (j == 0) {
260                                         open1 = endgapopen;
261                                         extend1 = endgapextend;
262                                 }
263
264                         }
265
266                         if (REVPART_FULL_MEMORY) {
267                                 //z computation
268
269                                 Ze[i][j] = Zm[i][j + 1] * open0 + Ze[i][j + 1] * extend0;
270                                 Zf[i][j] = Zm[i + 1][j] * open1 + Zf[i + 1][j] * extend1;
271                                 Zm[i][j] = (Zm[i + 1][j + 1] + Zf[i + 1][j + 1]
272                                                 + Ze[i + 1][j + 1]) * scorez;
273                                 zz = Zm[i][j] + Zf[i][j] + Ze[i][j];
274
275                         } else {
276
277                                 //2 ROW zE zF ALGORITHM GOES...:
278                                 //Ze[1][j] =Zm[i][j + 1] * exp(beta * open0) + Ze[1][j + 1] *exp(beta * extend0);
279                                 //Zf[1][j] = Zm[i + 1][j] * exp(beta * open1) + Zf[0][j] * exp(beta * extend1);
280                                 //Zm[i][j] = (Zm[i + 1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1]) * exp(beta * scorez);
281                                 //zz = Zm[0][j] + Zf[1][j] + Ze[1][j];
282
283                                 //lowmem code for merging probability calculating module
284                                 //Here we make use of Zm as a 2 row matrix
285
286                                 Zf[1][j] = Zm[1][j] * open1 + Zf[0][j] * extend1;
287                                 Ze[1][j] = Zm[0][j + 1] * open0 + Ze[1][j + 1] * extend0;
288                                 Zm[0][j] = (Zm[1][j + 1] + Zf[0][j + 1] + Ze[0][j + 1])
289                                                 * scorez;
290
291                                 tempvar = Zfm[i + 1][j + 1] * Zm[0][j];
292                                 //divide P(i,j) i.e. pairwise probability by denominator
293                                 tempvar /= (scorez * Zfm[0][0]);
294                                 probability = (float) tempvar;
295
296                                 //store only noticable probabilities
297                                 //if (probability <= 1 && probability >= 0.001) {
298                                         //algorithm goes...
299                                         //validprob[i + 1][j + 1] = probability;
300                                         ptr[(j + 1) * (len1 + 1) + (i + 1)] = probability;
301                                 //}
302                                 //lowmem code ends here
303
304                         }
305
306                 }                       //end of for
307
308                 if (REVPART_FULL_MEMORY == 0) {
309                         for (int t = 0; t <= sequences[0].length; t++) {
310                                 Ze[0][t] = Ze[1][t];
311                                 Ze[1][t] = 0;
312
313                                 Zf[0][t] = Zf[1][t];
314                                 Zf[1][t] = 0;
315
316                                 Zm[1][t] = Zm[0][t];
317                                 Zm[0][t] = 0;
318
319                         }
320                         Zf[0][len0] = 1;
321
322                 }
323
324         }                               //end of for
325
326         if (TRACE) {
327                 printf("\n\nrM:....\n\n");
328                 if (REVPART_FULL_MEMORY) {
329                         for (i = 0; i <= len1; i++) {
330                                 for (j = 0; j <= len0; j++)
331                                         printf("%.2Le ", Zm[i][j]);
332                                 printf("\n");
333                         }
334
335                         printf("\n\nrE:....\n\n");
336                         for (i = 0; i <= len1; i++) {
337                                 for (j = 0; j <= len0; j++)
338                                         printf("%.2Le ", Ze[i][j]);
339                                 printf("\n");
340
341                         }
342
343                         printf("\n\nrF:....\n\n");
344                         for (i = 0; i <= len1; i++) {
345                                 for (j = 0; j <= len0; j++)
346                                         printf("%.2Le ", Zf[i][j]);
347                                 printf("\n");
348
349                         }
350
351                 }
352
353         }
354
355         if (TRACE) {
356                 fprintf(fo, "\n");
357                 fclose(fo);
358         }
359
360         //delete unused memory
361
362         if (REVPART_FULL_MEMORY) {
363                 for (i = 0; i <= len1; i++) {
364                         delete (Zm[i]);
365                         delete (Zf[i]);
366                         delete (Ze[i]);
367                 }
368         } else {
369                 delete (Zf[0]);
370                 delete (Ze[0]);
371                 delete (Zm[0]);
372
373                 delete (Zm[1]);
374                 delete (Zf[1]);
375                 delete (Ze[1]);
376         }
377
378         for (i = 0; i <= len1; i++) {
379                 delete (Zfm[i]);
380         }
381
382         if (Zf != NULL)
383                 delete (Zf);
384
385         if (Ze != NULL)
386                 delete (Ze);
387
388         if (Zm != NULL)
389                 delete (Zm);
390
391         if (Zfm != NULL)
392                 delete (Zfm);
393
394         posterior[0] = 0;
395         return (posteriorPtr);
396
397 }
398
399 //////////////////////////////////////////////////////////////
400 //forward partition function
401 /////////////////////////////////////////////////////////////
402
403 long double **partf(fasta sequences[2], const double termgapopen,
404                 const double termgapextend, const double d, const double e) {
405         //printf("partf\n");
406         int i, j, len1, len0;
407         long double **Zm = NULL, **Zf = NULL, **Ze = NULL, zz = 0;
408         double endgapopen, endgapextend;
409
410         //default:
411         endgapopen = termgapopen;
412         endgapextend = termgapextend;
413
414         //the flag endgaps is set at the #define section
415         if (PART_FULL_MEMORY) {
416
417                 Zf = new long double *[sequences[1].length + 1];
418                 Ze = new long double *[sequences[1].length + 1];
419                 Zm = new long double *[sequences[1].length + 1];
420
421                 //comment
422                 if (TRACE)
423                         printf("\nPARTF:====\n");
424
425                 //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES
426                 for (i = 0; i <= sequences[1].length; i++) {
427                         Zf[i] = new long double[sequences[0].length + 1];
428                         Ze[i] = new long double[sequences[0].length + 1];
429                         Zm[i] = new long double[sequences[0].length + 1];
430                 }
431         } else {
432                 Zm = new long double *[sequences[1].length + 1];
433                 Ze = new long double *[2];
434                 Zf = new long double *[2];
435                 for (i = 0; i <= sequences[1].length; i++) {
436                         Zm[i] = new long double[sequences[0].length + 1];
437                 }
438                 Ze[0] = new long double[sequences[0].length + 1];
439                 Zf[0] = new long double[sequences[0].length + 1];
440                 Ze[1] = new long double[sequences[0].length + 1];
441                 Zf[1] = new long double[sequences[0].length + 1];
442         }
443
444         len0 = strlen(sequences[0].text);
445         len1 = strlen(sequences[1].text);
446
447         if (PART_FULL_MEMORY) {
448                 for (i = 0; i <= sequences[1].length; i++)
449                         for (j = 0; j <= sequences[0].length; j++) {
450                                 Zm[i][j] = 0.00;
451                                 Zf[i][j] = 0.00;
452                                 Ze[i][j] = 0.00;
453                         }
454         } else {
455                 for (i = 0; i <= len1; i++) {
456                         for (j = 0; j <= len0; j++) {
457                                 Zm[i][j] = 0;
458                         }
459                 }
460                 for (j = 0; j <= len0; j++) {
461                         Zf[0][j] = 0;
462                         Ze[0][j] = 0;
463                         Zf[1][j] = 0;
464                         Ze[1][j] = 0;
465                 }
466         }
467
468         //INTITIALIZE THE DP 
469
470         if (endgaps == 0) {
471                 Zm[0][0] = 1.00;
472
473                 Zf[0][0] = Ze[0][0] = 0;
474                 Zf[1][0] = Zm[0][0] * d;
475                 Ze[0][1] = Zm[0][0] * d;
476
477                 //>=2ND ROW INIT
478                 if (PART_FULL_MEMORY) {
479                         for (i = 2; i <= sequences[1].length; i++) {
480                                 Zf[i][0] = Zf[i - 1][0] * e;
481                         }
482                 }
483
484                 //>=2ND COL INIT
485                 for (j = 2; j <= sequences[0].length; j++) {
486                         Ze[0][j] = Ze[0][j - 1] * e;
487                 }
488         } else {
489                 //init z
490                 Zm[0][0] = 1.00;
491                 Zf[0][0] = Ze[0][0] = 0;
492                 Zf[1][0] = Zm[0][0] * endgapopen;
493                 Ze[0][1] = Zm[0][0] * endgapopen;
494
495                 //>=2ND ROW INIT
496                 if (PART_FULL_MEMORY) {
497                         for (i = 2; i <= sequences[1].length; i++) {
498                                 Zf[i][0] = Zf[i - 1][0] * endgapextend;
499                         }
500                 }
501
502                 //>=2ND COL INIT
503                 for (j = 2; j <= sequences[0].length; j++) {
504                         Ze[0][j] = Ze[0][j - 1] * endgapextend;
505                 }
506         }
507
508         //1ST ROW/COL INIT
509
510         int Si, Tj;
511         double score;
512
513         for (i = 1; i <= sequences[1].length; i++) {
514
515                 for (j = 1; j <= sequences[0].length; j++) {
516
517                         Si = subst_index[sequences[1].text[i - 1] - 'A'];
518                         Tj = subst_index[sequences[0].text[j - 1] - 'A'];
519
520                         score = sub_matrix[Si][Tj];
521
522                         double open0, extend0, open1, extend1;
523
524                         open0 = open1 = d;
525                         extend0 = extend1 = e;
526
527                         if (endgaps == 1) {
528                                 //check to see if one of the 2 sequences or both reach the end
529
530                                 if (i == sequences[1].length) {
531                                         open0 = endgapopen;
532                                         extend0 = endgapextend;
533
534                                 }
535
536                                 if (j == sequences[0].length) {
537                                         open1 = endgapopen;
538                                         extend1 = endgapextend;
539                                 }
540                         }
541
542                         //
543                         //z computation using open and extend temp vars
544                         //open0 is gap open in seq0 and open1 is gap open in seq1
545                         //entend0 is gap extend in seq0 and extend1 is gap extend in seq1
546
547                         if (PART_FULL_MEMORY) {
548                                 Ze[i][j] = Zm[i][j - 1] * open0 + Ze[i][j - 1] * extend0;
549
550                                 if (Ze[i][j] >= OS_HUGE_VALL) {
551                                         printf("ERROR: huge val error for Ze\n");
552                                         exit(1);
553                                 }
554
555                                 Zf[i][j] = Zm[i - 1][j] * open1 + Zf[i - 1][j] * extend1;
556
557                                 if (Zf[i][j] >= OS_HUGE_VALL) {
558                                         printf("ERROR: huge val error for Zf\n");
559                                         exit(1);
560                                 }
561
562                                 Zm[i][j] = (Zm[i - 1][j - 1] + Ze[i - 1][j - 1]
563                                                 + Zf[i - 1][j - 1]) * score;
564
565                                 if (Zm[i][j] >= OS_HUGE_VALL) {
566                                         printf("ERROR: huge val error for Zm\n");
567                                         exit(1);
568                                 }
569
570                                 zz = Zm[i][j] + Ze[i][j] + Zf[i][j];
571                         } else {
572                                 Ze[1][j] = Zm[i][j - 1] * open0 + Ze[1][j - 1] * extend0;
573
574                                 if (Ze[1][j] >= OS_HUGE_VALL) {
575                                         printf("ERROR: huge val error for zE\n");
576                                         exit(1);
577                                 }
578
579                                 Zf[1][j] = Zm[i - 1][j] * open1 + Zf[0][j] * extend1;
580
581                                 if (Zf[1][j] >= OS_HUGE_VALL) {
582                                         printf("ERROR: huge val error for zF\n");
583                                         exit(1);
584                                 }
585
586                                 Zm[i][j] = (Zm[i - 1][j - 1] + Ze[0][j - 1] + Zf[0][j - 1])
587                                                 * score;
588
589                                 if (Zm[i][j] >= OS_HUGE_VALL) {
590                                         printf("ERROR: huge val error for zM\n");
591                                         exit(1);
592                                 }
593
594                                 zz = Zm[i][j] + Ze[1][j] + Zf[1][j];
595                         }
596
597                 }                       //end for
598
599                 if (!PART_FULL_MEMORY) {
600                         for (int t = 0; t <= sequences[0].length; t++) {
601                                 Ze[0][t] = Ze[1][t];
602                                 Ze[1][t] = 0;
603
604                                 Zf[0][t] = Zf[1][t];
605                                 Zf[1][t] = 0;
606                         }
607
608                         Zf[1][0] = 1;
609
610                 }
611
612         }                               //end for
613
614         //store the sum of zm zf ze (m,n)s in zm's 0,0 th position
615         Zm[0][0] = zz;
616
617         if (TRACE) {
618                 //debug code aug 3 
619                 //print the 3 Z matrices namely Zm Zf and Ze
620
621                 printf("\n\nFINAL Zm:\n");
622                 for (i = 0; i <= sequences[1].length; i++) {
623                         for (j = 0; j <= sequences[0].length; j++)
624                                 printf("%.2Le ", Zm[i][j]);
625                         printf("\n");
626                 }
627
628                 printf("FINAL Zf \n");
629                 for (i = 0; i <= sequences[1].length; i++) {
630                         for (j = 0; j <= sequences[0].length; j++)
631                                 printf("%.2Le ", Zf[i][j]);
632                         printf("\n");
633                 }
634
635                 printf("FINAL Ze \n");
636                 for (i = 0; i <= sequences[1].length; i++) {
637                         for (j = 0; j <= sequences[0].length; j++)
638                                 printf("%.2Le ", Ze[i][j]);
639                         printf("\n");
640                 }
641
642                 //end debug dump code
643
644         }
645
646         if (PART_FULL_MEMORY) {
647                 for (i = 0; i <= sequences[1].length; i++) {
648                         delete (Zf[i]);
649                         delete (Ze[i]);
650                 }
651         } else {
652                 delete (Zf[0]);
653                 delete (Ze[0]);
654                 delete (Zf[1]);
655                 delete (Ze[1]);
656         }
657
658         delete (Zf);
659         delete (Ze);
660
661         return Zm;
662
663 }                               //end of forward partition function
664
665 /////////////////////////////////////////////////////////////////////////////////////////
666 //entry point (was the main function) , returns the posterior probability safe vector
667 ////////////////////////////////////////////////////////////////////////////////////////
668 VF *ComputePostProbs(int a, int b, string seq1, string seq2) {
669         //printf("probamod\n"); 
670         double gap_open = -22, gap_ext = -1, beta = 0.2;//T = 5, beta = 1/T = 0.2, by default
671         int stock_loop = 1;
672         int le = 160;
673         double termgapopen = 1.0f;      //exp(0)
674         double termgapextend = 1.0f;    //exp(0)
675
676         //initialize the sequence structure
677         fasta sequences[2];
678
679         sequences[0].length = strlen((char *) seq1.c_str());
680         sequences[0].text = (char *) seq1.c_str();
681         sequences[0].title = new char[10];
682         strcpy(sequences[0].title, "seq0");
683         sequences[1].length = strlen((char *) seq2.c_str());
684         sequences[1].text = (char *) seq2.c_str();
685         sequences[1].title = new char[10];
686         strcpy(sequences[1].title, "seq1");
687
688         if (TRACE)
689
690         {
691                 printf("%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
692                                 sequences[0].text, b, sequences[1].length, sequences[1].text);
693                 printf("after init\n");
694
695                 FILE *dump1 = fopen("dump1", "a");
696                 fprintf(dump1, "%d %d %s\n%d %d %s\n--\n", a, sequences[0].length,
697                                 sequences[0].text, b, sequences[1].length, sequences[1].text);
698                 fclose(dump1);
699         }
700
701         gap_open = argument.gapopen;
702         gap_ext = argument.gapext;
703         beta = argument.beta;
704
705         stock_loop = argument.N;
706         le = argument.matrix;
707
708         //compute the values of exp(beta * ?)
709         termgapopen = exp(beta * 0.0);
710         termgapextend = exp(beta * 0.0);
711         gap_open = exp(beta * gap_open);
712         gap_ext = exp(beta * gap_ext);
713
714         if (TRACE)
715                 printf("%f %f %f %d\n", gap_open, gap_ext, beta, le);
716
717         //call for calculating the posterior probabilities
718         // 1. call partition function partf
719         // 2. calculate revpartition using revers_parf
720         // 3. calculate probabilities
721         /// MODIFICATION... POPULATE SAFE VECTOR
722
723         long double **MAT1;
724
725         MAT1 = partf(sequences, termgapopen, termgapextend, gap_open, gap_ext);
726
727         return revers_partf(sequences, termgapopen, termgapextend, MAT1, gap_open,
728                         gap_ext);
729
730 }
731
732 //////////////////////////////////////////////////////////////
733 //Compute Viterbi Alignment 
734 // Added by YE Yongtao
735 /////////////////////////////////////////////////////////////
736
737 pair<SafeVector<char> *, float> partViterbi(string seq1, string seq2) {
738
739
740         double gap_open = -12, gap_ext = -1, beta = 0.2;//T = 5, beta = 1/T = 0.2, by default
741         int stock_loop = 1;
742         int le = 160;
743         //double termgapopen = 1.0f;    //exp(0)
744         //double termgapextend = 1.0f;  //exp(0)
745
746         //initialize the sequence structure 
747         fasta sequences[2];
748         sequences[0].length = strlen((char *) seq1.c_str());
749         sequences[0].text = (char *) seq1.c_str();
750         sequences[0].title = new char[10];
751         strcpy(sequences[0].title, "seq0");
752         sequences[1].length = strlen((char *) seq2.c_str());
753         sequences[1].text = (char *) seq2.c_str();
754         sequences[1].title = new char[10];
755         strcpy(sequences[1].title, "seq1");
756
757         gap_open = argument.gapopen;
758         gap_ext = argument.gapext;
759         beta = argument.beta;
760
761         stock_loop = argument.N;
762         le = argument.matrix;
763
764         //compute the values of exp(beta * ?)
765         double endgapopen = exp(beta * 0.0);
766         double endgapextend = exp(beta * 0.0);
767         double d = exp(beta * gap_open);
768         double e = exp(beta * gap_ext);
769
770         int i, j, len1, len0;
771         long double **Zm = NULL, **Zf = NULL, **Ze = NULL;
772         int **traceZm = NULL, **traceZf = NULL, **traceZe = NULL;
773
774         //the flag endgaps is set at the #define section
775         Zf = new long double *[sequences[1].length + 1];
776         Ze = new long double *[sequences[1].length + 1];
777         Zm = new long double *[sequences[1].length + 1];
778
779         traceZf = new int *[sequences[1].length + 1];
780         traceZe = new int *[sequences[1].length + 1];
781         traceZm = new int *[sequences[1].length + 1];
782
783         //DYNAMICALLY GROW 2D M,IX,IY,PIX,PIY MARICES
784         for (i = 0; i <= sequences[1].length; i++) {
785                 Zf[i] = new long double[sequences[0].length + 1];
786                 Ze[i] = new long double[sequences[0].length + 1];
787                 Zm[i] = new long double[sequences[0].length + 1];
788
789                 traceZf[i] = new int[sequences[0].length + 1];
790                 traceZe[i] = new int[sequences[0].length + 1];
791                 traceZm[i] = new int[sequences[0].length + 1];
792         }
793         
794         len0 = strlen(sequences[0].text);
795         len1 = strlen(sequences[1].text);
796
797         
798         for (i = 0; i <= sequences[1].length; i++)
799                 for (j = 0; j <= sequences[0].length; j++) {
800                         Zm[i][j] = 0.00;
801                         Zf[i][j] = 0.00;
802                         Ze[i][j] = 0.00;
803
804                         traceZm[i][j] = -1;
805                         traceZf[i][j] = -1;
806                         traceZe[i][j] = -1;
807                 }
808         
809
810         //INTITIALIZE THE DP 
811         if (endgaps == 0) {
812                 Zm[0][0] = 1.00;
813
814                 Zf[0][0] = Ze[0][0] = 0;
815                 Zf[1][0] = Zm[0][0] * d;
816                 Ze[0][1] = Zm[0][0] * d;
817
818                 //>=2ND ROW INIT
819                 
820                 for (i = 2; i <= sequences[1].length; i++) {
821                         Zf[i][0] = Zf[i - 1][0] * e;
822                         traceZf[i][0] = 2;
823                 }
824                 
825
826                 //>=2ND COL INIT
827                 for (j = 2; j <= sequences[0].length; j++) {
828                         Ze[0][j] = Ze[0][j - 1] * e;
829                         traceZe[0][j] = 1;
830                 }
831         } else {
832                 //init z
833                 Zm[0][0] = 1.00;
834                 Zf[0][0] = Ze[0][0] = 0;
835                 Zf[1][0] = Zm[0][0] * endgapopen;
836                 Ze[0][1] = Zm[0][0] * endgapopen;
837
838                 //>=2ND ROW INIT
839                 
840                 for (i = 2; i <= sequences[1].length; i++) {
841                         Zf[i][0] = Zf[i - 1][0] * endgapextend;
842                         traceZf[i][0] = 2;
843                 }
844                 //>=2ND COL INIT
845                 for (j = 2; j <= sequences[0].length; j++) {
846                         Ze[0][j] = Ze[0][j - 1] * endgapextend;
847                         traceZe[0][j] = 1;
848                 }
849         }
850
851         //1ST ROW/COL INIT
852
853         int Si, Tj;
854         double score;
855
856         for (i = 1; i <= sequences[1].length; i++) {
857
858                 for (j = 1; j <= sequences[0].length; j++) {
859
860                         Si = subst_index[sequences[1].text[i - 1] - 'A'];
861                         Tj = subst_index[sequences[0].text[j - 1] - 'A'];
862
863                         score = sub_matrix[Si][Tj];
864
865                         double open0, extend0, open1, extend1;
866
867                         open0 = open1 = d;
868                         extend0 = extend1 = e;
869
870                         if (endgaps == 1) {
871                                 //check to see if one of the 2 sequences or both reach the end
872
873                                 if (i == sequences[1].length) {
874                                         open0 = endgapopen;
875                                         extend0 = endgapextend;
876
877                                 }
878
879                                 if (j == sequences[0].length) {
880                                         open1 = endgapopen;
881                                         extend1 = endgapextend;
882                                 }
883                         }
884
885                         //
886                         //z computation using open and extend temp vars
887                         //open0 is gap open in seq0 and open1 is gap open in seq1
888                         //entend0 is gap extend in seq0 and extend1 is gap extend in seq1
889                         Zf[i][j] = Zf[i - 1][j] * extend1; 
890                         traceZf[i][j] = 2;
891
892                         if(Zm[i - 1][j] * open1 > Zf[i][j]){
893                                 Zf[i][j] = Zm[i - 1][j] * open1;
894                                 traceZf[i][j] = 0;
895                         }
896                         if (Zf[i][j] >= OS_HUGE_VALL) {
897                                 printf("ERROR: huge val error for Zf\n");
898                                 exit(1);
899                         }
900                         Ze[i][j] = Ze[i][j - 1] * extend0;
901                         traceZe[i][j] = 1;
902                         if(Zm[i][j - 1] * open0 > Ze[i][j]){
903                                 Ze[i][j] = Zm[i][j - 1] * open0;
904                                 traceZe[i][j] = 0;
905                         }
906
907                         if (Ze[i][j] >= OS_HUGE_VALL) {
908                                 printf("ERROR: huge val error for Ze\n");
909                                 exit(1);
910                         }
911
912                         Zm[i][j] = Zm[i - 1][j - 1] * score;
913                         traceZm[i][j] = 0;
914                         if(Zf[i - 1][j - 1] * score > Zm[i][j]){
915                                 Zm[i][j] = Zf[i - 1][j - 1] * score;
916                                 traceZm[i][j] = 2;
917                         }
918                         if(Ze[i - 1][j - 1] * score > Zm[i][j]){ 
919                                 Zm[i][j] = Ze[i - 1][j - 1] * score;
920                                 traceZm[i][j] = 1;
921                         }
922                         if (Zm[i][j] >= OS_HUGE_VALL) {
923                                 printf("ERROR: huge val error for Zm\n");
924                                 exit(1);
925                         }               
926
927                 }//end for
928         }//end for
929         // figure out best terminating cell
930
931         float bestProb = Zm[sequences[1].length][sequences[0].length];
932         int state = 0;
933         if( bestProb < Zf[sequences[1].length][sequences[0].length]){     
934                 bestProb = Zf[sequences[1].length][sequences[0].length];
935                 state = 2;
936         }
937         if( bestProb < Ze[sequences[1].length][sequences[0].length]){     
938                 bestProb = Ze[sequences[1].length][sequences[0].length];
939                 state = 1;
940         }
941         assert (state != -1);
942  
943         // compute traceback
944         SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
945         int c = sequences[1].length, r = sequences[0].length;
946         while (r != 0 || c != 0){
947                 int newState;
948                 if(state == 0){
949                         newState = traceZm[c][r];
950                         c--; r--; alignment->push_back ('B');
951                 }
952                 else if(state == 1){
953                         newState = traceZe[c][r];
954                         r--; alignment->push_back ('X');
955                 }
956                 else{
957                         newState = traceZf[c][r];
958                         c--; alignment->push_back ('Y');
959                 }
960                 state = newState;
961         }
962
963         reverse (alignment->begin(), alignment->end());
964
965         for (i = 0; i <= sequences[1].length; i++) {
966                 delete (Zf[i]);
967                 delete (Ze[i]);
968                 delete (Zm[i]);
969                 delete (traceZf[i]);
970                 delete (traceZe[i]);
971                 delete (traceZm[i]);
972         }
973
974         delete (Zf);
975         delete (Ze);
976         delete (Zm);
977         delete (traceZf);
978         delete (traceZe);
979         delete (traceZm);
980
981         return make_pair(alignment, bestProb);
982 }
983
984 //////////////////////////////////////////////////////////////
985 // Compute two sequences' similarity defined as the normalized alignment score without gap penalties
986 // Added by YE Yongtao
987 /////////////////////////////////////////////////////////////
988
989 float computeSimilarity(string seq1, string seq2, SafeVector<char> * alignment) {
990
991         //initialize the sequence structure 
992         fasta sequences[2];
993         sequences[0].length = strlen((char *) seq1.c_str());
994         sequences[0].text = (char *) seq1.c_str();
995         sequences[0].title = new char[10];
996         strcpy(sequences[0].title, "seq0");
997         sequences[1].length = strlen((char *) seq2.c_str());
998         sequences[1].text = (char *) seq2.c_str();
999         sequences[1].title = new char[10];
1000         strcpy(sequences[1].title, "seq1");
1001
1002         float bestProb = 0;
1003         int Si, Tj;
1004         double score;
1005         int i = 1;int j = 1;
1006         for (SafeVector<char>::iterator iter = alignment->begin(); 
1007                 iter != alignment->end(); ++iter){
1008                 if (*iter == 'B'){
1009                         Si = subst_index[sequences[1].text[j - 1] - 'A'];
1010                         Tj = subst_index[sequences[0].text[i - 1] - 'A'];
1011                         score = normalized_matrix[Si][Tj];
1012                         bestProb += score;
1013                         i++; j++;               
1014                 }
1015                 else if(*iter == 'X') i++;
1016                 else if(*iter == 'Y') j++;
1017         }
1018         if(i!= sequences[0].length + 1 || j!= sequences[1].length + 1 ) cerr << "similarity error"<< endl;
1019         bestProb /= alignment->size();      
1020         //bestProb /= min(sequences[0].length, sequences[1].length);
1021         return bestProb;
1022 }                               
1023 //end of posterior probability  module