new mafft v 6.857 with extensions
[jabaws.git] / binaries / src / mafft / core / rna.c
1 #include "mltaln.h"
2 #include "dp.h"
3
4 #define MEMSAVE 1
5
6 #define DEBUG 1
7 #define USE_PENALTY_EX  1
8 #define STOREWM 1
9
10
11
12 static float singleribosumscore( int n1, int n2, char **s1, char **s2, double *eff1, double *eff2, int p1, int p2 )
13 {
14         float val;
15         int i, j;
16         int code1, code2;
17
18         val = 0.0;
19         for( i=0; i<n1; i++ ) for( j=0; j<n2; j++ )
20         {
21                 code1 = amino_n[(int)s1[i][p1]];
22                 if( code1 > 3 ) code1 = 36;
23                 code2 = amino_n[(int)s2[j][p2]];
24                 if( code2 > 3 ) code2 = 36;
25
26 //              fprintf( stderr, "'l'%c-%c: %f\n", s1[i][p1], s2[j][p2], (float)ribosumdis[code1][code2] );
27
28                 val += (float)ribosumdis[code1][code2] * eff1[i] * eff2[j];
29         }
30         return( val );
31 }
32 static float pairedribosumscore53( int n1, int n2, char **s1, char **s2, double *eff1, double *eff2, int p1, int p2, int c1, int c2 )
33 {
34         float val;
35         int i, j;
36         int code1o, code1u, code2o, code2u, code1, code2;
37
38         val = 0.0;
39         for( i=0; i<n1; i++ ) for( j=0; j<n2; j++ )
40         {
41                 code1o = amino_n[(int)s1[i][p1]];
42                 code1u = amino_n[(int)s1[i][c1]];
43                 if( code1o > 3 ) code1 = code1o = 36;
44                 else if( code1u > 3 ) code1 = 36;
45                 else code1 = 4 + code1o * 4 + code1u;
46
47                 code2o = amino_n[(int)s2[j][p2]];
48                 code2u = amino_n[(int)s2[j][c2]];
49                 if( code2o > 3 ) code2 = code1o = 36;
50                 else if( code2u > 3 ) code2 = 36;
51                 else code2 = 4 + code2o * 4 + code2u;
52
53
54 //              fprintf( stderr, "%c%c-%c%c: %f\n", s1[i][p1], s1[i][c1], s2[j][p2], s2[j][c2], (float)ribosumdis[code1][code2] );
55
56                 if( code1 == 36 || code2 == 36 )
57                         val += (float)n_dis[code1o][code2o] * eff1[i] * eff2[j];
58                 else
59                         val += (float)ribosumdis[code1][code2] * eff1[i] * eff2[j];
60         }
61         return( val );
62 }
63
64 static float pairedribosumscore35( int n1, int n2, char **s1, char **s2, double *eff1, double *eff2, int p1, int p2, int c1, int c2 )
65 {
66         float val;
67         int i, j;
68         int code1o, code1u, code2o, code2u, code1, code2;
69
70         val = 0.0;
71         for( i=0; i<n1; i++ ) for( j=0; j<n2; j++ )
72         {
73                 code1o = amino_n[(int)s1[i][p1]];
74                 code1u = amino_n[(int)s1[i][c1]];
75                 if( code1o > 3 ) code1 = code1o = 36;
76                 else if( code1u > 3 ) code1 = 36;
77                 else code1 = 4 + code1u * 4 + code1o;
78
79                 code2o = amino_n[(int)s2[j][p2]];
80                 code2u = amino_n[(int)s2[j][c2]];
81                 if( code2o > 3 ) code2 = code1o = 36;
82                 else if( code2u > 3 ) code2 = 36;
83                 else code2 = 4 + code2u * 4 + code2o;
84
85
86 //              fprintf( stderr, "%c%c-%c%c: %f\n", s1[i][p1], s1[i][c1], s2[j][p2], s2[j][c2], (float)ribosumdis[code1][code2] );
87
88                 if( code1 == 36 || code2 == 36 )
89                         val += (float)n_dis[code1o][code2o] * eff1[i] * eff2[j];
90                 else
91                         val += (float)ribosumdis[code1][code2] * eff1[i] * eff2[j];
92         }
93         return( val );
94 }
95
96
97 static void mccaskillextract( char **seq, char **nogap, int nseq, RNApair **pairprob, RNApair ***single, int **sgapmap, double *eff )
98 {
99         int lgth;
100         int nogaplgth;
101         int i, j;
102         int left, right, adpos;
103         float prob;
104         static TLS int *pairnum;
105         RNApair *pt, *pt2;
106
107         lgth = strlen( seq[0] );
108         pairnum = calloc( lgth, sizeof( int ) );
109         for( i=0; i<lgth; i++ ) pairnum[i] = 0;
110
111         for( i=0; i<nseq; i++ )
112         {
113                 nogaplgth = strlen( nogap[i] );
114                 for( j=0; j<nogaplgth; j++ ) for( pt=single[i][j]; pt->bestpos!=-1; pt++ )
115                 {
116                         left = sgapmap[i][j];
117                         right = sgapmap[i][pt->bestpos];
118                         prob = pt->bestscore;
119
120
121                         for( pt2=pairprob[left]; pt2->bestpos!=-1; pt2++ )
122                                 if( pt2->bestpos == right ) break;
123
124 //                      fprintf( stderr, "i,j=%d,%d, left=%d, right=%d, pt=%d, pt2->bestpos = %d\n", i, j, left, right, pt-single[i][j], pt2->bestpos );
125                         if( pt2->bestpos == -1 )
126                         {
127                                 pairprob[left] = (RNApair *)realloc( pairprob[left], (pairnum[left]+2) * sizeof( RNApair ) );
128                                 adpos = pairnum[left];
129                                 pairnum[left]++;
130                                 pairprob[left][adpos].bestscore = 0.0;
131                                 pairprob[left][adpos].bestpos = right;
132                                 pairprob[left][adpos+1].bestscore = -1.0;
133                                 pairprob[left][adpos+1].bestpos = -1;
134                                 pt2 = pairprob[left]+adpos;
135                         }
136                         else
137                                 adpos = pt2-pairprob[left];
138
139                         pt2->bestscore += prob * eff[i];
140
141                         if( pt2->bestpos != right )
142                         {
143                                 fprintf( stderr, "okashii!\n" );
144                                 exit( 1 );
145                         }
146 //                      fprintf( stderr, "adding %d-%d, %f\n", left, right, prob );
147 //                      fprintf( stderr, "pairprob[0][0].bestpos=%d\n", pairprob[0][0].bestpos );
148 //                      fprintf( stderr, "pairprob[0][0].bestscore=%f\n", pairprob[0][0].bestscore );
149                 }
150         }
151
152 //      fprintf( stderr, "before taikakuka\n" );
153         for( i=0; i<lgth; i++ ) for( j=0; j<pairnum[i]; j++ )
154         {
155                 if( pairprob[i][j].bestpos > -1 )
156                 {
157 //                      pairprob[i][j].bestscore /= (float)nseq;
158 //                      fprintf( stderr, "pair of %d = %d (%f) %c:%c\n", i, pairprob[i][j].bestpos, pairprob[i][j].bestscore, seq[0][i], seq[0][pairprob[i][j].bestpos] );
159                 }
160         }
161
162 #if 0
163         for( i=0; i<lgth; i++ ) for( j=0; j<pairnum[i]; j++ )
164         {
165                 right=pairprob[i][j].bestpos;
166                 if( right < i ) continue;
167                 fprintf( stderr, "no%d-%d, adding %d -> %d\n", i, j, right, i );
168                 pairprob[right] = (RNApair *)realloc( pairprob[right], (pairnum[right]+2) * sizeof( RNApair ) );
169                 pairprob[right][pairnum[right]].bestscore = pairprob[i][j].bestscore;
170                 pairprob[right][pairnum[right]].bestpos = i;
171                 pairnum[right]++;
172                 pairprob[right][pairnum[right]].bestscore = -1.0;
173                 pairprob[right][pairnum[right]].bestpos = -1;
174         }
175 #endif
176
177         free( pairnum );
178
179 }
180
181
182 void rnaalifoldcall( char **seq, int nseq, RNApair **pairprob )
183 {
184         int lgth;
185         int i;
186         static TLS int *order = NULL;
187         static TLS char **name = NULL;
188         char gett[1000];
189         FILE *fp;
190         int left, right, dumm;
191         float prob;
192         static TLS int pid;
193         static TLS char fnamein[100];
194         static TLS char cmd[1000];
195         static TLS int *pairnum;
196
197         lgth = strlen( seq[0] );
198         if( order == NULL )
199         {
200                 pid = (int)getpid();
201                 sprintf( fnamein, "/tmp/_rnaalifoldin.%d", pid );
202                 order = AllocateIntVec( njob );
203                 name = AllocateCharMtx( njob, 10 );
204                 for( i=0; i<njob; i++ )
205                 {
206                         order[i] = i;
207                         sprintf( name[i], "seq%d", i );
208                 }
209         }
210         pairnum = calloc( lgth, sizeof( int ) );
211         for( i=0; i<lgth; i++ ) pairnum[i] = 0;
212
213         fp = fopen( fnamein, "w" );
214         if( !fp )
215         {
216                 fprintf( stderr, "Cannot open /tmp/_rnaalifoldin\n" );
217                 exit( 1 );
218         }
219         clustalout_pointer( fp, nseq, lgth, seq, name, NULL, NULL, order, 15 );
220         fclose( fp );
221
222         sprintf( cmd, "RNAalifold -p %s", fnamein );
223         system( cmd );
224
225         fp = fopen( "alifold.out", "r" );
226         if( !fp )
227         {
228                 fprintf( stderr, "Cannot open /tmp/_rnaalifoldin\n" );
229                 exit( 1 );
230         }
231
232 #if 0
233         for( i=0; i<lgth; i++ ) // atode kesu
234         {
235                 pairprob[i] = (RNApair *)realloc( pairprob[i], (2) * sizeof( RNApair ) ); // atode kesu
236                 pairprob[i][1].bestscore = -1.0;
237                 pairprob[i][1].bestpos = -1;
238         }
239 #endif
240
241         while( 1 )
242         {
243                 fgets( gett, 999, fp );
244                 if( gett[0] == '(' ) break;
245                 if( gett[0] == '{' ) break;
246                 if( gett[0] == '.' ) break;
247                 if( gett[0] == ',' ) break;
248                 if( gett[0] != ' ' ) continue;
249
250                 sscanf( gett, "%d %d %d %f", &left, &right, &dumm, &prob );
251                 left--;
252                 right--;
253
254
255 #if 0
256                 if( prob > 50.0 && prob > pairprob[left][0].bestscore )
257                 {
258                         pairprob[left][0].bestscore = prob;
259                         pairprob[left][0].bestpos = right;
260 #else
261                 if( prob > 0.0 )
262                 {
263                         pairprob[left] = (RNApair *)realloc( pairprob[left], (pairnum[left]+2) * sizeof( RNApair ) );
264                         pairprob[left][pairnum[left]].bestscore = prob / 100.0;
265                         pairprob[left][pairnum[left]].bestpos = right;
266                         pairnum[left]++;
267                         pairprob[left][pairnum[left]].bestscore = -1.0;
268                         pairprob[left][pairnum[left]].bestpos = -1;
269                         fprintf( stderr, "%d-%d, %f\n", left, right, prob );
270
271                         pairprob[right] = (RNApair *)realloc( pairprob[right], (pairnum[right]+2) * sizeof( RNApair ) );
272                         pairprob[right][pairnum[right]].bestscore = prob / 100.0;
273                         pairprob[right][pairnum[right]].bestpos = left;
274                         pairnum[right]++;
275                         pairprob[right][pairnum[right]].bestscore = -1.0;
276                         pairprob[right][pairnum[right]].bestpos = -1;
277                         fprintf( stderr, "%d-%d, %f\n", left, right, prob );
278 #endif
279                 }
280         }
281         fclose( fp );
282         sprintf( cmd, "rm -f %s", fnamein );
283         system( cmd ); 
284
285         for( i=0; i<lgth; i++ )
286         {
287                 if( (right=pairprob[i][0].bestpos) > -1 )
288                 {
289                         pairprob[right][0].bestpos = i;
290                         pairprob[right][0].bestscore = pairprob[i][0].bestscore;
291                 }
292         }
293
294 #if 0
295         for( i=0; i<lgth; i++ ) // atode kesu
296                 if( pairprob[i][0].bestscore > -1 ) pairprob[i][0].bestscore = 1.0; // atode kesu
297 #endif
298
299 //      fprintf( stderr, "after taikakuka in rnaalifoldcall\n" );
300 //      for( i=0; i<lgth; i++ )
301 //      {
302 //              fprintf( stderr, "pair of %d = %d (%f) %c:%c\n", i, pairprob[i][0].bestpos, pairprob[i][0].bestscore, seq[0][i], seq[0][pairprob[i][0].bestpos] );
303 //      }
304
305         free( pairnum );
306 }
307
308
309 static void utot( int n, int l, char **s )
310 {
311         int i, j;
312         for( i=0; i<l; i++ )
313         {
314                 for( j=0; j<n; j++ )
315                 {
316                         if     ( s[j][i] == 'a' ) s[j][i] = 'a';
317                         else if( s[j][i] == 't' ) s[j][i] = 't';
318                         else if( s[j][i] == 'u' ) s[j][i] = 't';
319                         else if( s[j][i] == 'g' ) s[j][i] = 'g';
320                         else if( s[j][i] == 'c' ) s[j][i] = 'c';
321                         else if( s[j][i] == '-' ) s[j][i] = '-';
322                         else                                      s[j][i] = 'n';
323                 }
324         }
325 }
326
327
328 void foldrna( int nseq1, int nseq2, char **seq1, char **seq2, double *eff1, double *eff2, RNApair ***grouprna1, RNApair ***grouprna2, float **impmtx, int *gapmap1, int *gapmap2, RNApair *additionalpair )
329 {
330         int i, j;
331 //      int ui, uj;
332 //      int uiup, ujup;
333         int uido, ujdo;
334         static TLS char **useq1, **useq2;
335         static TLS char **oseq1, **oseq2, **oseq1r, **oseq2r, *odir1, *odir2;
336         static TLS RNApair **pairprob1, **pairprob2;
337         static TLS RNApair *pairpt1, *pairpt2;
338         int lgth1 = strlen( seq1[0] );
339         int lgth2 = strlen( seq2[0] );
340         static TLS float **impmtx2;
341         static TLS float **map;
342 //      double lenfac;
343         float prob;
344         int **sgapmap1, **sgapmap2;
345         char *nogapdum;
346         float **tbppmtx;
347
348
349 //      fprintf( stderr, "nseq1=%d, lgth1=%d\n", nseq1, lgth1 );
350         useq1 = AllocateCharMtx( nseq1, lgth1+10 );
351         useq2 = AllocateCharMtx( nseq2, lgth2+10 );
352         oseq1 = AllocateCharMtx( nseq1, lgth1+10 );
353         oseq2 = AllocateCharMtx( nseq2, lgth2+10 );
354         oseq1r = AllocateCharMtx( nseq1, lgth1+10 );
355         oseq2r = AllocateCharMtx( nseq2, lgth2+10 );
356         odir1 = AllocateCharVec( lgth1+10 );
357         odir2 = AllocateCharVec( lgth2+10 );
358         sgapmap1 = AllocateIntMtx( nseq1, lgth1+1 );
359         sgapmap2 = AllocateIntMtx( nseq2, lgth2+1 );
360         nogapdum = AllocateCharVec( MAX( lgth1, lgth2 ) );
361         pairprob1 = (RNApair **)calloc( lgth1, sizeof( RNApair *) );
362         pairprob2 = (RNApair **)calloc( lgth2, sizeof( RNApair *) );
363         map = AllocateFloatMtx( lgth1, lgth2 );
364         impmtx2 = AllocateFloatMtx( lgth1, lgth2 );
365         tbppmtx = AllocateFloatMtx( lgth1, lgth2 );
366
367         for( i=0; i<nseq1; i++ ) strcpy( useq1[i], seq1[i] );
368         for( i=0; i<nseq2; i++ ) strcpy( useq2[i], seq2[i] );
369         for( i=0; i<nseq1; i++ ) strcpy( oseq1[i], seq1[i] );
370         for( i=0; i<nseq2; i++ ) strcpy( oseq2[i], seq2[i] );
371
372         for( i=0; i<nseq1; i++ ) commongappick_record( 1, useq1+i, sgapmap1[i] );
373         for( i=0; i<nseq2; i++ ) commongappick_record( 1, useq2+i, sgapmap2[i] );
374
375         for( i=0; i<lgth1; i++ )
376         {
377                 pairprob1[i] = (RNApair *)calloc( 1, sizeof( RNApair ) );
378                 pairprob1[i][0].bestpos = -1;
379                 pairprob1[i][0].bestscore = -1;
380         }
381         for( i=0; i<lgth2; i++ )
382         {
383                 pairprob2[i] = (RNApair *)calloc( 1, sizeof( RNApair ) );
384                 pairprob2[i][0].bestpos = -1;
385                 pairprob2[i][0].bestscore = -1;
386         }
387
388         utot( nseq1, lgth1, oseq1 );
389         utot( nseq2, lgth2, oseq2 );
390
391 //      fprintf( stderr, "folding group1\n" );
392 //      rnalocal( oseq1, useq1, eff1, eff1, nseq1, nseq1, lgth1+10, pair1 );
393
394 /* base-pairing probability of group 1 */
395         if( rnaprediction == 'r' )
396                 rnaalifoldcall( oseq1, nseq1, pairprob1 );
397         else
398                 mccaskillextract( oseq1, useq1, nseq1, pairprob1, grouprna1, sgapmap1, eff1 );
399
400
401 //      fprintf( stderr, "folding group2\n" );
402 //      rnalocal( oseq2, useq2, eff2, eff2, nseq2, nseq2, lgth2+10, pair2 );
403
404 /* base-pairing probability of group 2 */
405         if( rnaprediction == 'r' )
406                 rnaalifoldcall( oseq2, nseq2, pairprob2 );
407         else
408                 mccaskillextract( oseq2, useq2, nseq2, pairprob2, grouprna2, sgapmap2, eff2 );
409
410
411
412 #if 0
413         makerseq( oseq1, oseq1r, odir1, pairprob1, nseq1, lgth1 );
414         makerseq( oseq2, oseq2r, odir2, pairprob2, nseq2, lgth2 );
415
416         fprintf( stderr, "%s\n", odir2 );
417
418         for( i=0; i<nseq1; i++ )
419         {
420                 fprintf( stdout, ">ori\n%s\n", oseq1[0] );
421                 fprintf( stdout, ">rev\n%s\n", oseq1r[0] );
422         }
423 #endif
424
425 /* similarity score */
426         Lalignmm_hmout( oseq1, oseq2, eff1, eff2, nseq1, nseq2, 10000, NULL, NULL, NULL, NULL, map );
427
428         if( 1 )
429         {
430                 if( RNAscoremtx == 'n' )
431                 {
432                         for( i=0; i<lgth1; i++ ) for( j=0; j<lgth2; j++ )
433                         {
434 //                              impmtx2[i][j] = osoiaveragescore( nseq1, nseq2, oseq1, oseq2, eff1, eff2, i, j ) * consweight_multi;
435                                 impmtx2[i][j] = 0.0;
436                         }
437                 }
438                 else if( RNAscoremtx == 'r' )
439                 {
440                         for( i=0; i<lgth1; i++ ) for( j=0; j<lgth2; j++ ) 
441                         {
442                                 tbppmtx[i][j] = 1.0;
443                                 impmtx2[i][j] = 0.0;
444                         }
445                         for( i=0; i<lgth1; i++ ) for( pairpt1=pairprob1[i]; pairpt1->bestpos!=-1; pairpt1++ )
446                         {
447                                 for( j=0; j<lgth2; j++ ) for( pairpt2=pairprob2[j]; pairpt2->bestpos!=-1; pairpt2++ )
448                                 {
449                                         uido = pairpt1->bestpos;
450                                         ujdo = pairpt2->bestpos;
451                                         prob = pairpt1->bestscore * pairpt2->bestscore;
452                                         if( uido > -1  && ujdo > -1 )
453                                         {
454                                                 if( uido > i && j > ujdo )
455                                                 {
456                                                         impmtx2[i][j] += prob * pairedribosumscore53( nseq1, nseq2, oseq1, oseq2, eff1, eff2, i, j, uido, ujdo ) * consweight_multi;
457                                                         tbppmtx[i][j] -= prob;
458                                                 }
459                                                 else if( i < uido && j < ujdo )
460                                                 {
461                                                         impmtx2[i][j] += prob * pairedribosumscore35( nseq1, nseq2, oseq1, oseq2, eff1, eff2, i, j, uido, ujdo ) * consweight_multi;
462                                                         tbppmtx[i][j] -= prob;
463                                                 }
464                                         }
465                                 }
466                         }
467         
468         
469                         for( i=0; i<lgth1; i++ )
470                         {
471                                 for( j=0; j<lgth2; j++ )
472                                 {
473                                         impmtx2[i][j] += tbppmtx[i][j] * singleribosumscore( nseq1, nseq2, oseq1, oseq2, eff1, eff2, i, j ) * consweight_multi;
474                                 }
475                         }
476                 }
477
478
479 /* four-way consistency */
480
481                 for( i=0; i<lgth1; i++ ) for( pairpt1=pairprob1[i]; pairpt1->bestpos!=-1; pairpt1++ )
482                 {
483
484 //                      if( pairprob1[i] == NULL ) continue;
485
486                         for( j=0; j<lgth2; j++ ) for( pairpt2=pairprob2[j]; pairpt2->bestpos!=-1; pairpt2++ )
487                         {
488 //                              fprintf( stderr, "i=%d, j=%d, pn1=%d, pn2=%d\n", i, j, pairpt1-pairprob1[i], pairpt2-pairprob2[j] ); 
489 //                              if( pairprob2[j] == NULL ) continue;
490
491                                 uido = pairpt1->bestpos;
492                                 ujdo = pairpt2->bestpos;
493                                 prob = pairpt1->bestscore * pairpt2->bestscore;
494 //                              prob = 1.0;
495 //                              fprintf( stderr, "i=%d->uido=%d, j=%d->ujdo=%d\n", i, uido, j, ujdo );
496
497 //                              fprintf( stderr, "impmtx2[%d][%d] = %f\n", i, j, impmtx2[i][j] );
498
499 //                              if( i < uido && j > ujdo ) continue;
500 //                              if( i > uido && j < ujdo ) continue;
501
502
503 //                              posdistj = abs( ujdo-j );
504
505 //                              if( uido > -1 && ujdo > -1 )  
506                                 if( uido > -1 && ujdo > -1 && ( ( i > uido && j > ujdo ) || ( i < uido && j < ujdo ) ) )
507                                 {
508                                         {
509                                                 impmtx2[i][j] += MAX( 0, map[uido][ujdo] ) * consweight_rna * 600 * prob; // osoi
510                                         }
511                                 }
512
513                         }
514                 }
515                 for( i=0; i<lgth1; i++ ) for( j=0; j<lgth2; j++ )
516                 {
517                         impmtx[i][j] += impmtx2[i][j];
518 //                      fprintf( stderr, "fastathreshold=%f, consweight_multi=%f, consweight_rna=%f\n", fastathreshold, consweight_multi, consweight_rna );
519 //                      impmtx[i][j] *= 0.5;
520                 }
521
522 //              impmtx[0][0] += 10000.0;
523 //              impmtx[lgth1-1][lgth2-1] += 10000.0;
524
525
526
527 #if 0
528                 fprintf( stdout, "#impmtx2 = \n" );
529                 for( i=0; i<lgth1; i++ )
530                 {
531                         for( j=0; j<lgth2; j++ )
532                         {
533                                 fprintf( stdout, "%d %d %f\n", i, j, impmtx2[i][j] );
534                         }
535                         fprintf( stdout, "\n" );
536                 }
537                 exit( 1 );
538 #endif
539         }
540
541         FreeCharMtx( useq1 );
542         FreeCharMtx( useq2 );
543         FreeCharMtx( oseq1 );
544         FreeCharMtx( oseq2 );
545         FreeCharMtx( oseq1r );
546         FreeCharMtx( oseq2r );
547         free( odir1 );
548         free( odir2 );
549         FreeFloatMtx( impmtx2 );
550         FreeFloatMtx( map );
551         FreeIntMtx( sgapmap1 );
552         FreeIntMtx( sgapmap2 );
553         FreeFloatMtx( tbppmtx );
554
555         for( i=0; i<lgth1; i++ ) free( pairprob1[i] );
556         for( i=0; i<lgth2; i++ ) free( pairprob2[i] );
557         free( pairprob1 );
558         free( pairprob2 );
559 }
560