Next version of JABA
[jabaws.git] / binaries / src / mafft / core / disttbfast_dummy.c
1 #include "mltaln.h"
2
3 #define DEBUG 0
4 #define IODEBUG 0
5 #define SCOREOUT 0
6
7 #define END_OF_VEC -1
8
9
10 void arguments()
11 {
12     int c;
13
14         inputfile = NULL;
15         fftkeika = 0;
16         constraint = 0;
17         nblosum = 62;
18         fmodel = 0;
19         calledByXced = 0;
20         devide = 0;
21         use_fft = 1;
22         fftscore = 1;
23         fftRepeatStop = 0;
24         fftNoAnchStop = 0;
25     weight = 3;
26     utree = 1;
27         tbutree = 1;
28     refine = 0;
29     check = 1;
30     cut = 0.0;
31     disp = 0;
32     outgap = 1;
33     alg = 'M';
34     mix = 0;
35         tbitr = 0;
36         scmtd = 5;
37         tbweight = 0;
38         tbrweight = 3;
39         checkC = 0;
40         treemethod = 'x';
41         contin = 0;
42         kobetsubunkatsu = 0;
43         makedistmtx = 1;
44         ppenalty = -1530;
45         ppenalty_ex = NOTSPECIFIED;
46         poffset = -123;
47         kimuraR = NOTSPECIFIED;
48         pamN = NOTSPECIFIED;
49         geta2 = GETA2;
50         fftWinSize = NOTSPECIFIED;
51         fftThreshold = NOTSPECIFIED;
52         TMorJTT = JTT;
53 }
54
55 static int maxl;
56 static int tsize;
57
58 void seq_grp_nuc( int *grp, char *seq )
59 {
60         int tmp;
61         while( *seq )
62         {
63                 tmp = amino_grp[*seq++];
64                 if( tmp < 4 )
65                         *grp++ = tmp;
66                 else
67                         fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
68         }
69         *grp = END_OF_VEC;
70 }
71
72 void seq_grp( int *grp, char *seq )
73 {
74         int tmp;
75         while( *seq )
76         {
77                 tmp = amino_grp[*seq++];
78                 if( tmp < 6 )
79                         *grp++ = tmp;
80                 else
81                         fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
82         }
83         *grp = END_OF_VEC;
84 }
85
86 void makecompositiontable_p( short *table, int *pointt )
87 {
88         int point;
89
90         while( ( point = *pointt++ ) != END_OF_VEC )
91                 table[point]++;
92 }
93
94 int commonsextet_p( short *table, int *pointt )
95 {
96         int value = 0;
97         short tmp;
98         int point;
99         static short *memo = NULL;
100         static int *ct = NULL;
101         static int *cp;
102
103         if( !memo )
104         {
105                 memo = calloc( tsize, sizeof( short ) );
106                 if( !memo ) ErrorExit( "Cannot allocate memo\n" );
107                 ct = calloc( MIN( maxl, tsize )+1, sizeof( int ) );
108                 if( !ct ) ErrorExit( "Cannot allocate memo\n" );
109         }
110
111         cp = ct;
112         while( ( point = *pointt++ ) != END_OF_VEC )
113         {
114                 tmp = memo[point]++;
115                 if( tmp < table[point] )
116                         value++;
117                 if( tmp == 0 ) *cp++ = point;
118         }
119         *cp = END_OF_VEC;
120         
121         cp =  ct;
122         while( *cp != END_OF_VEC )
123                 memo[*cp++] = 0;
124
125         return( value );
126 }
127
128 void makepointtable_nuc( int *pointt, int *n )
129 {
130         int point;
131         register int *p;
132
133         p = n;
134         point  = *n++ *  1024;
135         point += *n++ *   256;
136         point += *n++ *    64;
137         point += *n++ *    16;
138         point += *n++ *     4;
139         point += *n++;
140         *pointt++ = point;
141
142         while( *n != END_OF_VEC )
143         {
144                 point -= *p++ * 1024;
145                 point *= 4;
146                 point += *n++;
147                 *pointt++ = point;
148         }
149         *pointt = END_OF_VEC;
150 }
151
152 void makepointtable( int *pointt, int *n )
153 {
154         int point;
155         register int *p;
156
157         p = n;
158         point  = *n++ *  7776;
159         point += *n++ *  1296;
160         point += *n++ *   216;
161         point += *n++ *    36;
162         point += *n++ *     6;
163         point += *n++;
164         *pointt++ = point;
165
166         while( *n != END_OF_VEC )
167         {
168                 point -= *p++ * 7776;
169                 point *= 6;
170                 point += *n++;
171                 *pointt++ = point;
172         }
173         *pointt = END_OF_VEC;
174 }
175
176 void treebase( char name[M][B], int nlen[M], char **seq, char **aseq, char **mseq1, char **mseq2, int ***topol, double *effarr, int alloclen )
177 {
178         int i, j, l;
179         int clus1, clus2;
180         int s1, s2, r1, r2;
181         float pscore, tscore;
182         static char *indication1, *indication2;
183         FILE *trap;
184         static char name1[M][B], name2[M][B];
185         static double *effarr1 = NULL;
186         static double *effarr2 = NULL;
187         float dumfl = 0.0;
188         double total;
189         int intdum;
190
191
192         if( effarr1 == NULL ) 
193         {
194                 effarr1 = AllocateDoubleVec( njob );
195                 effarr2 = AllocateDoubleVec( njob );
196                 indication1 = AllocateCharVec( 150 );
197                 indication2 = AllocateCharVec( 150 );
198         }
199
200 #if 0
201         fprintf( stderr, "##### fftwinsize = %d, fftthreshold = %d\n", fftWinSize, fftThreshold );
202 #endif
203
204 #if 0
205         for( i=0; i<njob; i++ )
206                 fprintf( stderr, "TBFAST effarr[%d] = %f\n", i, effarr[i] );
207 #endif
208
209         for( i=0; i<njob; i++ ) strcpy( aseq[i], seq[i] );
210
211
212
213 //      writePre( njob, name, nlen, aseq, 0 );
214
215         tscore = 0.0;
216         for( l=0; l<njob-1; l++ )
217         {
218
219                 clus1 = fastconjuction( topol[l][0], aseq, mseq1, effarr1, effarr, name, name1, indication1 );
220                 clus2 = fastconjuction( topol[l][1], aseq, mseq2, effarr2, effarr, name, name2, indication2 );
221
222
223                 fprintf( stderr, "STEP %d /%d\r", l+1, njob-1 );
224 //              fprintf( stderr, "STEP %d /%d\n", l+1, njob-1 );
225 //              fprintf( stderr, "group1 = %.66s", indication1 );
226 //              if( strlen( indication1 ) > 66 ) fprintf( stderr, "..." );
227 //              fprintf( stderr, "\n" );
228 //              fprintf( stderr, "group2 = %.66s", indication2 );
229 //              if( strlen( indication2 ) > 66 ) fprintf( stderr, "..." );
230 //              fprintf( stderr, "\n" );
231 //              for( i=0; i<clus1; i++ ) fprintf( stderr, "## STEP%d-eff for mseq1-%d %f\n", l+1, i, effarr1[i] );
232
233 /*
234                 fprintf( stderr, "before align all\n" );
235                 display( aseq, njob );
236                 fprintf( stderr, "\n" );
237                 fprintf( stderr, "before align 1 %s \n", indication1 );
238                 display( mseq1, clus1 );
239                 fprintf( stderr, "\n" );
240                 fprintf( stderr, "before align 2 %s \n", indication2 );
241                 display( mseq2, clus2 );
242                 fprintf( stderr, "\n" );
243 */
244
245                 if( use_fft )
246                 {
247                         if( alg == 'M' )
248                                 pscore = Falign_noudp( mseq1, mseq2, effarr1, effarr2, clus1, clus2, alloclen, &intdum );
249                         else
250                                 pscore = Falign( mseq1, mseq2, effarr1, effarr2, clus1, clus2, alloclen, &intdum );
251                 }
252                 else
253                 {
254                         switch( alg )
255                         {
256                                 case( 'a' ):
257                                         pscore = Aalign( mseq1, mseq2, effarr1, effarr2, clus1, clus2, alloclen );
258                                         break;
259                                 case( 'M' ):
260                                         pscore = MSalignmm( mseq1, mseq2, effarr1, effarr2, clus1, clus2, alloclen, NULL, NULL, NULL, NULL );
261                                         break;
262                                 case( 'A' ):
263                                         if( clus1 == 1 && clus2 == 1 )
264                                         {
265                                                 pscore = G__align11( mseq1, mseq2, alloclen );
266                                         }
267                                         else
268                                         {
269                                                 pscore = A__align( mseq1, mseq2, effarr1, effarr2, clus1, clus2, alloclen, NULL, &dumfl, NULL, NULL, NULL, NULL );
270                                         }
271                                         break;
272                                 default:
273                                         ErrorExit( "ERROR IN SOURCE FILE" );
274                         }
275                 }
276 #if SCOREOUT
277                 fprintf( stderr, "score = %10.2f\n", pscore );
278 #endif
279                 tscore += pscore;
280 /*
281                 fprintf( stderr, "after align 1 %s \n", indication1 );
282                 display( mseq1, clus1 );
283                 fprintf( stderr, "\n" );
284                 fprintf( stderr, "after align 2 %s \n", indication2 );
285                 display( mseq2, clus2 );
286                 fprintf( stderr, "\n" );
287 */
288
289 //              writePre( njob, name, nlen, aseq, 0 );
290
291                 if( disp ) display( aseq, njob );
292 //              fprintf( stderr, "\n" );
293         }
294 #if SCOREOUT
295         fprintf( stderr, "totalscore = %10.2f\n\n", tscore );
296 #endif
297 }
298
299 static void WriteOptions( FILE *fp )
300 {
301
302         if( dorp == 'd' ) fprintf( fp, "DNA\n" );
303         else
304         {
305                 if     ( scoremtx ==  0 ) fprintf( fp, "JTT %dPAM\n", pamN );
306                 else if( scoremtx ==  1 ) fprintf( fp, "BLOSUM %d\n", nblosum );
307                 else if( scoremtx ==  2 ) fprintf( fp, "M-Y\n" );
308         }
309     fprintf( stderr, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
310     if( use_fft ) fprintf( fp, "FFT on\n" );
311
312         fprintf( fp, "tree-base method\n" );
313         if( tbrweight == 0 ) fprintf( fp, "unweighted\n" );
314         else if( tbrweight == 3 ) fprintf( fp, "clustalw-like weighting\n" );
315         if( tbitr || tbweight ) 
316         {
317                 fprintf( fp, "iterate at each step\n" );
318                 if( tbitr && tbrweight == 0 ) fprintf( fp, "  unweighted\n" ); 
319                 if( tbitr && tbrweight == 3 ) fprintf( fp, "  reversely weighted\n" ); 
320                 if( tbweight ) fprintf( fp, "  weighted\n" ); 
321                 fprintf( fp, "\n" );
322         }
323
324          fprintf( fp, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
325
326         if( alg == 'a' )
327                 fprintf( fp, "Algorithm A\n" );
328         else if( alg == 'A' ) 
329                 fprintf( fp, "Algorithm A+\n" );
330         else if( alg == 'S' ) 
331                 fprintf( fp, "Apgorithm S\n" );
332         else if( alg == 'C' ) 
333                 fprintf( fp, "Apgorithm A+/C\n" );
334         else
335                 fprintf( fp, "Unknown algorithm\n" );
336
337         if( treemethod == 'x' )
338                 fprintf( fp, "Tree = UPGMA (3).\n" );
339         else if( treemethod == 's' )
340                 fprintf( fp, "Tree = UPGMA (2).\n" );
341         else if( treemethod == 'p' )
342                 fprintf( fp, "Tree = UPGMA (1).\n" );
343         else
344                 fprintf( fp, "Unknown tree.\n" );
345
346     if( use_fft )
347     {
348         fprintf( fp, "FFT on\n" );
349         if( dorp == 'd' )
350             fprintf( fp, "Basis : 4 nucleotides\n" );
351         else
352         {
353             if( fftscore )
354                 fprintf( fp, "Basis : Polarity and Volume\n" );
355             else
356                 fprintf( fp, "Basis : 20 amino acids\n" );
357         }
358         fprintf( fp, "Threshold   of anchors = %d%%\n", fftThreshold );
359         fprintf( fp, "window size of anchors = %dsites\n", fftWinSize );
360     }
361         else
362         fprintf( fp, "FFT off\n" );
363         fflush( fp );
364 }
365          
366
367 int disttbfast( char **in, int nlen[M], char name[M][B] )
368 {
369         static char **seq;
370         static char **mseq1, **mseq2;
371         static char **aseq;
372         static char **bseq;
373         static int **pscore;
374         static double *eff;
375         int i, j;
376         int identity;
377         static int ***topol;
378         static double **len;
379         FILE *prep;
380         FILE *infp;
381         char c;
382         int alloclen;
383         double total;
384
385         FILE *fp;
386         FILE *orderfp;
387         int *grpseq;
388         char *tmpseq;
389         int  **pointt;
390         double **mtx;
391         double **mtx2; 
392         int score0; 
393         static short *table1;
394         char b[B];
395
396         arguments();
397
398         if( njob < 2 )
399         {
400                 fprintf( stderr, "At least 2 sequences should be input!\n"
401                                                  "Only %d sequence found.\n", njob ); 
402                 exit( 1 );
403         }
404
405         seq = in;
406         aseq = AllocateCharMtx( njob, nlenmax*5+1 );
407         bseq = AllocateCharMtx( njob, nlenmax*5+1 );
408         mseq1 = AllocateCharMtx( njob, 0 );
409         mseq2 = AllocateCharMtx( njob, 0 );
410         alloclen = nlenmax*5;
411
412         topol = AllocateIntCub( njob, 2, njob );
413         len = AllocateDoubleMtx( njob, 2 );
414         pscore = AllocateIntMtx( njob, njob );
415         eff = AllocateDoubleVec( njob );
416
417
418
419         constants( njob, seq );
420
421
422 #if 0
423         fprintf( stderr, "params = %d, %d, %d\n", penalty, penalty_ex, offset );
424 #endif
425
426         initSignalSM();
427
428         initFiles();
429
430
431         c = seqcheck( seq );
432         if( c )
433         {
434                 fprintf( stderr, "Illeagal character %c\n", c );
435                 exit( 1 );
436         }
437
438         fprintf( stderr, "\n" );
439
440 //      writePre( njob, name, nlen, seq, 0 );
441         if( makedistmtx )
442         {
443                 fprintf( stderr, "Making a distance matrix ..\n" );
444
445             tmpseq = AllocateCharVec( nlenmax+1 );
446                 grpseq = AllocateIntVec( nlenmax+1 );
447                 pointt = AllocateIntMtx( njob, nlenmax+1 );
448         mtx = AllocateDoubleMtx( njob, njob ); 
449                 if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
450                 else              tsize = (int)pow( 6, 6 );
451
452                 maxl = 0;
453                 for( i=0; i<njob; i++ ) 
454                 {
455                         gappick0( tmpseq, seq[i] );
456                         nlen[i] = strlen( tmpseq );
457                         if( nlen[i] < 6 )
458                         {
459                                 fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] );
460                                 exit( 1 );
461                         }
462                         if( nlen[i] > maxl ) maxl = nlen[i];
463                         if( dorp == 'd' ) /* nuc */
464                         {
465                                 seq_grp_nuc( grpseq, tmpseq );
466                                 makepointtable_nuc( pointt[i], grpseq );
467                         }
468                         else                 /* amino */
469                         {
470                                 seq_grp( grpseq, tmpseq );
471                                 makepointtable( pointt[i], grpseq );
472                         }
473                 }
474                 for( i=0; i<njob; i++ )
475                 {
476                         table1 = calloc( tsize, sizeof( short ) );
477                         if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
478                         if( i % 10 == 0 )
479                         {
480                                 fprintf( stderr, "%#4d / %#4d\r", i+1, njob );
481                         }
482                         makecompositiontable_p( table1, pointt[i] );
483         
484                         for( j=i; j<njob; j++ ) 
485                         {
486                                 mtx[i][j] = commonsextet_p( table1, pointt[j] );
487                         } 
488                         free( table1 );
489                 }
490                 for( i=0; i<njob; i++ )
491                 {
492                         score0 = mtx[i][i];
493                         for( j=0; j<njob; j++ ) 
494                                 pscore[i][j] = (int)( ( score0 - mtx[MIN(i,j)][MAX(i,j)] ) / score0 * 3 * INTMTXSCALE + 0.5 );
495                 }
496                 for( i=0; i<njob-1; i++ ) for( j=i+1; j<njob; j++ ) 
497                 {
498                         pscore[i][j] = MIN( pscore[i][j], pscore[j][i] );
499                 }
500         
501                 if( disopt )
502                 {
503                         for( i=0; i<njob; i++ ) 
504                         {
505                                 sprintf( b, "=lgth = %#04d", nlen[i] );
506                                 strins( b, name[i] );
507                         }
508                 }
509                 FreeDoubleMtx( mtx );
510                 free( tmpseq );
511                 free( grpseq );
512                 FreeIntMtx( pointt );
513                 fprintf( stderr, "\ndone.\n\n" );
514         }
515         else if( tbutree == 0 )
516         {
517                 for( i=1; i<njob; i++ ) 
518                 {
519                         if( nlen[i] != nlen[0] ) 
520                         {
521                                 fprintf( stderr, "Input pre-aligned seqences or make hat2.\n" );
522                                 exit( 1 );
523                         }
524                 }
525                 for( i=0; i<njob-1; i++ ) for( j=i+1; j<njob; j++ ) 
526                 {
527                 /*
528                         pscore[i][j] = (double)score_calc1( seq[i], seq[j] );
529                 */
530                         pscore[i][j] = (int)( substitution_hosei( seq[i], seq[j] ) * INTMTXSCALE + 0.5 );
531                 }
532         }
533         else
534         {
535 #if 1
536                 fprintf( stderr, "Loading 'hat2' ... " );
537                 prep = fopen( "hat2", "r" );
538                 if( prep == NULL ) ErrorExit( "Make hat2." );
539                 readhat2_int( prep, njob, name, pscore );
540                 fclose( prep );
541                 fprintf( stderr, "done.\n" );
542 #endif
543         }
544 #if 0
545                 prep = fopen( "hat2_check", "w" );
546                 WriteHat2_int( prep, njob, name, pscore );
547                 fclose( prep );
548 #endif
549
550         fprintf( stderr, "Constructing dendrogram ... " );
551         if( treemethod == 'x' )
552         {
553                 veryfastsupg_int( njob, pscore, topol, len );
554         }
555         else 
556                 ErrorExit( "Incorrect tree\n" );
557         fprintf( stderr, "\ndone.\n\n" );
558
559         orderfp = fopen( "order", "w" );
560         if( !orderfp )
561         {
562                 fprintf( stderr, "Cannot open 'order'\n" );
563                 exit( 1 );
564         }
565         for( i=0; (j=topol[njob-2][0][i])!=-1; i++ )
566         {
567                 fprintf( orderfp, "%d\n", j );
568         }
569         for( i=0; (j=topol[njob-2][1][i])!=-1; i++ )
570         {
571                 fprintf( orderfp, "%d\n", j );
572         }
573         fclose( orderfp );
574         
575
576         for( i=0; i<njob; i++ )
577         {
578                 len[i][0] /= INTMTXSCALE;
579                 len[i][1] /= INTMTXSCALE;
580         }
581
582         if( tbrweight )
583         {
584                 weight = 3; 
585 #if 0
586                 utree = 0; counteff( njob, topol, len, eff ); utree = 1;
587 #else
588                 counteff_simple( njob, topol, len, eff );
589 #endif
590         }
591         else
592         {
593                 for( i=0; i<njob; i++ ) eff[i] = 1.0;
594         }
595
596
597         for( i=0; i<njob; i++ ) gappick0( bseq[i], seq[i] );
598
599         FreeIntMtx( pscore );
600         FreeDoubleMtx( len );
601         fprintf( stderr, "Progressive alignment ... \n" );
602         treebase( name, nlen, bseq, aseq, mseq1, mseq2, topol, eff, alloclen );
603         fprintf( stderr, "\ndone.\n\n" );
604
605 #if DEBUG
606         fprintf( stderr, "writing alignment to stdout\n" );
607 #endif
608
609
610         for( i=0; i<njob; i++ ) strcpy( seq[i], aseq[i] );
611
612         fprintf( stderr, "mafft v%s\n", VERSION );
613         return( 0 );
614 }