14 static float lenfaca, lenfacb, lenfacc, lenfacd;
16 #define PLENFACA 0.0123
17 #define PLENFACB 10252
18 #define PLENFACC 10822
26 #define PLENFACB 10000
27 #define PLENFACC 10000
36 void arguments( int argc, char *argv[] )
79 ppenalty_ex = NOTSPECIFIED;
81 kimuraR = NOTSPECIFIED;
84 fftWinSize = NOTSPECIFIED;
85 fftThreshold = NOTSPECIFIED;
88 while( --argc > 0 && (*++argv)[0] == '-' )
90 while ( (c = *++argv[0]) )
96 fprintf( stderr, "inputfile = %s\n", inputfile );
100 ppenalty = (int)( atof( *++argv ) * 1000 - 0.5 );
101 // fprintf( stderr, "ppenalty = %d\n", ppenalty );
105 ppenalty_ex = (int)( atof( *++argv ) * 1000 - 0.5 );
106 fprintf( stderr, "ppenalty_ex = %d\n", ppenalty_ex );
110 poffset = (int)( atof( *++argv ) * 1000 - 0.5 );
111 // fprintf( stderr, "poffset = %d\n", poffset );
115 kimuraR = atoi( *++argv );
116 fprintf( stderr, "kappa = %d\n", kimuraR );
120 nblosum = atoi( *++argv );
122 // fprintf( stderr, "blosum %d / kimura 200 \n", nblosum );
126 pamN = atoi( *++argv );
129 fprintf( stderr, "jtt/kimura %d\n", pamN );
133 pamN = atoi( *++argv );
136 fprintf( stderr, "tm %d\n", pamN );
177 treemethod = 'X'; // mix
180 treemethod = 'E'; // upg (average)
183 treemethod = 'q'; // minimum
246 fftThreshold = atoi( *++argv );
250 fftWinSize = atoi( *++argv );
257 fprintf( stderr, "illegal option %c\n", c );
267 cut = atof( (*argv) );
272 fprintf( stderr, "options: Check source file !\n" );
275 if( tbitr == 1 && outgap == 0 )
277 fprintf( stderr, "conflicting options : o, m or u\n" );
280 if( alg == 'C' && outgap == 0 )
282 fprintf( stderr, "conflicting options : C, o\n" );
290 void seq_grp_nuc( int *grp, char *seq )
296 tmp = amino_grp[(int)*seq++];
300 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
303 if( grp - grpbk < 6 )
305 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
311 void seq_grp( int *grp, char *seq )
317 tmp = amino_grp[(int)*seq++];
321 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
324 if( grp - grpbk < 6 )
326 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
332 void makecompositiontable_p( short *table, int *pointt )
336 while( ( point = *pointt++ ) != END_OF_VEC )
340 int commonsextet_p( short *table, int *pointt )
345 static short *memo = NULL;
346 static int *ct = NULL;
354 memo = (short *)calloc( tsize, sizeof( short ) );
355 if( !memo ) ErrorExit( "Cannot allocate memo\n" );
356 ct = (int *)calloc( MIN( maxl, tsize )+1, sizeof( int ) );
357 if( !ct ) ErrorExit( "Cannot allocate memo\n" );
361 while( ( point = *pointt++ ) != END_OF_VEC )
364 if( tmp < table[point] )
366 if( tmp == 0 ) *cp++ = point;
371 while( *cp != END_OF_VEC )
377 void makepointtable_nuc( int *pointt, int *n )
397 while( *n != END_OF_VEC )
399 point -= *p++ * 1024;
404 *pointt = END_OF_VEC;
407 void makepointtable( int *pointt, int *n )
420 point += *n++ * 1296;
427 while( *n != END_OF_VEC )
429 point -= *p++ * 7776;
434 *pointt = END_OF_VEC;
437 void treebase( int *nlen, char **aseq, char **mseq1, char **mseq2, int ***topol, double *effarr, int *alloclen )
441 float pscore, tscore;
442 static char *indication1, *indication2;
443 static double *effarr1 = NULL;
444 static double *effarr2 = NULL;
445 static int *fftlog; // fixed at 2006/07/26
454 if( effarr1 == NULL )
456 effarr1 = AllocateDoubleVec( njob );
457 effarr2 = AllocateDoubleVec( njob );
458 indication1 = AllocateCharVec( 150 );
459 indication2 = AllocateCharVec( 150 );
460 fftlog = AllocateIntVec( njob );
462 for( l=0; l<njob; l++ ) fftlog[l] = 1;
465 fprintf( stderr, "##### fftwinsize = %d, fftthreshold = %d\n", fftWinSize, fftThreshold );
469 for( i=0; i<njob; i++ )
470 fprintf( stderr, "TBFAST effarr[%d] = %f\n", i, effarr[i] );
473 // for( i=0; i<njob; i++ ) strcpy( aseq[i], seq[i] );
476 // writePre( njob, name, nlen, aseq, 0 );
479 for( l=0; l<njob-1; l++ )
483 len1 = strlen( aseq[m1] );
484 len2 = strlen( aseq[m2] );
485 if( *alloclen < len1 + len2 )
487 fprintf( stderr, "\nReallocating.." );
488 *alloclen = ( len1 + len2 ) + 1000;
489 ReallocateCharMtx( aseq, njob, *alloclen + 10 );
490 fprintf( stderr, "done. *alloclen = %d\n", *alloclen );
494 clus1 = fastconjuction_noname( topol[l][0], aseq, mseq1, effarr1, effarr, indication1 );
495 clus2 = fastconjuction_noname( topol[l][1], aseq, mseq2, effarr2, effarr, indication2 );
497 clus1 = fastconjuction_noweight( topol[l][0], aseq, mseq1, effarr1, indication1 );
498 clus2 = fastconjuction_noweight( topol[l][1], aseq, mseq2, effarr2, indication2 );
503 for( i=0; i<clus1; i++ )
505 if( strlen( mseq1[i] ) != len1 )
507 fprintf( stderr, "i = %d / %d\n", i, clus1 );
508 fprintf( stderr, "hairetsu ga kowareta (in treebase, after conjuction) !\n" );
512 for( j=0; j<clus2; j++ )
514 if( strlen( mseq2[j] ) != len2 )
516 fprintf( stderr, "j = %d / %d\n", j, clus2 );
517 fprintf( stderr, "hairetsu ga kowareta (in treebase, after conjuction) !\n" );
527 for( i=0; i<clus1; i++ )
529 if( strlen( mseq1[i] ) != len1 )
531 fprintf( stderr, "i = %d / %d\n", i, clus1 );
532 fprintf( stderr, "hairetsu ga kowareta (in treebase, after free topol) !\n" );
536 for( j=0; j<clus2; j++ )
538 if( strlen( mseq2[j] ) != len2 )
540 fprintf( stderr, "j = %d / %d\n", j, clus2 );
541 fprintf( stderr, "hairetsu ga kowareta (in treebase, after free topol) !\n" );
548 // fprintf( trap_g, "\nSTEP-%d\n", l );
549 // fprintf( trap_g, "group1 = %s\n", indication1 );
550 // fprintf( trap_g, "group2 = %s\n", indication2 );
552 // fprintf( stderr, "\rSTEP % 5d / %d %d-%d", l+1, njob-1, clus1, clus2 );
553 fprintf( stderr, "\rSTEP % 5d / %d ", l+1, njob-1 );
556 fprintf( stderr, "STEP %d /%d\n", l+1, njob-1 );
557 fprintf( stderr, "group1 = %.66s", indication1 );
558 if( strlen( indication1 ) > 66 ) fprintf( stderr, "..." );
559 fprintf( stderr, "\n" );
560 fprintf( stderr, "group2 = %.66s", indication2 );
561 if( strlen( indication2 ) > 66 ) fprintf( stderr, "..." );
562 fprintf( stderr, "\n" );
566 fprintf( stderr, "before align all\n" );
567 display( aseq, njob );
568 fprintf( stderr, "\n" );
569 fprintf( stderr, "before align 1 %s \n", indication1 );
570 display( mseq1, clus1 );
571 fprintf( stderr, "\n" );
572 fprintf( stderr, "before align 2 %s \n", indication2 );
573 display( mseq2, clus2 );
574 fprintf( stderr, "\n" );
578 if( !nevermemsave && ( alg != 'M' && ( len1 > 10000 || len2 > 10000 ) ) )
580 fprintf( stderr, "\nlen1=%d, len2=%d, Switching to the memsave mode\n", len1, len2 );
582 if( commonIP ) FreeIntMtx( commonIP );
587 // if( fftlog[m1] && fftlog[m2] ) ffttry = ( nlen[m1] > clus1 && nlen[m2] > clus2 );
588 if( fftlog[m1] && fftlog[m2] ) ffttry = ( nlen[m1] > clus1 && nlen[m2] > clus2 && clus1 < 1000 && clus2 < 1000);
590 // ffttry = ( nlen[m1] > clus1 && nlen[m2] > clus2 && clus1 < 5000 && clus2 < 5000); // v6.708
591 // fprintf( stderr, "f=%d, len1/fftlog[m1]=%f, clus1=%d, len2/fftlog[m2]=%f, clus2=%d\n", ffttry, (float)len1/fftlog[m1], clus1, (float)len2/fftlog[m2], clus2 );
593 if( force_fft || ( use_fft && ffttry ) )
595 fprintf( stderr, "f" );
598 fprintf( stderr, "m" );
599 // fprintf( stderr, "%d-%d", clus1, clus2 );
600 pscore = Falign_udpari_long( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, fftlog+m1 );
604 // fprintf( stderr, "%d-%d", clus1, clus2 );
605 pscore = Falign( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, fftlog+m1 );
610 fprintf( stderr, "d" );
615 pscore = Aalign( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen );
618 fprintf( stderr, "m" );
619 // fprintf( stderr, "%d-%d", clus1, clus2 );
620 pscore = MSalignmm( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, NULL, NULL, NULL );
623 if( clus1 == 1 && clus2 == 1 && 0 )
625 // fprintf( stderr, "%d-%d", clus1, clus2 );
626 pscore = G__align11( mseq1, mseq2, *alloclen );
630 pscore = Q__align( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, &dumfl, NULL, NULL, NULL, NULL );
634 pscore = R__align( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, &dumfl, NULL, NULL, NULL, NULL );
637 pscore = H__align( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, &dumfl, NULL, NULL, NULL, NULL );
640 if( clus1 == 1 && clus2 == 1 )
642 // fprintf( stderr, "%d-%d", clus1, clus2 );
643 pscore = G__align11( mseq1, mseq2, *alloclen );
647 // fprintf( stderr, "%d-%d", clus1, clus2 );
648 pscore = A__align( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, &dumfl, NULL, NULL, NULL, NULL );
652 ErrorExit( "ERROR IN SOURCE FILE" );
656 fprintf( stderr, "score = %10.2f\n", pscore );
659 nlen[m1] = 0.5 * ( nlen[m1] + nlen[m2] );
661 // writePre( njob, name, nlen, aseq, 0 );
663 if( disp ) display( aseq, njob );
664 // fprintf( stderr, "\n" );
667 fprintf( stderr, "totalscore = %10.2f\n\n", tscore );
671 static void WriteOptions( FILE *fp )
674 if( dorp == 'd' ) fprintf( fp, "DNA\n" );
677 if ( scoremtx == 0 ) fprintf( fp, "JTT %dPAM\n", pamN );
678 else if( scoremtx == 1 ) fprintf( fp, "BLOSUM %d\n", nblosum );
679 else if( scoremtx == 2 ) fprintf( fp, "M-Y\n" );
681 fprintf( stderr, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
682 if( use_fft ) fprintf( fp, "FFT on\n" );
684 fprintf( fp, "tree-base method\n" );
685 if( tbrweight == 0 ) fprintf( fp, "unweighted\n" );
686 else if( tbrweight == 3 ) fprintf( fp, "clustalw-like weighting\n" );
687 if( tbitr || tbweight )
689 fprintf( fp, "iterate at each step\n" );
690 if( tbitr && tbrweight == 0 ) fprintf( fp, " unweighted\n" );
691 if( tbitr && tbrweight == 3 ) fprintf( fp, " reversely weighted\n" );
692 if( tbweight ) fprintf( fp, " weighted\n" );
696 fprintf( fp, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
699 fprintf( fp, "Algorithm A\n" );
700 else if( alg == 'A' )
701 fprintf( fp, "Algorithm A+\n" );
702 else if( alg == 'S' )
703 fprintf( fp, "Apgorithm S\n" );
704 else if( alg == 'C' )
705 fprintf( fp, "Apgorithm A+/C\n" );
707 fprintf( fp, "Unknown algorithm\n" );
709 if( treemethod == 'X' )
710 fprintf( fp, "Tree = UPGMA (mix).\n" );
711 else if( treemethod == 'E' )
712 fprintf( fp, "Tree = UPGMA (average).\n" );
713 else if( treemethod == 'q' )
714 fprintf( fp, "Tree = Minimum linkage.\n" );
716 fprintf( fp, "Unknown tree.\n" );
720 fprintf( fp, "FFT on\n" );
722 fprintf( fp, "Basis : 4 nucleotides\n" );
726 fprintf( fp, "Basis : Polarity and Volume\n" );
728 fprintf( fp, "Basis : 20 amino acids\n" );
730 fprintf( fp, "Threshold of anchors = %d%%\n", fftThreshold );
731 fprintf( fp, "window size of anchors = %dsites\n", fftWinSize );
734 fprintf( fp, "FFT off\n" );
739 int main( int argc, char *argv[] )
742 static int *nogaplen;
743 static char **name, **seq;
744 static char **mseq1, **mseq2;
753 float longer, shorter;
757 FILE *orderfp, *hat2p;
761 float **mtx = NULL; // by D. Mathog
762 static short *table1;
768 arguments( argc, argv );
772 infp = fopen( inputfile, "r" );
775 fprintf( stderr, "Cannot open %s\n", inputfile );
787 fprintf( stderr, "The number of sequences must be < %d\n", 20000 );
788 fprintf( stderr, "Please try the --parttree option for such large data.\n" );
794 fprintf( stderr, "At least 2 sequences should be input!\n"
795 "Only %d sequence found.\n", njob );
799 seq = AllocateCharMtx( njob, nlenmax*1+1 );
800 mseq1 = AllocateCharMtx( njob, 0 );
801 mseq2 = AllocateCharMtx( njob, 0 );
803 topol = AllocateIntCub( njob, 2, 0 );
804 len = AllocateFloatMtx( njob, 2 );
805 eff = AllocateDoubleVec( njob );
809 Read( name, nlen, seq );
810 readData( infp, name, nlen, seq );
812 name = AllocateCharMtx( njob, B+1 );
813 nlen = AllocateIntVec( njob );
814 nogaplen = AllocateIntVec( njob );
815 readData_pointer( infp, name, nlen, seq );
819 constants( njob, seq );
823 fprintf( stderr, "params = %d, %d, %d\n", penalty, penalty_ex, offset );
830 WriteOptions( trap_g );
835 fprintf( stderr, "Illegal character %c\n", c );
839 fprintf( stderr, "\n" );
843 fprintf( stderr, "\n\nMaking a distance matrix ..\n" );
845 tmpseq = AllocateCharVec( nlenmax+1 );
846 grpseq = AllocateIntVec( nlenmax+1 );
847 pointt = AllocateIntMtx( njob, nlenmax+1 );
848 mtx = AllocateFloatHalfMtx( njob );
849 if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
850 else tsize = (int)pow( 6, 6 );
868 for( i=0; i<njob; i++ )
870 gappick0( tmpseq, seq[i] );
871 nogaplen[i] = strlen( tmpseq );
872 if( nogaplen[i] < 6 )
874 // fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nogaplen[i] );
875 // fprintf( stderr, "Please use mafft-ginsi, mafft-linsi or mafft-ginsi\n\n\n" );
878 if( nogaplen[i] > maxl ) maxl = nogaplen[i];
879 if( dorp == 'd' ) /* nuc */
881 seq_grp_nuc( grpseq, tmpseq );
882 makepointtable_nuc( pointt[i], grpseq );
886 seq_grp( grpseq, tmpseq );
887 makepointtable( pointt[i], grpseq );
890 for( i=0; i<njob; i++ )
892 table1 = (short *)calloc( tsize, sizeof( short ) );
893 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
896 fprintf( stderr, "\r% 5d / %d", i+1, njob );
898 makecompositiontable_p( table1, pointt[i] );
900 for( j=i; j<njob; j++ )
902 mtx[i][j-i] = (float)commonsextet_p( table1, pointt[j] );
906 fprintf( stderr, "\ndone.\n\n" );
909 for( i=0; i<ien; i++ )
911 for( j=i+1; j<njob; j++ )
913 if( nogaplen[i] > nogaplen[j] )
915 longer=(float)nogaplen[i];
916 shorter=(float)nogaplen[j];
920 longer=(float)nogaplen[j];
921 shorter=(float)nogaplen[i];
923 // lenfac = 3.0 / ( LENFACA + LENFACB / ( longer + LENFACC ) + shorter / longer * LENFACD );
924 lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
926 // fprintf( stderr, "lenfac = %f (%.0f,%.0f)\n", lenfac, longer, shorter );
927 bunbo = MIN( mtx[i][0], mtx[j][0] );
931 mtx[i][j-i] = ( 1.0 - mtx[i][j-i] / bunbo ) * lenfac;
932 // fprintf( stdout, "##### mtx = %f, mtx[i][0]=%f, mtx[j][0]=%f, bunbo=%f\n", mtx[i][j-i], mtx[i][0], mtx[j][0], bunbo );
937 for( i=0; i<njob; i++ )
939 sprintf( b, "=lgth = %04d", nogaplen[i] );
940 strins( b, name[i] );
945 FreeIntMtx( pointt );
947 #if 1 // writehat2 wo kakinaosu
950 hat2p = fopen( "hat2", "w" );
951 WriteFloatHat2_pointer_halfmtx( hat2p, njob, name, mtx );
958 #if 0 // readhat2 wo kakinaosu
959 fprintf( stderr, "Loading 'hat2' ... " );
960 prep = fopen( "hat2", "r" );
961 if( prep == NULL ) ErrorExit( "Make hat2." );
962 readhat2_float( prep, njob, name, mtx ); // name chuui
964 fprintf( stderr, "done.\n" );
970 fprintf( stderr, "Loading a tree ... " );
971 loadtree( njob, topol, len, name, nogaplen );
975 fprintf( stderr, "Loading a topology ... " );
976 loadtop( njob, mtx, topol, len );
977 FreeFloatHalfMtx( mtx, njob );
981 fprintf( stderr, "Constructing a UPGMA tree ... " );
983 fixed_musclesupg_float_realloc_nobk_halfmtx_treeout( njob, mtx, topol, len, name, nogaplen );
984 // veryfastsupg_float_realloc_nobk_halfmtx_treeout( njob, mtx, topol, len, name, nogaplen );
987 FreeFloatHalfMtx( mtx, njob );
991 fprintf( stderr, "Constructing a UPGMA tree ... " );
992 fixed_musclesupg_float_realloc_nobk_halfmtx( njob, mtx, topol, len );
993 FreeFloatHalfMtx( mtx, njob );
996 // ErrorExit( "Incorrect tree\n" );
997 fprintf( stderr, "\ndone.\n\n" );
999 orderfp = fopen( "order", "w" );
1002 fprintf( stderr, "Cannot open 'order'\n" );
1005 for( i=0; (j=topol[njob-2][0][i])!=-1; i++ )
1007 fprintf( orderfp, "%d\n", j );
1009 for( i=0; (j=topol[njob-2][1][i])!=-1; i++ )
1011 fprintf( orderfp, "%d\n", j );
1015 if( ( treeout || distout ) && noalign )
1017 writeData_pointer( stdout, njob, name, nlen, seq );
1018 fprintf( stderr, "\n" );
1028 utree = 0; counteff( njob, topol, len, eff ); utree = 1;
1030 counteff_simple_float( njob, topol, len, eff );
1035 for( i=0; i<njob; i++ ) eff[i] = 1.0;
1039 for( i=0; i<njob; i++ )
1040 fprintf( stdout, "eff[%d] = %20.16f\n", i, eff[i] );
1045 FreeFloatMtx( len );
1047 bseq = AllocateCharMtx( njob, nlenmax*2+1 );
1048 alloclen = nlenmax*2;
1049 for( i=0; i<njob; i++ ) gappick0( bseq[i], seq[i] );
1051 fprintf( stderr, "Progressive alignment ... \n" );
1052 treebase( nlen, bseq, mseq1, mseq2, topol, eff, &alloclen );
1053 fprintf( stderr, "\ndone.\n\n" );
1055 fprintf( stderr, "closing trap_g\n" );
1059 // writePre( njob, name, nlen, aseq, !contin );
1061 fprintf( stderr, "writing alignment to stdout\n" );
1063 writeData_pointer( stdout, njob, name, nlen, bseq );
1065 writeData( stdout, njob, name, nlen, bseq );
1068 fprintf( stderr, "OSHIMAI\n" );