4 #define PICKSIZE 50 // must be >= 3
7 #define TOKYORIPARA 0.70
10 // kouzoutai ni sasareru pointer ha static
18 //static int doalign = 'f';
28 static int maxdepth = 0;
30 static double lenfaca, lenfacb, lenfacc, lenfacd;
32 #define PLENFACB 10000
33 #define PLENFACC 10000
40 static char datafile[1000];
41 static char queryfile[1000];
42 static char resultfile[1000];
44 typedef struct _scores
52 // char *seq; // reallo
57 int intcompare( const int *a, const int *b )
62 int dcompare( const Scores *a, const Scores *b )
64 if( a->score > b->score ) return 1;
65 else if( a->score < b->score ) return -1;
68 if( a->selfscore < b->selfscore ) return 1;
69 else if( a->selfscore > b->selfscore ) return -1;
72 if( a->orilen < b->orilen ) return 1;
73 else if( a->orilen > b->orilen ) return -1;
79 static void gappickandx0( char *out, char *in )
86 if( (c=*in++) == '-' )
90 else if( amino_n[c] < 4 && amino_n[c] > -1 )
100 if( (c=*in++) == '-' )
102 else if( amino_n[c] < 20 && amino_n[c] > -1 )
111 static int getkouho( int *pickkouho, double prob, int nin, Scores *scores, char **seq ) // 0 < prob < 1
115 int *iptr = pickkouho;
116 for( i=1; i<nin; i++ )
118 if( ( nkouho==0 || rnd() < prob ) && ( scores[i].shimon != scores->shimon || strcmp( seq[scores->numinseq], seq[scores[i].numinseq] ) ) )
121 for( j=0; j<nkouho; j++ )
123 if( scores[i].shimon == scores[pickkouho[j]].shimon || !strcmp( seq[scores[pickkouho[j]].numinseq], seq[scores[i].numinseq] ) )
131 // fprintf( stderr, "ok! nkouho=%d\n", nkouho );
137 // fprintf( stderr, "no! %d-%d\n", 0, scores[i].numinseq );
140 fprintf( stderr, "\ndone\n\n" );
144 static void getblastscoremtx( int **tmpaminodis )
155 static char *tmpname;
160 tmpaminodis['a']['a'] = 1;
161 tmpaminodis['g']['g'] = 1;
162 tmpaminodis['c']['c'] = 1;
163 tmpaminodis['t']['t'] = 1;
169 tmpseq = calloc( 2000, sizeof( char ) );
170 tmpname = calloc( B, sizeof( char ) );
171 resvec = calloc( 1, sizeof( double ) );
173 // fprintf( stderr, "xformatting .. " );
174 dfp = fopen( datafile, "w" );
175 if( !dfp ) ErrorExit( "Cannot open datafile." );
176 sprintf( tmpname, "\0", i );
177 strcpy( tmpseq, "AAAAAAXXXXXX" );
178 strcat( tmpseq, "CCCCCCXXXXXX" );
179 strcat( tmpseq, "DDDDDDXXXXXX" );
180 strcat( tmpseq, "EEEEEEXXXXXX" );
181 strcat( tmpseq, "FFFFFFXXXXXX" );
182 strcat( tmpseq, "GGGGGGXXXXXX" );
183 strcat( tmpseq, "HHHHHHXXXXXX" );
184 strcat( tmpseq, "IIIIIIXXXXXX" );
185 strcat( tmpseq, "KKKKKKXXXXXX" );
186 strcat( tmpseq, "LLLLLLXXXXXX" );
187 strcat( tmpseq, "MMMMMMXXXXXX" );
188 strcat( tmpseq, "NNNNNNXXXXXX" );
189 strcat( tmpseq, "PPPPPPXXXXXX" );
190 strcat( tmpseq, "QQQQQQXXXXXX" );
191 strcat( tmpseq, "RRRRRRXXXXXX" );
192 strcat( tmpseq, "SSSSSSXXXXXX" );
193 strcat( tmpseq, "TTTTTTXXXXXX" );
194 strcat( tmpseq, "VVVVVVXXXXXX" );
195 strcat( tmpseq, "WWWWWWXXXXXX" );
196 strcat( tmpseq, "YYYYYYXXXXXX" );
197 slen = strlen( tmpseq );
198 writeData_pointer( dfp, 1, &tmpname, &slen, &tmpseq );
201 sprintf( com, "formatdb -p f -i %s -o F", datafile );
203 sprintf( com, "formatdb -i %s -o F", datafile );
205 fprintf( stderr, "done.\n" );
207 for( i=0; i<20; i++ )
210 fprintf( stderr, "checking %c\n", aa );
213 sprintf( tmpseq+strlen( tmpseq ), "%c", aa );
214 qfp = fopen( queryfile, "w" );
215 if( !qfp ) ErrorExit( "Cannot open queryfile." );
216 writeData_pointer( qfp, 1, &tmpname, &slen, &tmpseq );
219 sprintf( com, "blastall -b %d -G 10 -E 1 -e 1e10 -p blastp -m 7 -i %s -d %s > %s\0", 1, queryfile, datafile, resultfile );
221 if( res ) ErrorExit( "error in blast" );
223 rfp = fopen( resultfile, "r" );
225 ErrorExit( "file 'fasta.$$' does not exist\n" );
226 res = ReadBlastm7_scoreonly( rfp, resvec, 1 );
227 fprintf( stdout, "%c: %f\n", 'A'+i, *resvec/6 );
229 if( ( (int)*resvec % 6 ) > 0.0 )
231 fprintf( stderr, "Error in blast, *resvec=%f\n", *resvec );
232 fprintf( stderr, "Error in blast, *resvec/6=%f\n", *resvec/6 );
235 tmpaminodis[aa][aa] = (int)( *resvec / 6 );
237 tmpaminodis['X']['X'] = 0;
244 static double *callfasta( char **seq, Scores *scores, int nin, int query, int rewritedata )
252 static char datafile[1000];
253 static char queryfile[1000];
254 static char resultfile[1000];
257 static char *tmpname;
261 static Scores *scoresbk = NULL;
262 static int ninbk = 0;
267 sprintf( datafile, "/tmp/data-%d\0", pid );
268 sprintf( queryfile, "/tmp/query-%d\0", pid );
269 sprintf( resultfile, "/tmp/fasta-%d\0", pid );
271 tmpseq = calloc( nlenmax+1, sizeof( char ) );
272 tmpname = calloc( B+1, sizeof( char ) );
275 val = calloc( nin, sizeof( double ) );
276 // fprintf( stderr, "nin=%d, q=%d\n", nin, query );
282 fprintf( stderr, "\nformatting .. " );
283 dfp = fopen( datafile, "w" );
284 if( !dfp ) ErrorExit( "Cannot open datafile." );
285 for( i=0; i<nin; i++ )
287 // fprintf( stderr, "i=%d / %d / %d\n", i, nin, njob );
288 // fprintf( stderr, "nlenmax = %d\n", nlenmax );
289 // fprintf( stderr, "scores[i].orilen = %d\n", scores[i].orilen );
290 // fprintf( stderr, "strlen( seq[scores[i].numinseq] = %d\n", strlen( seq[scores[i].numinseq] ) );
291 gappick0( tmpseq, seq[scores[i].numinseq] );
292 sprintf( tmpname, "+===========+%d \0", i );
293 slen = scores[i].orilen;
294 writeData_pointer( dfp, 1, &tmpname, &slen, &tmpseq );
300 gappick0( tmpseq, seq[scores[query].numinseq] );
301 sprintf( tmpname, "+==========+%d \0", 0 );
302 slen = scores[query].orilen;
303 qfp = fopen( queryfile, "w" );
304 if( !qfp ) ErrorExit( "Cannot open queryfile." );
305 writeData_pointer( qfp, 1, &tmpname, &slen, &tmpseq );
307 // fprintf( stderr, "q=%s\n", tmpseq );
310 sprintf( com, "fasta34 -z3 -m10 -n -Q -b%d -E%d -d%d %s %s %d > %s\0", M, M, M, queryfile, datafile, 1, resultfile );
312 sprintf( com, "fastea34 -z3 -m10 -Q -b%d -E%d -d%d %s %s %d > %s\0", M, M, M, queryfile, datafile, 6, resultfile );
314 if( res ) ErrorExit( "error in fasta" );
318 rfp = fopen( resultfile, "r" );
320 ErrorExit( "file 'fasta.$$' does not exist\n" );
321 res = ReadBlastm7_scoreonly( rfp, val, nin );
325 for( i=0; i<nin; i++ )
326 fprintf( stderr, "r[%d-%d] = %f\n", 0, i, val[i] );
331 static double *callblast( char **seq, Scores *scores, int nin, int query, int rewritedata )
339 static char datafile[1000];
340 static char queryfile[1000];
341 static char resultfile[1000];
344 static char *tmpname;
348 static Scores *scoresbk = NULL;
349 static int ninbk = 0;
354 sprintf( datafile, "/tmp/data-%d\0", pid );
355 sprintf( queryfile, "/tmp/query-%d\0", pid );
356 sprintf( resultfile, "/tmp/fasta-%d\0", pid );
358 tmpseq = calloc( nlenmax+1, sizeof( char ) );
359 tmpname = calloc( B+1, sizeof( char ) );
362 val = calloc( nin, sizeof( double ) );
363 // fprintf( stderr, "nin=%d, q=%d\n", nin, query );
369 fprintf( stderr, "\nformatting .. " );
370 dfp = fopen( datafile, "w" );
371 if( !dfp ) ErrorExit( "Cannot open datafile." );
372 for( i=0; i<nin; i++ )
374 // fprintf( stderr, "i=%d / %d / %d\n", i, nin, njob );
375 // fprintf( stderr, "nlenmax = %d\n", nlenmax );
376 // fprintf( stderr, "scores[i].orilen = %d\n", scores[i].orilen );
377 // fprintf( stderr, "strlen( seq[scores[i].numinseq] = %d\n", strlen( seq[scores[i].numinseq] ) );
378 gappick0( tmpseq, seq[scores[i].numinseq] );
379 sprintf( tmpname, "+===========+%d \0", i );
380 slen = scores[i].orilen;
381 writeData_pointer( dfp, 1, &tmpname, &slen, &tmpseq );
386 sprintf( com, "formatdb -p f -i %s -o F", datafile );
388 sprintf( com, "formatdb -i %s -o F", datafile );
390 // fprintf( stderr, "done.\n" );
394 gappick0( tmpseq, seq[scores[query].numinseq] );
395 sprintf( tmpname, "+==========+%d \0", 0 );
396 slen = scores[query].orilen;
397 qfp = fopen( queryfile, "w" );
398 if( !qfp ) ErrorExit( "Cannot open queryfile." );
399 writeData_pointer( qfp, 1, &tmpname, &slen, &tmpseq );
401 // fprintf( stderr, "q=%s\n", tmpseq );
403 fprintf( stderr, "\ncalling blast .. \n" );
405 sprintf( com, "blastall -b %d -e 1e10 -p blastn -m 7 -i %s -d %s > %s\0", nin, queryfile, datafile, resultfile );
407 sprintf( com, "blastall -b %d -G 10 -E 1 -e 1e10 -p blastp -m 7 -i %s -d %s > %s\0", nin, queryfile, datafile, resultfile );
409 if( res ) ErrorExit( "error in blast" );
411 rfp = fopen( resultfile, "r" );
413 ErrorExit( "file 'fasta.$$' does not exist\n" );
414 res = ReadBlastm7_scoreonly( rfp, val, nin );
418 for( i=0; i<nin; i++ )
419 fprintf( stderr, "r[%d-%d] = %f\n", 0, i, val[i] );
425 static void selhead( int *ar, int n )
435 if( ( tmp = *ptr++ ) < min )
450 void arguments( int argc, char *argv[] )
492 ppenalty_ex = NOTSPECIFIED;
494 kimuraR = NOTSPECIFIED;
497 fftWinSize = NOTSPECIFIED;
498 fftThreshold = NOTSPECIFIED;
500 classsize = NOTSPECIFIED;
501 picksize = NOTSPECIFIED;
503 while( --argc > 0 && (*++argv)[0] == '-' )
505 while ( c = *++argv[0] )
510 picksize = atoi( *++argv );
511 fprintf( stderr, "picksize = %d\n", picksize );
515 classsize = atoi( *++argv );
516 fprintf( stderr, "groupsize = %d\n", classsize );
521 fprintf( stderr, "inputfile = %s\n", inputfile );
525 ppenalty = (int)( atof( *++argv ) * 1000 - 0.5 );
526 // fprintf( stderr, "ppenalty = %d\n", ppenalty );
530 ppenalty_ex = (int)( atof( *++argv ) * 1000 - 0.5 );
531 fprintf( stderr, "ppenalty_ex = %d\n", ppenalty_ex );
535 poffset = (int)( atof( *++argv ) * 1000 - 0.5 );
536 // fprintf( stderr, "poffset = %d\n", poffset );
540 kimuraR = atoi( *++argv );
541 fprintf( stderr, "kimuraR = %d\n", kimuraR );
545 nblosum = atoi( *++argv );
547 // fprintf( stderr, "blosum %d\n", nblosum );
551 pamN = atoi( *++argv );
554 fprintf( stderr, "jtt %d\n", pamN );
558 pamN = atoi( *++argv );
561 fprintf( stderr, "tm %d\n", pamN );
642 fftThreshold = atoi( *++argv );
646 fftWinSize = atoi( *++argv );
653 fprintf( stderr, "illegal option %c\n", c );
663 cut = atof( (*argv) );
668 fprintf( stderr, "options: Check source file !\n" );
671 if( tbitr == 1 && outgap == 0 )
673 fprintf( stderr, "conflicting options : o, m or u\n" );
676 if( alg == 'C' && outgap == 0 )
678 fprintf( stderr, "conflicting options : C, o\n" );
686 void seq_grp_nuc( int *grp, char *seq )
691 tmp = amino_grp[*seq++];
695 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
700 void seq_grp( int *grp, char *seq )
705 tmp = amino_grp[*seq++];
709 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
714 void makecompositiontable_p( short *table, int *pointt )
718 while( ( point = *pointt++ ) != END_OF_VEC )
722 int commonsextet_p( short *table, int *pointt )
727 static short *memo = NULL;
728 static int *ct = NULL;
733 memo = (short *)calloc( tsize, sizeof( short ) );
734 if( !memo ) ErrorExit( "Cannot allocate memo\n" );
735 ct = (int *)calloc( MIN( maxl, tsize )+1, sizeof( int ) );
736 if( !ct ) ErrorExit( "Cannot allocate memo\n" );
740 while( ( point = *pointt++ ) != END_OF_VEC )
743 if( tmp < table[point] )
745 if( tmp == 0 ) *cp++ = point;
750 while( *cp != END_OF_VEC )
756 void makepointtable_nuc( int *pointt, int *n )
770 while( *n != END_OF_VEC )
772 point -= *p++ * 1024;
777 *pointt = END_OF_VEC;
780 void makepointtable( int *pointt, int *n )
787 point += *n++ * 1296;
794 while( *n != END_OF_VEC )
796 point -= *p++ * 7776;
801 *pointt = END_OF_VEC;
805 static void pairalign( int nseq, int *nlen, char **seq, int *mem1, int *mem2, double *weight, int *alloclen )
811 float pscore, tscore;
813 static char *indication1, *indication2;
814 static double *effarr1 = NULL;
815 static double *effarr2 = NULL;
816 static char **mseq1, **mseq2;
825 if( effarr1 == NULL )
827 fftlog = AllocateIntVec( nseq );
828 effarr1 = AllocateDoubleVec( nseq );
829 effarr2 = AllocateDoubleVec( nseq );
830 indication1 = AllocateCharVec( 150 );
831 indication2 = AllocateCharVec( 150 );
832 mseq1 = AllocateCharMtx( nseq, 0 );
833 mseq2 = AllocateCharMtx( nseq, 0 );
834 for( l=0; l<nseq; l++ ) fftlog[l] = 1;
840 len1 = strlen( seq[m1] );
841 len2 = strlen( seq[m2] );
842 if( *alloclen < len1 + len2 )
844 fprintf( stderr, "\nReallocating.." );
845 *alloclen = ( len1 + len2 ) + 1000;
846 ReallocateCharMtx( seq, nseq, *alloclen + 10 );
847 fprintf( stderr, "done. *alloclen = %d\n", *alloclen );
851 clus1 = fastconjuction_noname( mem1, seq, mseq1, effarr1, weight, indication1 );
852 clus2 = fastconjuction_noname( mem2, seq, mseq2, effarr2, weight, indication2 );
854 clus1 = fastconjuction_noweight( mem1, seq, mseq1, effarr1, indication1 );
855 clus2 = fastconjuction_noweight( mem2, seq, mseq2, effarr2, indication2 );
859 for( i=0; i<clus1; i++ )
860 fprintf( stderr, "in p seq[%d] = %s\n", mem1[i], seq[mem1[i]] );
861 for( i=0; i<clus2; i++ )
862 fprintf( stderr, "in p seq[%d] = %s\n", mem2[i], seq[mem2[i]] );
866 fprintf( stderr, "group1 = %.66s", indication1 );
867 if( strlen( indication1 ) > 66 ) fprintf( stderr, "..." );
868 fprintf( stderr, "\n" );
869 fprintf( stderr, "group2 = %.66s", indication2 );
870 if( strlen( indication2 ) > 66 ) fprintf( stderr, "..." );
871 fprintf( stderr, "\n" );
874 // fprintf( stdout, "mseq1 = %s\n", mseq1[0] );
875 // fprintf( stdout, "mseq2 = %s\n", mseq2[0] );
877 if( !nevermemsave && ( alg != 'M' && ( len1 > 10000 || len2 > 10000 ) ) )
879 fprintf( stderr, "\nlen1=%d, len2=%d, Switching to the memsave mode\n", len1, len2 );
881 if( commonIP ) FreeShortMtx( commonIP );
886 if( fftlog[m1] && fftlog[m2] ) ffttry = ( nlen[m1] > clus1 && nlen[m2] > clus2 );
889 if( force_fft || ( use_fft && ffttry ) )
891 fprintf( stderr, "\bf" );
894 fprintf( stderr, "\bm" );
895 // fprintf( stderr, "%d-%d", clus1, clus2 );
896 pscore = Falign_noudp( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, fftlog+m1 );
900 // fprintf( stderr, "%d-%d", clus1, clus2 );
901 pscore = Falign( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, fftlog+m1 );
906 fprintf( stderr, "\bd" );
911 pscore = Aalign( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen );
914 fprintf( stderr, "\bm" );
915 // fprintf( stderr, "%d-%d", clus1, clus2 );
916 pscore = MSalignmm( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, NULL, NULL, NULL );
919 if( clus1 == 1 && clus2 == 1 )
921 // fprintf( stderr, "%d-%d", clus1, clus2 );
922 pscore = G__align11( mseq1, mseq2, *alloclen );
926 // fprintf( stderr, "%d-%d", clus1, clus2 );
927 pscore = A__align( mseq1, mseq2, effarr1, effarr2, clus1, clus2, *alloclen, NULL, &dumfl, NULL, NULL, NULL, NULL );
931 ErrorExit( "ERROR IN SOURCE FILE" );
935 fprintf( stderr, "score = %10.2f\n", pscore );
937 nlen[m1] = 0.5 * ( nlen[m1] + nlen[m2] );
942 static void treebase( int nseq, int *nlen, char **aseq, double *eff, int nalign, int ***topol, int *alloclen ) // topol
949 for( l=0; l<nlim; l++ )
951 fprintf( stderr, "in treebase, l = %d\n", l );
952 fprintf( stderr, "aseq[0] = %s\n", aseq[0] );
953 fprintf( stderr, "aseq[topol[l][0][0]] = %s\n", aseq[topol[l][0][0]] );
954 pairalign( nseq, nlen, aseq, topol[l][0], topol[l][1], eff, alloclen );
960 static void WriteOptions( FILE *fp )
963 if( dorp == 'd' ) fprintf( fp, "DNA\n" );
966 if ( scoremtx == 0 ) fprintf( fp, "JTT %dPAM\n", pamN );
967 else if( scoremtx == 1 ) fprintf( fp, "BLOSUM %d\n", nblosum );
968 else if( scoremtx == 2 ) fprintf( fp, "M-Y\n" );
970 fprintf( stderr, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
971 if( use_fft ) fprintf( fp, "FFT on\n" );
973 fprintf( fp, "tree-base method\n" );
974 if( tbrweight == 0 ) fprintf( fp, "unweighted\n" );
975 else if( tbrweight == 3 ) fprintf( fp, "clustalw-like weighting\n" );
976 if( tbitr || tbweight )
978 fprintf( fp, "iterate at each step\n" );
979 if( tbitr && tbrweight == 0 ) fprintf( fp, " unweighted\n" );
980 if( tbitr && tbrweight == 3 ) fprintf( fp, " reversely weighted\n" );
981 if( tbweight ) fprintf( fp, " weighted\n" );
985 fprintf( fp, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
988 fprintf( fp, "Algorithm A\n" );
989 else if( alg == 'A' )
990 fprintf( fp, "Algorithm A+\n" );
991 else if( alg == 'S' )
992 fprintf( fp, "Apgorithm S\n" );
993 else if( alg == 'C' )
994 fprintf( fp, "Apgorithm A+/C\n" );
996 fprintf( fp, "Unknown algorithm\n" );
998 if( treemethod == 'x' )
999 fprintf( fp, "Tree = UPGMA (3).\n" );
1000 else if( treemethod == 's' )
1001 fprintf( fp, "Tree = UPGMA (2).\n" );
1002 else if( treemethod == 'p' )
1003 fprintf( fp, "Tree = UPGMA (1).\n" );
1005 fprintf( fp, "Unknown tree.\n" );
1009 fprintf( fp, "FFT on\n" );
1011 fprintf( fp, "Basis : 4 nucleotides\n" );
1015 fprintf( fp, "Basis : Polarity and Volume\n" );
1017 fprintf( fp, "Basis : 20 amino acids\n" );
1019 fprintf( fp, "Threshold of anchors = %d%%\n", fftThreshold );
1020 fprintf( fp, "window size of anchors = %dsites\n", fftWinSize );
1023 fprintf( fp, "FFT off\n" );
1028 static int splitseq_mq( Scores *scores, int nin, int *nlen, char **seq, char **name, char *inputfile, int uniform, char **tree, int *alloclen, int *order, int *whichgroup, double *weight, int *depthpt )
1035 static int groupid = 0;
1036 static int branchid = 0;
1037 static int uniformid = 0;
1039 int selfscore0, selfscore1;
1043 static short *table1;
1044 Scores **outs, *ptr;
1054 static int *orderpos = NULL;
1074 double *minscoreinpick;
1080 static char **mseq1 = NULL;
1081 static char **mseq2 = NULL;
1082 double *blastresults;
1083 static int palloclen = 0;
1086 if( orderpos == NULL )
1088 if( palloclen == 0 )
1089 palloclen = *alloclen * 2;
1090 if( mseq1 == NULL && doalign && doalign != 'b' )
1092 mseq1 = AllocateCharMtx( 1, palloclen );
1093 mseq2 = AllocateCharMtx( 1, palloclen );
1103 *tree = (char *)calloc( 1, sizeof( char ) );
1113 fprintf( stderr, "checking before swap!!\n" );
1114 for( i=0; i<nin; i++ )
1116 fprintf( stderr, "scores[%d].seq (%d) = \n%s\n", i, scores[i].numinseq, scores[i].seq );
1117 if( strlen( seq[scores[i].numinseq] ) == 0 )
1120 fprintf( stderr, "OKASHII before swap!!\n" );
1127 for( i=0; i<njob; i++ )
1129 // fprintf( stderr, "seq[%d] = \n%s\n", i, seq[i] );
1130 if( strlen( seq[i] ) == 0 )
1132 fprintf( stderr, "OKASHII in seq!!\n" );
1141 selfscore0 = scores->selfscore;
1145 // fprintf( stderr, "ptr-scores=%d, selfscore = %d, score = %f\n", ptr-scores, ptr->selfscore, ptr->score );
1146 if( ptr->selfscore > selfscore0 )
1148 selfscore0 = ptr->selfscore;
1149 belongto = ptr-scores;
1156 // fprintf( stderr, "swap %d %s\n<->\n%d %s\n", 0, scores->name, belongto, (scores+belongto)->name );
1157 ptr = calloc( 1, sizeof( Scores ) );
1158 *ptr = scores[belongto];
1159 scores[belongto] = *scores;
1168 if( doalign == 'f' )
1170 blastresults = callfasta( seq, scores, nin, 0, 1 );
1171 if( scores->selfscore != blastresults[0] )
1173 fprintf( stderr, "WARNING: selfscore\n" );
1174 fprintf( stderr, "scores->numinseq = %d\n", scores->numinseq+1 );
1175 fprintf( stderr, "scores->orilen = %d\n", scores->orilen );
1176 fprintf( stderr, "scores->selfscore = %d, but blastresults[0] = %f\n", scores->selfscore, blastresults[0] );
1182 gappick0( mseq1[0], seq[scores->numinseq] );
1186 table1 = (short *)calloc( tsize, sizeof( short ) );
1187 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
1188 makecompositiontable_p( table1, scores[0].pointt );
1191 selfscore0 = scores[0].selfscore;
1192 for( i=0; i<nin; i++ )
1194 if( scores->orilen > scores[i].orilen )
1196 longer = (double)scores->orilen;
1197 shorter = (double)scores[i].orilen;
1201 longer = (double)scores[i].orilen; // nai
1202 shorter = (double)scores->orilen; //nai
1206 lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
1207 // lenfac = 1.0 / ( (double)LENFACA + (double)LENFACB / ( (double)longer + (double)LENFACC ) + (double)shorter / (double)longer * LENFACD );
1208 // fprintf( stderr, "lenfac = %f l=%d,%d\n", lenfac,scores->orilen, scores[i].orilen );
1215 if( doalign == 'b' )
1217 scores[i].score = ( 1.0 - blastresults[i] / MIN( scores->selfscore, scores[i].selfscore ) ) * 1;
1221 fprintf( stderr, "\r%d / %d ", i, nin );
1222 gappick0( mseq2[0], seq[scores[i].numinseq] );
1223 scores[i].score = ( 1.0 - (double)L__align11_noalign( mseq1, mseq2, palloclen, &off1, &off2 ) / MIN( selfscore0, scores[i].selfscore ) ) * 1;
1224 // fprintf( stderr, "scores[i] = %f\n", scores[i].score );
1225 // fprintf( stderr, "m1=%s\n", seq[scores[0].numinseq] );
1226 // fprintf( stderr, "m2=%s\n", seq[scores[i].numinseq] );
1230 scores[i].score = ( 1.0 - (double)commonsextet_p( table1, scores[i].pointt ) / MIN( selfscore0, scores[i].selfscore ) ) * lenfac;
1231 // if( i ) fprintf( stderr, "%d-%d d %4.2f len %d %d\n", 1, i+1, scores[i].score, scores->orilen, scores[i].orilen );
1233 if( doalign == 'b' ) free( blastresults );
1234 if( doalign == 0 ) free( table1 );
1237 // fprintf( stderr, "sorting .. " );
1238 qsort( scores, nin, sizeof( Scores ), (int (*)())dcompare );
1239 // fprintf( stderr, "done.\n" );
1241 maxdist = scores[nin-1].score;
1242 // fprintf( stderr, "maxdist? = %f, nin=%d\n", scores[nin-1].score, nin );
1244 if( nin < 2 || uniform == -1 ) // kokodato chotto muda
1246 fprintf( stderr, "\nLeaf %d / %d ", ++branchid, njob );
1248 outputfile = AllocateCharVec( strlen( inputfile ) + 100 );
1249 sprintf( outputfile, "%s-%d", inputfile, branchid );
1251 sprintf( outputfile + strlen(outputfile), "u%d", uniform );
1252 fprintf( stderr, "GROUP %d: %d member(s) (%d) %s\n", branchid, nin, scores[0].numinseq, outputfile );
1253 outfp = fopen( outputfile, "w" );
1257 fprintf( stderr, "Cannot open %s\n", outputfile );
1260 for( j=0; j<nin; j++ )
1261 fprintf( outfp, ">G%d %s\n%s\n", branchid, scores[j].name+1, seq[scores[j].numinseq] );
1270 tmptree = calloc( 100, sizeof( char ) );
1271 for( j=0; j<nin; j++ )
1273 treelen += sprintf( tmptree, "%d", scores[j].numinseq+1 );
1277 *tree = (char *)calloc( treelen + nin + 5, sizeof( char ) );
1278 if( nin > 1 ) **tree = '(';
1281 for( j=0; j<nin-1; j++ )
1283 sprintf( *tree+strlen( *tree ), "%d,", scores[j].numinseq+1 );
1285 sprintf( *tree+strlen( *tree ), "%d", scores[j].numinseq+1 );
1286 if( nin > 1 ) strcat( *tree, ")" );
1287 // fprintf( stdout, "*tree = %s\n", *tree );
1291 for( j=0; j<nin; j++ )
1293 *orderpos++ = scores[j].numinseq;
1294 // fprintf( stderr, "*order = %d\n", scores[j].numinseq );
1299 picks = AllocateIntVec( nin+1 );
1300 s_p_map = AllocateIntVec( nin+1 );
1301 s_y_map = AllocateIntVec( nin+1 );
1302 pickkouho = AllocateIntVec( nin+1 );
1304 // nkouho = getkouho( pickkouho, (picksize+100)/nin, nin, scores, seq );
1305 // nkouho = getkouho( pickkouho, 1.0, nin, scores, seq ); // zenbu
1306 // fprintf( stderr, "selecting kouhos phase 2\n" );
1307 // if( nkouho == 0 )
1309 // fprintf( stderr, "selecting kouhos, phase 2\n" );
1310 // nkouho = getkouho( pickkouho, 1.0, nin, scores, seq );
1312 // fprintf( stderr, "\ndone\n\n" );
1313 for( i=0; i<nin; i++ ) pickkouho[i] = i+1; nkouho = nin-1; // zenbu
1322 // fprintf( stderr, "pickkouho[0] = %d\n", pickkouho[0] );
1323 // fprintf( stderr, "pickkouho[nin-1] = %d\n", pickkouho[nin-1] );
1324 // fprintf( stderr, "\nMOST DISTANT kouho=%d, nin=%d, nkouho=%d\n", pickkouho[nkouho], nin, nkouho );
1325 picktmp = pickkouho[nkouho-1];
1327 if( ( scores[picktmp].shimon == scores[0].shimon ) && ( !strcmp( seq[scores[0].numinseq], seq[scores[picktmp].numinseq] ) ) )
1329 // fprintf( stderr, "known, j=%d (%d inori)\n", 0, scores[picks[0]].numinseq );
1330 // fprintf( stderr, "%s\n%s\n", seq[scores[picktmp].numinseq], seq[scores[picks[0]].numinseq] );
1337 // fprintf( stderr, "ok, %dth pick = %d (%d inori)\n", npick, picktmp, scores[picktmp].numinseq );
1341 while( npick<picksize && nkouho>0 )
1347 // fprintf( stderr, "rn = %d\n", rn );
1351 rn = rnd() * (nkouho);
1353 picktmp = pickkouho[rn];
1354 // fprintf( stderr, "rn=%d/%d (%d inori), kouho=%d, nin=%d, nkouho=%d\n", rn, nkouho, scores[pickkouho[rn]].numinseq, pickkouho[rn], nin, nkouho );
1356 // fprintf( stderr, "#kouho before swap\n" );
1357 // for( i=0; i<nkouho; i++ ) fprintf( stderr, "%d ", pickkouho[i] ); fprintf( stderr, "\n" );
1360 pickkouho[rn] = pickkouho[nkouho];
1362 // fprintf( stderr, "#kouho after swap\n" );
1363 // for( i=0; i<nkouho; i++ ) fprintf( stderr, "%d ", pickkouho[i] ); fprintf( stderr, "\n" );
1364 for( j=0; j<npick; j++ )
1366 if( scores[picktmp].shimon == scores[picks[j]].shimon && !strcmp( seq[scores[picks[j]].numinseq], seq[scores[picktmp].numinseq] ) )
1372 // fprintf( stderr, "ok, %dth pick = %d (%d inori)\n", npick, picktmp, scores[picktmp].numinseq );
1378 // fprintf( stderr, "known, j=%d (%d inori)\n", j, scores[picks[j]].numinseq );
1382 for( i=0; i<nin; i++ )
1384 fprintf( stderr, "i=%d/%d, scores[%d].score = %f, inori=%d\n", i, nin, i, scores[i].score, scores[i].numinseq );
1386 fprintf( stderr, "range:nin=%d scores[%d].score <= %f\n", nin, npick, scores[nin-1].score);
1387 for( i=0; i<npick; i++ )
1389 fprintf( stderr, "i=%d/%d, scores[%d].score = %f, inori=%d\n", i, npick, picks[i], scores[picks[i]].score, scores[picks[i]].numinseq );
1394 // fprintf( stderr, "\nnkouho=%d, defaultq2 = %d (%d inori)\n", nkouho, picks[npick-1], scores[picks[npick-1]].numinseq );
1396 qsort( picks, npick, sizeof( int ), (int (*)())intcompare );
1398 // fprintf( stderr, "allocating..\n" );
1400 // fprintf( stderr, "allocating outs, npick = %d\n", npick );
1401 numin = calloc( npick, sizeof( int ) );
1402 tsukau = calloc( npick, sizeof( int ) );
1403 outs = calloc( npick, sizeof( Scores * ) );
1404 for( i=0; i<npick; i++ ) outs[i] = NULL;
1405 topol = AllocateIntCub( npick, 2, 0 );
1406 treeorder = AllocateIntVec( npick + 1 );
1407 len = AllocateFloatMtx( npick, 2 );
1408 pickmtx = AllocateFloatHalfMtx( npick );
1409 yukomtx = AllocateFloatHalfMtx( npick );
1410 minscoreinpick = AllocateDoubleVec( npick );
1411 yukos = AllocateIntVec( npick );
1412 p_o_map = AllocateIntVec( npick+1 );
1413 y_o_map = AllocateIntVec( npick+1 );
1415 for( i=0; i<nin; i++ ) s_p_map[i] = -1;
1416 // fprintf( stderr, "npick = %d\n", npick );
1417 // fprintf( stderr, "picks =" );
1418 for( i=0; i<npick; i++ )
1420 s_p_map[picks[i]] = i;
1421 p_o_map[i] = scores[picks[i]].numinseq;
1422 // fprintf( stderr, " %d\n", picks[i] );
1424 // fprintf( stderr, "\n" );
1427 fprintf( stderr, "p_o_map =" );
1428 for( i=0; i<npick; i++ )
1430 fprintf( stderr, " %d", p_o_map[i] );
1432 fprintf( stderr, "\n" );
1435 for( j=0; j<nin; j++ )
1437 if( s_p_map[j] != -1 )
1439 pickmtx[0][s_p_map[j]] = (float)scores[j].score;
1440 // fprintf( stderr, "pickmtx[0][%d] = %f\n", s_p_map[j], pickmtx[0][s_p_map[j]] );
1444 for( j=1; j<npick; j++ )
1448 if( doalign == 'f' )
1450 blastresults = callfasta( seq, scores, nin, picks[j], 1 );
1451 if( scores[picks[j]].selfscore != blastresults[picks[j]] )
1453 fprintf( stderr, "WARNING: selfscore\n" );
1454 fprintf( stderr, "scores->numinseq = %d\n", scores->numinseq+1 );
1455 fprintf( stderr, "scores->orilen = %d\n", scores->orilen );
1456 fprintf( stderr, "scores->selfscore = %d, but blastresults[0] = %f\n", scores->selfscore, blastresults[0] );
1462 gappick0( mseq1[0], seq[scores[picks[j]].numinseq] );
1466 table1 = (short *)calloc( tsize, sizeof( short ) );
1467 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
1468 makecompositiontable_p( table1, scores[picks[j]].pointt );
1471 selfscore0 = scores[picks[j]].selfscore;
1472 pickmtx[j][0] = 0.0;
1473 for( i=j+1; i<npick; i++ )
1475 if( scores[picks[j]].orilen > scores[picks[i]].orilen )
1477 longer = (double)scores[picks[j]].orilen;
1478 shorter = (double)scores[picks[i]].orilen;
1482 longer = (double)scores[picks[i]].orilen;
1483 shorter = (double)scores[picks[j]].orilen;
1487 lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
1488 // lenfac = 1.0 / ( (double)LENFACA + (double)LENFACB / ( (double)longer + (double)LENFACC ) + (double)shorter / (double)longer * LENFACD );
1489 // fprintf( stderr, "lenfac = %f l=%d,%d\n", lenfac,scores->orilen, scores[i].orilen );
1496 if( doalign == 'b' )
1498 pickmtx[j][i-j] = ( 1.0 - blastresults[picks[i]] / MIN( scores->selfscore, scores[picks[i]].selfscore ) ) * 1;
1502 fprintf( stderr, "\r%d / %d ", i, nin );
1503 gappick0( mseq2[0], seq[scores[picks[i]].numinseq] );
1504 pickmtx[j][i-j] = ( 1.0 - (double)L__align11_noalign( mseq1, mseq2, palloclen, &off1, &off2 ) / MIN( selfscore0, scores[picks[i]].selfscore ) ) * 1;
1505 // fprintf( stderr, "scores[picks[i]] = %f\n", scores[picks[i]].score );
1509 pickmtx[j][i-j] = ( 1.0 - (double)commonsextet_p( table1, scores[picks[i]].pointt ) / MIN( selfscore0, scores[picks[i]].selfscore ) ) * lenfac;
1512 if( doalign == 'b' ) free( blastresults );
1513 if( doalign == 0 ) free( table1 );
1517 fprintf( stderr, "pickmtx = \n" );
1518 for( i=0; i<npick; i++ )
1520 for( j=i; j<npick; j++ )
1522 fprintf( stderr, "pickmtx[%d][%d] = %f\n", p_o_map[i]+1, p_o_map[j]+1, pickmtx[i][j-i] );
1528 // for( i=0; i<npick-1; i++ ) for( j=i; j<npick; j++ )
1529 // fprintf( stderr, "dist[%d][%d] = %f\n", p_o_map[i]+1, p_o_map[j]+1, pickmtx[i][j-i] );
1531 for( i=0; i<npick; i++ ) tsukau[i] = 1;
1535 // fprintf( stderr, "check kaishi =>, npick=%d members = \n", npick );
1536 // for( i=0; i<npick; i++ ) fprintf( stderr, "%d (%d)", p_o_map[i]+1, picks[i] );
1537 // fprintf( stderr, "\n" );
1538 for( i=0; i<npick-1; i++ )
1540 if( tsukau[i] == 0 ) continue;
1541 for( j=i+1; j<npick; j++ )
1543 // float kijun = maxdist * 1/(npick-2);
1544 float kijun = maxdist * TOKYORIPARA;
1545 // if( i==0 && j == 1 ) continue; // machigai
1546 // if( maxdist == pickmtx[i][j-i] ) continue;
1547 if( tsukau[j] == 0 ) continue;
1548 // fprintf( stderr, "checking %d-%d (%d-%d) %f, kijun=%f\n", p_o_map[i]+1, p_o_map[j]+1, i, j, pickmtx[i][j-i], kijun );
1549 if( pickmtx[i][j-i] < kijun )
1551 // fprintf( stderr, "dame!! %d => %d, because %f < %f\n", p_o_map[j]+1, p_o_map[i]+1, pickmtx[i][j-i], kijun );
1553 if( scores[picks[i]].orilen > scores[picks[j]].orilen )
1555 fprintf( stderr, "%d => %d\n", p_o_map[j]+1, p_o_map[i]+1 );
1560 fprintf( stderr, "%d => %d\n", p_o_map[i]+1, p_o_map[j]+1 );
1563 if( 0 && j == npick-1 ) tsukau[i] = 0;
1565 fprintf( stderr, "tsukau[%d] = %d (%d inori)\n", j, tsukau[j], p_o_map[j]+1 );
1575 for( ii=0,i=0; i<npick; i++ )
1579 for( jj=ii,j=i; j<npick; j++ )
1583 yukomtx[ii][jj-ii] = pickmtx[i][j-i];
1590 for( ii=0,i=0; i<npick; i++ )
1593 yukos[ii++] = picks[i];
1599 for( i=0; i<npick; i++ ) for( j=i; j<npick; j++ )
1601 if( tsukau[i] == 1 && tsukau[j] == 1 )
1602 fprintf( stderr, "dist[%d][%d] = %f (ok)\n", p_o_map[i]+1, p_o_map[j]+1, pickmtx[i][j-i] );
1603 else if( tsukau[i] == 0 && tsukau[j] == 0 )
1604 fprintf( stderr, "dist[%d][%d] = %f (xx)\n", p_o_map[i]+1, p_o_map[j]+1, pickmtx[i][j-i] );
1606 fprintf( stderr, "%d-%d, okashii\n", p_o_map[i]+1, p_o_map[j]+1 );
1611 for( i=0; i<nin; i++ ) s_y_map[i] = -1;
1612 // fprintf( stderr, "npick = %d\n", npick );
1613 // fprintf( stderr, "picks =" );
1614 for( i=0; i<nyuko; i++ )
1616 s_y_map[yukos[i]] = i;
1617 y_o_map[i] = scores[yukos[i]].numinseq;
1618 // fprintf( stderr, " %d\n", yukos[i] );
1620 // fprintf( stderr, "\n" );
1622 for( i=0; i<nyuko; i++ ) for( j=i; j<nyuko; j++ )
1624 fprintf( stderr, "yukodist[%d][%d] = %f (ok)\n", y_o_map[i]+1, y_o_map[j]+1, yukomtx[i][j-i] );
1633 children = calloc( nyuko+1, sizeof( char * ) );
1634 for( i=0; i<nyuko+1; i++ ) children[i] = NULL;
1637 // fprintf( stderr, "done..\n" );
1639 // fprintf( stderr, "classifying, pick=%d \n", nyuko );
1644 fprintf( stderr, "okashii, nyuko = 1, shikashi npick = %d\n", npick );
1647 // fprintf( stderr, "### itchi suru hazu, nazenara scores[nin-1].score=%f, selfscores=%d,%d\n", scores[nin-1].score, scores[nin-1].selfscore, scores->selfscore );
1648 // fprintf( stderr, "seq[%d] = scores->seq = \n%s\n", scores->numinseq, seq[scores->numinseq] );
1651 for( j=0; j<nin; j++ )
1654 outs[belongto] = realloc( outs[belongto], sizeof( Scores ) * ( numin[belongto] + 1 ) );
1655 outs[belongto][numin[belongto]] = scores[j];
1661 dfromc = AllocateDoubleMtx( nyuko, nin );
1663 for( i=0; i<nyuko; i++ ) for( j=0; j<nin; j++ )
1664 dfromc[i][j] = -0.5;
1665 for( j=0; j<nin; j++ )
1667 dfromc[0][j] = ( scores[j].score );
1668 // fprintf( stderr, "j=%d, s_p_map[j]=%d\n", j, s_p_map[j] );
1671 fprintf( stderr, "\n\n%dx%d distance matrix\n", nyuko, nin );
1674 fprintf( stderr, "yukos = \n" );
1675 for( i=0; i<nyuko; i++ ) fprintf( stderr, "%d ", y_o_map[i] + 1 );
1676 fprintf( stderr, "\n" );
1679 for( i=1; i<nyuko; i++ )
1681 fprintf( stderr, "%d / %d \r", i, nyuko );
1685 if( doalign == 'f' )
1686 blastresults = callfasta( seq, scores, nin, yukos[i], 0 );
1688 gappick0( mseq1[0], seq[scores[yukos[i]].numinseq] );
1692 table1 = (short *)calloc( tsize, sizeof( short ) );
1693 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
1694 makecompositiontable_p( table1, scores[yukos[i]].pointt );
1697 selfscore0 = scores[yukos[i]].selfscore;
1698 for( j=0; j<nin; j++ )
1700 if( scores[yukos[i]].orilen > scores[j].orilen )
1702 longer = scores[yukos[i]].orilen;
1703 shorter = scores[j].orilen;
1707 shorter = scores[yukos[i]].orilen;
1708 longer = scores[j].orilen;
1712 // lenfac = 1.0 / ( (double)LENFACA + (double)LENFACB / ( (double)longer + (double)LENFACC ) + (double)shorter / (double)longer * LENFACD );
1713 lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
1714 // lenfac = 1.0 / ( shorter / longer * LENFACD + LENFACB / ( longer + LENFACC ) + LENFACA );
1715 // fprintf( stderr, "lenfac = %f, l=%d, %d\n", lenfac, scores[yukos[i]].orilen, scores[j].orilen );
1720 ii = s_y_map[j]; jj=s_y_map[yukos[i]];
1721 if( ii != -1 && jj != -1 )
1723 if( dfromc[ii][yukos[jj]] != -0.5 )
1725 dfromc[i][j] = dfromc[ii][yukos[jj]];
1735 dfromc[ii][yukos[jj]] =
1736 dfromc[i][j] = yukomtx[ii][jj-ii];
1744 if( doalign == 'b' )
1747 ( 1.0 - blastresults[j] / MIN( scores[yukos[i]].selfscore, scores[j].selfscore ) ) * 1;
1751 gappick0( mseq2[0], seq[scores[j].numinseq] );
1752 dfromc[i][j] = ( 1.0 - (double)L__align11_noalign( mseq1, mseq2, palloclen, &off1, &off2 ) / MIN( selfscore0, scores[j].selfscore ) ) * 1;
1756 dfromc[i][j] = ( 1.0 - (double)commonsextet_p( table1, scores[j].pointt ) / MIN( selfscore0, scores[j].selfscore ) ) * lenfac;
1758 // fprintf( stderr, "i,j=%d,%d (%d,%d)/ %d,%d, dfromc[][]=%f \n", i, j, scores[yukos[i]].numinseq, scores[j].numinseq, nyuko, nin, dfromc[i][j] );
1761 // fprintf( stdout, "&&& dfromc[%d][%d] (%d,%d) = %f\n", i, j, p_o_map[i], scores[j].numinseq, dfromc[i][j] );
1763 // fprintf( stderr, "i=%d, freeing\n", i );
1764 if( !doalign ) free( table1 );
1765 if( doalign && doalign == 'b' ) free( blastresults );
1767 fprintf( stderr, " \r" );
1771 for( i=0; i<nyuko; i++ )
1773 minscoreinpick[i] = 99.99;
1774 for( j=1; j<npick-i; j++ )
1776 // fprintf( stderr, "minscoreinpick[%d] ?o %f\n", p_o_map[i]+1, pickmtx[i][j] );
1777 if( minscoreinpick[i] > pickmtx[i][j] )
1778 minscoreinpick[i] = pickmtx[i][j];
1780 for( j=0; j<i; j++ )
1782 // fprintf( stderr, "minscoreinpick[%d] ?x %f\n", p_o_map[i]+1, pickmtx[j][i-j] );
1783 if( minscoreinpick[i] > pickmtx[j][i-j] )
1784 minscoreinpick[i] = pickmtx[j][i-j];
1786 minscoreinpick[i] *= 1.0;
1787 fprintf( stderr, "minscoreinpick[%d] = %f\n", p_o_map[i]+1, minscoreinpick[i] );
1793 for( i=0; i<nyuko; i++ ) numin[i] = 0;
1794 // fprintf( stderr, "### itchi shinai hazu, nazenara scores[nin-1].score=%f, selfscores=%d,%d, len=%d,%d, nin=%d\n", scores[nin-1].score, scores[nin-1].selfscore, scores->selfscore, scores->orilen, scores[nin-1].orilen, nin );
1795 for( j=0; j<nin; j++ )
1797 belongto = s_y_map[j];
1798 if( belongto == -1 )
1800 belongto = 0; // default ha horyu
1801 minscore = dfromc[0][j];
1802 for( i=0; i<nyuko; i++ )
1804 // fprintf( stderr, "checking %d/%d,%d/%d (%d-%d inori) minscore=%f, dfromc[0][j]=%f, dfromc[i][j]=%f\n", i, nyuko, j, nin, y_o_map[i], scores[j].numinseq, minscore, dfromc[0][j], dfromc[i][j] );
1805 if( scores[j].shimon == scores[yukos[i]].shimon && !strcmp( seq[scores[j].numinseq], seq[y_o_map[i]] ) )
1807 // fprintf( stderr, "yuko-%d (%d in ori) to score-%d (%d inori) ha kanzen itch\n", i, y_o_map[i], j, scores[j].numinseq );
1811 if( dfromc[i][j] < minscore )
1812 // if( rnd() < 0.5 ) // CHUUI !!!!!
1814 // fprintf( stderr, "yuko-%d (%d in ori) to score-%d (%d inori) ha tikai, %f>%f\n", i, y_o_map[i]+1, j, scores[j].numinseq+1, minscore, dfromc[i][j] );
1815 minscore = dfromc[i][j];
1821 if( dfromc[belongto][j] > minscoreinpick[belongto] )
1823 fprintf( stderr, "dame, %f > %f\n", dfromc[belongto][j], minscoreinpick[belongto] );
1827 fprintf( stderr, "ok, %f < %f\n", dfromc[belongto][j], minscoreinpick[belongto] );
1829 // fprintf( stderr, "j=%d (%d inori) -> %d (%d inori)\n", j, scores[j].numinseq+1, belongto, y_o_map[belongto]+1 );
1830 // fprintf( stderr, "numin = %d\n", numin[belongto] );
1831 outs[belongto] = realloc( outs[belongto], sizeof( Scores ) * ( numin[belongto] + 1 ) );
1832 outs[belongto][numin[belongto]] = scores[j];
1836 FreeDoubleMtx( dfromc );
1838 // fprintf( stderr, "##### npick = %d\n", npick );
1839 // fprintf( stderr, "##### nyuko = %d\n", nyuko );
1844 fprintf( stderr, "upgma " );
1845 veryfastsupg_float_realloc_nobk_halfmtx( nyuko, yukomtx, topol, len );
1846 fprintf( stderr, "\r \r" );
1850 topol[0][0] = (int *)realloc( topol[0][0], 2 * sizeof( int ) );
1851 topol[0][1] = (int *)realloc( topol[0][1], 2 * sizeof( int ) );
1853 topol[0][0][1] = -1;
1855 topol[0][1][1] = -1;
1860 fprintf( stderr, "nyuko = %d, topol[][] = \n", nyuko );
1861 for( j=0; j<nyuko-1; j++ )
1863 fprintf( stderr, "STEP%d \n", j );
1866 fprintf( stderr, "%d ", ( topol[j][0][i] )+0 );
1867 if( topol[j][0][i] == -1 ) break;
1869 fprintf( stderr, "\n" );
1872 fprintf( stderr, "%d ", ( topol[j][1][i] )+0 );
1873 if( topol[j][1][i] == -1 ) break;
1875 fprintf( stderr, "\n" );
1876 fprintf( stderr, "\n" );
1881 iptr = topol[nyuko-2][0]; while( *iptr != -1 ) *jptr++ = *iptr++;
1882 iptr = topol[nyuko-2][1]; while( *iptr != -1 ) *jptr++ = *iptr++;
1884 for( j=0; j<nyuko; j++ )
1886 // fprintf( stderr, "treeorder[%d] = %d\n", j, treeorder[j] );
1887 if( treeorder[j] == -1 ) break;
1892 for( i=0; i<nyuko; i++ )
1898 fprintf( stderr, "\ncalling a child, pick%d (%d inori): # of mem=%d\n", i, p_o_map[ii]+1, numin[ii] );
1899 for( j=0; j<numin[ii]; j++ )
1901 fprintf( stderr, "%d ", outs[ii][j].numinseq+1 );
1903 fprintf( stderr, "\n" );
1906 aligned *= splitseq_mq( outs[ii], numin[ii], nlen, seq, name, inputfile, uniform, children+ii, alloclen, order, whichgroup, weight, depthpt );
1910 for( i=0; i<nyuko; i++ )
1914 fprintf( stderr, "i=%d/%d, ERROR!\n", i, nyuko );
1915 for( j=0; j<nyuko; j++ )
1916 fprintf( stderr, "numin[%d] = %d (rep=%d inori)\n", j, numin[j], y_o_map[j] );
1925 for( i=0; i<nyuko; i++ )
1926 treelen += strlen( children[i] );
1927 *tree = calloc( treelen + nin * 3, sizeof ( char ) );
1932 if( nin >= classsize || !aligned )
1940 int mem1size, mem2size;
1945 static int *mem1 = NULL;
1946 static int *mem2 = NULL;
1952 parttree = (char **)calloc( nyuko, sizeof( char * ) );
1953 for( i=0; i<nyuko; i++ )
1955 // fprintf( stderr, "allocating parttree, size = %d\n", treelen + nin * 5 );
1956 parttree[i] = calloc( treelen + nin * 5, sizeof ( char ) );
1957 strcpy( parttree[i], children[i] );
1958 free( children[i] );
1965 mem1 = AllocateIntVec( njob+1 );
1966 mem2 = AllocateIntVec( njob+1 );
1969 // veryfastsupg_float_realloc_nobk_halfmtx( nyuko, yukomtx, topol, len );
1971 // counteff_simple_float( nyuko, topol, len, eff );
1975 for( l=0; l<nlim; l++ )
1977 mem1p = topol[l][0];
1980 while( *mem1p != -1 )
1982 // fprintf( stderr, "*mem1p = %d (%d inori), numin[]=%d\n", *mem1p, p_o_map[*mem1p], numin[*mem1p] );
1983 i = numin[*mem1p]; ptr = outs[*(mem1p++)];
1987 *mptr++ = (ptr++)->numinseq;
1992 mem2p = topol[l][1];
1995 while( *mem2p != -1 )
1997 // fprintf( stderr, "*mem2p = %d (%d inori), numin[]=%d\n", *mem2p, p_o_map[*mem2p], numin[*mem2p] );
1998 i = numin[*mem2p]; ptr = outs[*(mem2p++)];
2002 *mptr++ = (ptr++)->numinseq;
2007 qsort( mem1, mem1size, sizeof( int ), (int (*)())intcompare );
2008 qsort( mem2, mem2size, sizeof( int ), (int (*)())intcompare );
2009 // selhead( mem1, numin[0] );
2010 // selhead( mem2, numin[1] );
2014 fprintf( stderr, "\n" );
2015 fprintf( stderr, "mem1 (nin=%d) = \n", nin );
2018 fprintf( stderr, "%d ", mem1[i]+1 );
2019 if( mem1[i] == -1 ) break;
2021 fprintf( stderr, "\n" );
2022 fprintf( stderr, "mem2 (nin=%d) = \n", nin );
2025 fprintf( stderr, "%d ", mem2[i]+1 );
2026 if( mem2[i] == -1 ) break;
2028 fprintf( stderr, "\n" );
2032 fprintf( stderr, "before pairalign, l = %d, nyuko=%d, mem1size=%d, mem2size=%d\n", l, nyuko, mem1size, mem2size );
2033 fprintf( stderr, "before alignment\n" );
2034 for( j=0; j<mem1size; j++ )
2035 fprintf( stderr, "%s\n", seq[mem1[j]] );
2036 fprintf( stderr, "----\n" );
2037 for( j=0; j<mem2size; j++ )
2038 fprintf( stderr, "%s\n", seq[mem2[j]] );
2039 fprintf( stderr, "----\n\n" );
2041 fprintf( stderr, "\r Alignment %d-%d \r", mem1size, mem2size );
2046 pairalign( njob, nlen, seq, mem1, mem2, weight, alloclen );
2048 pairalign( njob, nlen, seq, mem2, mem1, weight, alloclen );
2054 v1 = topol[l][0][0];
2055 v2 = topol[l][1][0];
2057 // fprintf( stderr, "nyuko=%d, v1=%d, v2=%d\n", nyuko, v1, v2 );
2064 // fprintf( stderr, "nyuko=%d, v1=%d, v2=%d after sort\n", nyuko, v1, v2 );
2065 // fprintf( stderr, "nyuko=%d, v1=%d, v2=%d\n", nyuko, v1, v2 );
2066 // fprintf( stderr, "v1=%d, v2=%d, parttree[v1]=%s, parttree[v2]=%s\n", v1, v2, parttree[v1], parttree[v2] );
2067 sprintf( *tree, "(%s,%s)", parttree[v1], parttree[v2] );
2068 strcpy( parttree[v1], *tree );
2069 // fprintf( stderr, "parttree[%d] = %s\n", v1, parttree[v1] );
2070 // fprintf( stderr, "*tree = %s\n", *tree );
2071 free( parttree[v2] ); parttree[v2] = NULL;
2076 fprintf( stderr, "after alignment\n" );
2077 for( j=0; j<mem1size; j++ )
2078 fprintf( stderr, "%s\n", seq[mem1[j]] );
2079 fprintf( stderr, "----\n" );
2080 for( j=0; j<mem2size; j++ )
2081 fprintf( stderr, "%s\n", seq[mem2[j]] );
2082 fprintf( stderr, "----\n\n" );
2088 free( parttree[v1] ); parttree[v1] = NULL;
2089 // fprintf( stderr, "*tree = %s\n", *tree );
2090 // FreeCharMtx( parttree );
2091 free( parttree ); parttree = NULL;
2096 fprintf( stderr, "after alignment\n" );
2097 for( i=0; i<nyuko; i++ )
2099 for( j=0; j<numin[i]; j++ )
2100 fprintf( stderr, "%s\n", seq[outs[i][j].numinseq] );
2107 mptr = mem1; while( *mptr != -1 )
2110 fprintf( stdout, "==g1-%d \n", *mptr+1 );
2111 fprintf( stdout, "%s \n", seq[*mptr] );
2113 whichgroup[*mptr] = groupid;
2114 weight[*mptr++] *= 0.5;
2117 mptr = mem2; while( *mptr != -1 )
2120 fprintf( stdout, "=g2-%d ", *mptr+1 );
2121 fprintf( stdout, "%s \n", seq[*mptr] );
2123 whichgroup[*mptr] = groupid;
2124 weight[*mptr++] *= 0.5;
2129 mptr = mem1; while( *mptr != -1 )
2131 whichgroup[*mptr] = groupid;
2132 weight[*mptr++] *= (double)2.0/numin[0];
2137 if( *depthpt > maxdepth ) maxdepth = *depthpt;
2146 sprintf( *tree, "%s", children[0] );
2147 free( children[0] );
2152 for( i=0; i<npick; i++ ) free( (void *)outs[i] );
2153 FreeFloatHalfMtx( pickmtx, npick );
2154 FreeFloatHalfMtx( yukomtx, npick );
2155 FreeFloatMtx( len );
2156 FreeIntCub( topol );
2157 FreeIntVec( treeorder );
2168 free( minscoreinpick );
2173 static void alignparaphiles( int nseq, int *nlen, double *weight, char **seq, int nmem, int *members, int *alloclen )
2176 int *mem1 = AllocateIntVec( nmem );
2177 int *mem2 = AllocateIntVec( 2 );
2181 for( i=0; i<ilim; i++ )
2183 mem1[i] = members[i];
2185 mem2[0] = members[i+1];
2186 pairalign( nseq, nlen, seq, mem1, mem2, weight, alloclen );
2202 int main( int argc, char *argv[] )
2204 static char **name, **seq;
2206 static char *tmpseq;
2207 static int **pointt;
2217 static int *whichgroup;
2218 static double *weight;
2219 static char tmpname[B+100];
2230 double *blastresults;
2231 static char com[1000];
2234 static Scores *scores;
2235 static short *table1;
2240 arguments( argc, argv );
2244 infp = fopen( inputfile, "r" );
2247 fprintf( stderr, "Cannot open %s\n", inputfile );
2259 fprintf( stderr, "At least 2 sequences should be input!\n"
2260 "Only %d sequence found.\n", njob );
2264 if( picksize == NOTSPECIFIED )
2265 picksize = PICKSIZE;
2267 if( classsize == NOTSPECIFIED || classsize == 0 )
2269 classsize = njob + 1;
2273 // picksize = MIN( picksize, (int)sqrt( classsize ) + 1);
2276 alloclen = nlenmax * 2;
2277 name = AllocateCharMtx( njob, B+1 );
2278 seq = AllocateCharMtx( njob, alloclen+1 );
2279 nlen = AllocateIntVec( njob );
2280 tmpseq = calloc( nlenmax+1, sizeof( char ) );
2281 pointt = AllocateIntMtx( njob, nlenmax+1 );
2282 grpseq = AllocateIntVec( nlenmax + 1 );
2283 order = (int *)calloc( njob + 1, sizeof( int ) );
2284 whichgroup = (int *)calloc( njob, sizeof( int ) );
2285 weight = (double *)calloc( njob, sizeof( double ) );
2287 fprintf( stderr, "alloclen = %d in main\n", alloclen );
2289 for( i=0; i<njob; i++ ) whichgroup[i] = 0;
2290 for( i=0; i<njob; i++ ) weight[i] = 1.0;
2291 for( i=0; i<njob; i++ ) order[i] = -1;
2293 // readData( infp, name, nlen, seq );
2294 readData_pointer( infp, name, nlen, seq );
2298 constants( njob, seq );
2300 if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
2301 else tsize = (int)pow( 6, 6 );
2319 for( i=0; i<njob; i++ )
2321 gappick0( tmpseq, seq[i] );
2322 nlen[i] = strlen( tmpseq );
2323 strcpy( seq[i], tmpseq );
2327 fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] );
2328 fprintf( stderr, "name = %s\n", name[i] );
2329 fprintf( stderr, "seq = %s\n", seq[i] );
2332 if( nlen[i] > maxl ) maxl = nlen[i];
2333 if( dorp == 'd' ) /* nuc */
2335 seq_grp_nuc( grpseq, tmpseq );
2336 makepointtable_nuc( pointt[i], grpseq );
2340 seq_grp( grpseq, tmpseq );
2341 makepointtable( pointt[i], grpseq );
2346 fprintf( stderr, "params = %d, %d, %d\n", penalty, penalty_ex, offset );
2353 WriteOptions( trap_g );
2355 c = seqcheck( seq );
2358 fprintf( stderr, "Illeagal character %c\n", c );
2362 pid = (int)getpid();
2363 sprintf( datafile, "/tmp/data-%d\0", pid );
2364 sprintf( queryfile, "/tmp/query-%d\0", pid );
2365 sprintf( resultfile, "/tmp/fasta-%d\0", pid );
2367 scores = (Scores *)calloc( njob, sizeof( Scores ) );
2369 fprintf( stderr, "\nCalculating i-i scores ... \n" );
2370 for( i=0; i<njob; i++ )
2372 orilen = strlen( seq[i] );
2373 scores[i].numinseq = i; // irimasu
2374 scores[i].orilen = orilen;
2377 for( i=0; i<njob; i++ )
2379 scores[i].pointt = pointt[i];
2380 scores[i].shimon = (int)pointt[i][0] + (int)pointt[i][1] + (int)pointt[i][scores[i].orilen-6];
2381 scores[i].name = name[i];
2384 fprintf( stderr, "\r %05d/%05d ", i, njob );
2385 free( scores[i].pointt );
2386 if( doalign == 'f' )
2389 int ipos = (int)( i / KIZAMI ) * KIZAMI;
2390 int iposamari = i % KIZAMI;
2392 fprintf( stderr, "%d / %d\r", i, njob );
2393 // fprintf( stderr, "ipos = %d\n", ipos );
2394 // fprintf( stderr, "iposamari = %d\n", iposamari );
2396 // fprintf( stderr, " calling blast, i=%d\n", i );
2397 // blastresults = callfasta( seq, scores+i, 1, 0, 1 );
2398 blastresults = callfasta( seq, scores+ipos, MIN(KIZAMI,njob-ipos), iposamari, (iposamari==0) );
2399 // fprintf( stderr, "done., i=%d\n\n", i );
2400 scores[i].selfscore = (int)blastresults[iposamari];
2402 for( j=0; j<100; j++ )
2404 fprintf( stderr, "res[%d] = %f\n", j, blastresults[j] );
2407 // fprintf( stderr, "%d->selfscore = %d\n", i, scores[i].selfscore );
2408 free( blastresults );
2413 for( pt=seq[i]; *pt; pt++ )
2415 pscore += amino_dis[(int)*pt][(int)*pt];
2417 scores[i].selfscore = pscore;
2419 // fprintf( stderr, "selfscore[%d] = %d\n", i+1, scores[i].selfscore );
2423 table1 = (short *)calloc( tsize, sizeof( short ) );
2424 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
2425 makecompositiontable_p( table1, pointt[i] );
2426 scores[i].selfscore = commonsextet_p( table1, pointt[i] );
2435 tree = (char **)calloc( 1, sizeof( char *) );
2437 // splitseq_bin( scores, njob, nlen, seq, name, inputfile, 0, tree, &alloclen, order, whichgroup, weight );
2438 completed = splitseq_mq( scores, njob, nlen, seq, name, inputfile, 0, tree, &alloclen, order, whichgroup, weight, &depth );
2439 treefile = (char *)calloc( strlen( inputfile ) + 10, sizeof( char ) );
2441 sprintf( treefile, "%s.tree", inputfile );
2443 sprintf( treefile, "splittbfast.tree" );
2444 treefp = fopen( treefile, "w" );
2445 fprintf( treefp, "%s\n", *tree );
2449 completed = splitseq_mq( scores, njob, nlen, seq, name, inputfile, 0, tree, &alloclen, order, whichgroup, weight, &depth );
2451 completed = splitseq_mq( scores, njob, nlen, seq, name, inputfile, 0, tree, &alloclen, order, whichgroup, weight, &depth );
2454 fprintf( stderr, "\nDone.\n\n" );
2461 for( i=0; i<njob; i++ )
2464 if( whichgroup[pos] != groupid )
2467 groupid = whichgroup[pos];
2469 if( whichgroup[pos] )
2473 paramem[npara] = -1;
2474 if( npara > 1 && classsize > 2 )
2476 qsort( paramem, npara, sizeof( int ), (int (*)(const void *, const void*))intcompare );
2477 // selhead( paramem, npara );
2478 alignparaphiles( njob, nlen, weight, seq, npara, paramem, &alloclen );
2480 free( paramem ); paramem = NULL; npara = 0;
2482 sprintf( tmpname, "Group-%d %s", groupnum, name[pos]+1 );
2486 paramem = realloc( paramem, sizeof( int) * ( npara + 2 ) );
2487 paramem[npara++] = pos;
2488 sprintf( tmpname, "Group-para %s", name[pos]+1 );
2491 strcpy( name[pos]+1, tmpname );
2495 paramem[npara] = -1;
2496 if( npara > 1 && classsize > 2 )
2498 qsort( paramem, npara, sizeof( int ), (int (*)(const void *, const void*))intcompare );
2499 // selhead( paramem, npara );
2500 alignparaphiles( njob, nlen, weight, seq, npara, paramem, &alloclen );
2502 free( paramem ); paramem = NULL; npara = 0;
2505 for( i=0; i<njob; i++ )
2507 sprintf( tmpname, "Group-%d %s", whichgroup[i], name[i]+1 );
2508 strcpy( name[i]+1, tmpname );
2513 // maketanni( name, seq, njob, nlenmax, nlen );
2518 fprintf( stderr, "writing alignment to stdout\n" );
2521 writeData_reorder_pointer( stdout, njob, name, nlen, seq, order );
2523 writeData_pointer( stdout, njob, name, nlen, seq );
2525 fprintf( stderr, "OSHIMAI\n" );
2527 if( classsize == 1 )
2529 fprintf( stderr, "\n\n", njob );
2530 fprintf( stderr, "----------------------------------------------------------------------------\n", njob );
2531 fprintf( stderr, "\n", njob );
2532 fprintf( stderr, "nseq = %d\n", njob );
2533 fprintf( stderr, "groupsize = %d, picksize=%d\n", classsize, picksize );
2534 fprintf( stderr, "The input sequences have been sorted so that similar sequences are close.\n" );
2538 fprintf( stderr, "\n" );
2539 fprintf( stderr, "A guide tree is in the '%s' file.\n", treefile );
2543 // fprintf( stderr, "To output guide tree,\n" );
2544 // fprintf( stderr, "%% %s -t -i %s\n", progName( argv[0] ), "inputfile" );
2547 fprintf( stderr, "\n", njob );
2548 fprintf( stderr, "----------------------------------------------------------------------------\n", njob );
2550 else if( groupnum > 1 )
2552 fprintf( stderr, "\n\n" );
2553 fprintf( stderr, "----------------------------------------------------------------------------\n", njob );
2554 fprintf( stderr, "\n" );
2555 fprintf( stderr, "groupsize = %d, picksize=%d\n", classsize, picksize );
2556 fprintf( stderr, "The input sequences have been classified into %d groups + some paraphyletic groups\n", groupnum );
2557 fprintf( stderr, "Note that the alignment is not completed.\n" );
2561 fprintf( stderr, "\n" );
2562 fprintf( stderr, "A guide tree is in the '%s' file.\n", treefile );
2566 // fprintf( stderr, "To output guide tree,\n" );
2567 // fprintf( stderr, "%% %s -t -i %s\n", progName( argv[0] ), "inputfile" );
2570 fprintf( stderr, "\n" );
2571 fprintf( stderr, "----------------------------------------------------------------------------\n", njob );
2575 fprintf( stderr, "\n\n" );
2576 fprintf( stderr, "----------------------------------------------------------------------------\n", njob );
2577 fprintf( stderr, "\n", njob );
2578 fprintf( stderr, "nseq = %d\n", njob );
2579 fprintf( stderr, "groupsize = %d, partsize=%d\n", classsize, picksize );
2580 // fprintf( stderr, "A single alignment containing all the input sequences has been computed.\n" );
2581 // fprintf( stderr, "If the sequences are highly diverged and you feel there are too many gaps,\n" );
2582 // fprintf( stderr, "please try \n" );
2583 // fprintf( stderr, "%% mafft --parttree --groupsize 100 inputfile\n" );
2584 // fprintf( stderr, "which classifies the sequences into several groups with <~ 100 sequences\n" );
2585 // fprintf( stderr, "and performs only intra-group alignments.\n" );
2589 fprintf( stderr, "\n" );
2590 fprintf( stderr, "A guide tree is in the '%s' file.\n", treefile );
2594 // fprintf( stderr, "To output guide tree,\n" );
2595 // fprintf( stderr, "%% %s -t -i %s\n", progName( argv[0] ), "inputfile" );
2598 fprintf( stderr, "\n", njob );
2599 fprintf( stderr, "----------------------------------------------------------------------------\n", njob );
2602 if( treeout ) free( treefile );
2606 fprintf( stdout, "weight =\n" );
2607 for( i=0; i<njob; i++ )
2608 fprintf( stdout, "%d: %f\n", i+1, weight[i] );
2611 if( doalign == 'b' )
2613 strcpy( com, "rm -f" );
2615 strcat( com, datafile );
2616 strcat( com, "* " );
2617 strcat( com, queryfile );
2619 strcat( com, resultfile );
2620 fprintf( stderr, "%s\n", com );