Next version of JABA
[jabaws.git] / binaries / src / mafft / core / setcore.c
1 #include "mltaln.h"
2
3 #define DEBUG 0
4 #define IODEBUG 0
5 #define SCOREOUT 1
6
7 double corethr;
8 int coreext;
9
10 void arguments( int argc, char *argv[] )
11 {
12     int c;
13
14         fftkeika = 1;
15         constraint = 0;
16         nblosum = 62;
17         fmodel = 0;
18         calledByXced = 0;
19         devide = 0;
20         use_fft = 0;
21         fftscore = 1;
22         fftRepeatStop = 0;
23         fftNoAnchStop = 0;
24     weight = 3;
25     utree = 1;
26         tbutree = 1;
27     refine = 0;
28     check = 1;
29     cut = 0.0;
30     disp = 0;
31     outgap = 1;
32     alg = 'A';
33     mix = 0;
34         tbitr = 0;
35         scmtd = 5;
36         tbweight = 0;
37         tbrweight = 3;
38         checkC = 0;
39         treemethod = 'x';
40         contin = 0;
41         scoremtx = 0;
42         kobetsubunkatsu = 0;
43         dorp = NOTSPECIFIED;
44         ppenalty = NOTSPECIFIED;
45         ppenalty_ex = NOTSPECIFIED;
46         poffset = NOTSPECIFIED;
47         kimuraR = NOTSPECIFIED;
48         pamN = NOTSPECIFIED;
49         geta2 = GETA2;
50         fftWinSize = NOTSPECIFIED;
51         fftThreshold = NOTSPECIFIED;
52         corethr = .5;
53         coreext = 0;
54
55     while( --argc > 0 && (*++argv)[0] == '-' )
56         {
57         while ( ( c = *++argv[0] ) )
58                 {
59             switch( c )
60             {
61                                 case 'f':
62                                         ppenalty = (int)( atof( *++argv ) * 1000 - 0.5 );
63                                         fprintf( stderr, "ppenalty = %d\n", ppenalty );
64                                         --argc;
65                                         goto nextoption;
66                                 case 'g':
67                                         ppenalty_ex = (int)( atof( *++argv ) * 1000 - 0.5 );
68                                         fprintf( stderr, "ppenalty_ex = %d\n", ppenalty_ex );
69                                         --argc;
70                                         goto nextoption;
71                                 case 'h':
72                                         poffset = (int)( atof( *++argv ) * 1000 - 0.5 );
73                                         fprintf( stderr, "poffset = %d\n", poffset );
74                                         --argc;
75                                         goto nextoption;
76                                 case 'k':
77                                         kimuraR = atoi( *++argv );
78                                         fprintf( stderr, "kimuraR = %d\n", kimuraR );
79                                         --argc;
80                                         goto nextoption;
81                                 case 'b':
82                                         nblosum = atoi( *++argv );
83                                         scoremtx = 1;
84                                         fprintf( stderr, "blosum %d\n", nblosum );
85                                         --argc;
86                                         goto nextoption;
87                                 case 'j':
88                                         pamN = atoi( *++argv );
89                                         scoremtx = 0;
90                                         fprintf( stderr, "jtt %d\n", pamN );
91                                         --argc;
92                                         goto nextoption;
93                                 case 'l':
94                                         fastathreshold = atof( *++argv );
95                                         constraint = 2;
96                                         fprintf( stderr, "weighti = %f\n", fastathreshold );
97                                         --argc;
98                                         goto nextoption;
99                                 case 'i':
100                                         corethr = atof( *++argv );
101                                         fprintf( stderr, "corethr = %f\n", corethr );
102                                         --argc;
103                                         goto nextoption;
104                                 case 'm':
105                                         fmodel = 1;
106                                         break;
107                                 case 'c':
108                                         coreext = 1;
109                                         break;
110                                 case 'r':
111                                         fmodel = -1;
112                                         break;
113                                 case 'D':
114                                         dorp = 'd';
115                                         break;
116                                 case 'P':
117                                         dorp = 'p';
118                                         break;
119                                 case 'e':
120                                         fftscore = 0;
121                                         break;
122                                 case 'O':
123                                         fftNoAnchStop = 1;
124                                         break;
125                                 case 'R':
126                                         fftRepeatStop = 1;
127                                         break;
128                                 case 'Q':
129                                         calledByXced = 1;
130                                         break;
131                                 case 's':
132                                         treemethod = 's';
133                                         break;
134                                 case 'x':
135                                         treemethod = 'x';
136                                         break;
137                                 case 'p':
138                                         treemethod = 'p';
139                                         break;
140                                 case 'a':
141                                         alg = 'a';
142                                         break;
143                                 case 'A':
144                                         alg = 'A';
145                                         break;
146                                 case 'S':
147                                         alg = 'S';
148                                         break;
149                                 case 'C':
150                                         alg = 'C';
151                                         break;
152                                 case 'F':
153                                         use_fft = 1;
154                                         break;
155                                 case 'v':
156                                         tbrweight = 3;
157                                         break;
158                                 case 'd':
159                                         disp = 1;
160                                         break;
161                                 case 'o':
162                                         outgap = 0;
163                                         break;
164 /* Modified 01/08/27, default: user tree */
165                                 case 'J':
166                                         tbutree = 0;
167                                         break;
168 /* modification end. */
169                                 case 'z':
170                                         fftThreshold = atoi( *++argv );
171                                         --argc; 
172                                         goto nextoption;
173                                 case 'w':
174                                         fftWinSize = atoi( *++argv );
175                                         --argc;
176                                         goto nextoption;
177                                 case 'Z':
178                                         checkC = 1;
179                                         break;
180                 default:
181                     fprintf( stderr, "illegal option %c\n", c );
182                     argc = 0;
183                     break;
184             }
185                 }
186                 nextoption:
187                         ;
188         }
189     if( argc == 1 )
190     {
191         cut = atof( (*argv) );
192         argc--;
193     }
194     if( argc != 0 ) 
195     {
196         fprintf( stderr, "options: Check source file !\n" );
197         exit( 1 );
198     }
199         if( tbitr == 1 && outgap == 0 )
200         {
201                 fprintf( stderr, "conflicting options : o, m or u\n" );
202                 exit( 1 );
203         }
204         if( alg == 'C' && outgap == 0 )
205         {
206                 fprintf( stderr, "conflicting options : C, o\n" );
207                 exit( 1 );
208         }
209 }
210
211
212
213 static void WriteOptions( FILE *fp )
214 {
215
216         if( dorp == 'd' ) fprintf( fp, "DNA\n" );
217         else
218         {
219                 if     ( scoremtx ==  0 ) fprintf( fp, "JTT %dPAM\n", pamN );
220                 else if( scoremtx ==  1 ) fprintf( fp, "BLOSUM %d\n", nblosum );
221                 else if( scoremtx ==  2 ) fprintf( fp, "M-Y\n" );
222         }
223     fprintf( stderr, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
224     if( use_fft ) fprintf( fp, "FFT on\n" );
225
226         fprintf( fp, "tree-base method\n" );
227         if( tbrweight == 0 ) fprintf( fp, "unweighted\n" );
228         else if( tbrweight == 3 ) fprintf( fp, "clustalw-like weighting\n" );
229         if( tbitr || tbweight ) 
230         {
231                 fprintf( fp, "iterate at each step\n" );
232                 if( tbitr && tbrweight == 0 ) fprintf( fp, "  unweighted\n" ); 
233                 if( tbitr && tbrweight == 3 ) fprintf( fp, "  reversely weighted\n" ); 
234                 if( tbweight ) fprintf( fp, "  weighted\n" ); 
235                 fprintf( fp, "\n" );
236         }
237
238          fprintf( fp, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
239
240         if( alg == 'a' )
241                 fprintf( fp, "Algorithm A\n" );
242         else if( alg == 'A' ) 
243                 fprintf( fp, "Algorithm A+\n" );
244         else if( alg == 'S' ) 
245                 fprintf( fp, "Apgorithm S\n" );
246         else if( alg == 'C' ) 
247                 fprintf( fp, "Apgorithm A+/C\n" );
248         else
249                 fprintf( fp, "Unknown algorithm\n" );
250
251         if( treemethod == 'x' )
252                 fprintf( fp, "Tree = UPGMA (3).\n" );
253         else if( treemethod == 's' )
254                 fprintf( fp, "Tree = UPGMA (2).\n" );
255         else if( treemethod == 'p' )
256                 fprintf( fp, "Tree = UPGMA (1).\n" );
257         else
258                 fprintf( fp, "Unknown tree.\n" );
259
260     if( use_fft )
261     {
262         fprintf( fp, "FFT on\n" );
263         if( dorp == 'd' )
264             fprintf( fp, "Basis : 4 nucleotides\n" );
265         else
266         {
267             if( fftscore )
268                 fprintf( fp, "Basis : Polarity and Volume\n" );
269             else
270                 fprintf( fp, "Basis : 20 amino acids\n" );
271         }
272         fprintf( fp, "Threshold   of anchors = %d%%\n", fftThreshold );
273         fprintf( fp, "window size of anchors = %dsites\n", fftWinSize );
274     }
275         else
276         fprintf( fp, "FFT off\n" );
277         fflush( fp );
278 }
279          
280
281 int main( int argc, char *argv[] )
282 {
283         static int  nlen[M];    
284         static char name[M][B], **seq;
285         static char **oseq;
286         static double **pscore;
287         static double *eff;
288         static double **node0, **node1;
289         static double *gapc;
290         static double *avgap;
291         double tmpavgap;
292         int i, j, m, goffset;
293         static int ***topol;
294         static double **len;
295         FILE *prep;
296         char c;
297         int corestart, coreend;
298         int alloclen;
299         int winsize;
300         char *pt, *ot;
301         double gapmin;
302
303         arguments( argc, argv );
304
305         getnumlen( stdin );
306         rewind( stdin );
307
308         if( njob < 2 )
309         {
310                 fprintf( stderr, "At least 2 sequences should be input!\n"
311                                                  "Only %d sequence found.\n", njob ); 
312                 exit( 1 );
313         }
314
315         seq = AllocateCharMtx( njob, nlenmax*9+1 );
316         oseq = AllocateCharMtx( njob, nlenmax*9+1 );
317         alloclen = nlenmax*9;
318
319         topol = AllocateIntCub( njob, 2, njob );
320         len = AllocateDoubleMtx( njob, 2 );
321         pscore = AllocateDoubleMtx( njob, njob );
322         eff = AllocateDoubleVec( njob );
323         node0 = AllocateDoubleMtx( njob, njob );
324         node1 = AllocateDoubleMtx( njob, njob );
325         gapc = AllocateDoubleVec( alloclen );
326         avgap = AllocateDoubleVec( alloclen );
327
328 #if 0
329         Read( name, nlen, seq );
330 #else
331         readData( stdin, name, nlen, seq );
332 #endif
333
334         constants( njob, seq );
335
336 #if 0
337         fprintf( stderr, "params = %d, %d, %d\n", penalty, penalty_ex, offset );
338 #endif
339
340         initSignalSM();
341
342         initFiles();
343
344         WriteOptions( trap_g );
345
346         c = seqcheck( seq );
347         if( c )
348         {
349                 fprintf( stderr, "Illeagal character %c\n", c );
350                 exit( 1 );
351         }
352
353         writePre( njob, name, nlen, seq, 0 );
354
355         if( tbutree == 0 )
356         {
357                 for( i=1; i<njob; i++ ) 
358                 {
359                         if( nlen[i] != nlen[0] ) 
360                         {
361                                 fprintf( stderr, "Input pre-aligned seqences or make hat2.\n" );
362                                 exit( 1 );
363                         }
364                 }
365                 for( i=0; i<njob-1; i++ ) for( j=i+1; j<njob; j++ ) 
366                 {
367                 /*
368                         pscore[i][j] = (double)score_calc1( seq[i], seq[j] );
369                 */
370                         pscore[i][j] = (double)substitution_hosei( seq[i], seq[j] );
371                 }
372         }
373         else
374         {
375                 fprintf( stderr, "Loading 'hat2' ... " );
376                 prep = fopen( "hat2", "r" );
377                 if( prep == NULL ) ErrorExit( "Make hat2." );
378                 readhat2( prep, njob, name, pscore );
379                 fclose( prep );
380                 fprintf( stderr, "done.\n" );
381
382 #if 0
383                 prep = fopen( "hat2_check", "w" );
384                 WriteHat2( prep, njob, name, pscore );
385                 fclose( prep );
386 #endif
387
388         }
389
390         fprintf( stderr, "Constructing dendrogram ... " );
391         if( treemethod == 'x' )
392                 supg( njob, pscore, topol, len );
393         else if( treemethod == 's' )
394                 spg( njob, pscore, topol, len );
395         else if( treemethod == 'p' )
396                 upg2( njob, pscore, topol, len );
397         else 
398                 ErrorExit( "Incorrect tree\n" );
399         fprintf( stderr, "done.\n" );
400
401         countnode( njob, topol, node0 );
402         if( tbrweight )
403         {
404                 weight = 3; 
405 #if 0
406                 utree = 0; counteff( njob, topol, len, eff ); utree = 1;
407 #else
408                 counteff_simple( njob, topol, len, eff );
409 #endif
410         }
411         else
412         {
413                 for( i=0; i<njob; i++ ) eff[i] = 1.0;
414         }
415
416
417         for( i=0; i<nlenmax; i++ )
418         {
419                 gapc[i] = 0.0;
420                 for( j=0; j<njob; j++ )
421                 {
422                         if( seq[j][i] == '-' ) gapc[i] += eff[j];
423                 }
424         }
425
426         gapmin = 1.0;
427         winsize = fftWinSize;
428         goffset = winsize/2;
429         tmpavgap = 0.0;
430         corestart = coreend = -1;
431         for( i=0; i<winsize; i++ )
432         {
433                 tmpavgap += gapc[i];
434         }
435         for( i=winsize; i<nlenmax; i++ )
436         {
437                 m = i - goffset;
438                 avgap[m] = tmpavgap / winsize;
439 //              fprintf( stdout, "%d %f %f\n", m, avgap[m], gapc[i] );
440                 if( avgap[m] < corethr )
441                 {
442                         if( corestart == -1 )
443                                 corestart = i - winsize;
444 //                      fprintf( stdout, "ok, gapmin = %f, corestart = %d, coreend = %d\n", gapmin, corestart, coreend );
445                         if( avgap[m] < gapmin )
446                         { 
447                                 gapmin = avgap[m];
448                         }
449                         coreend = i;
450                 }
451                 tmpavgap -= gapc[i-winsize];
452                 tmpavgap += gapc[i];
453         }
454         if( corestart == -1 || coreend == -1 )
455         {
456                 corestart = 0;
457                 coreend = nlenmax-1;
458         }
459
460         for( i=0; i<njob; i++ )
461         {
462                 pt = oseq[i];
463                 m = winsize;
464                 while( m-- ) *pt++ = '-';
465                 for( j=corestart; j<=coreend; j++ )
466                         *pt++ = seq[i][j];
467                 m = winsize;
468                 while( m-- ) *pt++ = '-';
469                 *pt = 0;
470
471                 ot = oseq[i]+winsize-1;
472                 pt = seq[i]+corestart-1;
473                 if( coreext ) m = winsize;
474                 else m = 0;
475                 while( m && --pt > seq[i] )
476                         if( *pt != '-' )
477                         {
478                                 *ot-- = *pt;
479                                 m--;
480                         }
481
482                 ot = oseq[i]+winsize+coreend-corestart+1;
483                 pt = seq[i]+coreend;
484                 if( coreext ) m = winsize;
485                 else m = 0;
486                 while( m && *(++pt) )
487                 {
488                         if( *pt != '-' ) 
489                         {
490                                 *ot++ = *pt;
491                                 m--;
492                         }
493                 }
494                 fprintf( stdout, ">%s\n", name[i] );
495                 fprintf( stdout, "%s\n", oseq[i] );
496         }
497
498         exit( 1 );
499
500         SHOWVERSION;
501         return( 0 );
502 }