new mafft v 6.857 with extensions
[jabaws.git] / binaries / src / mafft / core / setcore.c
1 #include "mltaln.h"
2
3 #define DEBUG 0
4 #define IODEBUG 0
5 #define SCOREOUT 1
6
7 double corethr;
8 int coreext;
9
10 void arguments( int argc, char *argv[] )
11 {
12     int c;
13
14         fftkeika = 1;
15         constraint = 0;
16         nblosum = 62;
17         fmodel = 0;
18         calledByXced = 0;
19         devide = 0;
20         use_fft = 0;
21         fftscore = 1;
22         fftRepeatStop = 0;
23         fftNoAnchStop = 0;
24     weight = 3;
25     utree = 1;
26         tbutree = 1;
27     refine = 0;
28     check = 1;
29     cut = 0.0;
30     disp = 0;
31     outgap = 1;
32     alg = 'A';
33     mix = 0;
34         tbitr = 0;
35         scmtd = 5;
36         tbweight = 0;
37         tbrweight = 3;
38         checkC = 0;
39         treemethod = 'x';
40         contin = 0;
41         scoremtx = 0;
42         kobetsubunkatsu = 0;
43         dorp = NOTSPECIFIED;
44         ppenalty = NOTSPECIFIED;
45         ppenalty_ex = NOTSPECIFIED;
46         poffset = NOTSPECIFIED;
47         kimuraR = NOTSPECIFIED;
48         pamN = NOTSPECIFIED;
49         geta2 = GETA2;
50         fftWinSize = NOTSPECIFIED;
51         fftThreshold = NOTSPECIFIED;
52         corethr = .5;
53         coreext = 0;
54
55     while( --argc > 0 && (*++argv)[0] == '-' )
56         {
57         while ( ( c = *++argv[0] ) )
58                 {
59             switch( c )
60             {
61                                 case 'f':
62                                         ppenalty = (int)( atof( *++argv ) * 1000 - 0.5 );
63                                         fprintf( stderr, "ppenalty = %d\n", ppenalty );
64                                         --argc;
65                                         goto nextoption;
66                                 case 'g':
67                                         ppenalty_ex = (int)( atof( *++argv ) * 1000 - 0.5 );
68                                         fprintf( stderr, "ppenalty_ex = %d\n", ppenalty_ex );
69                                         --argc;
70                                         goto nextoption;
71                                 case 'h':
72                                         poffset = (int)( atof( *++argv ) * 1000 - 0.5 );
73                                         fprintf( stderr, "poffset = %d\n", poffset );
74                                         --argc;
75                                         goto nextoption;
76                                 case 'k':
77                                         kimuraR = atoi( *++argv );
78                                         fprintf( stderr, "kimuraR = %d\n", kimuraR );
79                                         --argc;
80                                         goto nextoption;
81                                 case 'b':
82                                         nblosum = atoi( *++argv );
83                                         scoremtx = 1;
84                                         fprintf( stderr, "blosum %d\n", nblosum );
85                                         --argc;
86                                         goto nextoption;
87                                 case 'j':
88                                         pamN = atoi( *++argv );
89                                         scoremtx = 0;
90                                         fprintf( stderr, "jtt %d\n", pamN );
91                                         --argc;
92                                         goto nextoption;
93                                 case 'l':
94                                         fastathreshold = atof( *++argv );
95                                         constraint = 2;
96                                         fprintf( stderr, "weighti = %f\n", fastathreshold );
97                                         --argc;
98                                         goto nextoption;
99                                 case 'i':
100                                         corethr = atof( *++argv );
101                                         fprintf( stderr, "corethr = %f\n", corethr );
102                                         --argc;
103                                         goto nextoption;
104                                 case 'm':
105                                         fmodel = 1;
106                                         break;
107                                 case 'c':
108                                         coreext = 1;
109                                         break;
110                                 case 'r':
111                                         fmodel = -1;
112                                         break;
113                                 case 'D':
114                                         dorp = 'd';
115                                         break;
116                                 case 'P':
117                                         dorp = 'p';
118                                         break;
119                                 case 'e':
120                                         fftscore = 0;
121                                         break;
122                                 case 'O':
123                                         fftNoAnchStop = 1;
124                                         break;
125                                 case 'R':
126                                         fftRepeatStop = 1;
127                                         break;
128                                 case 'Q':
129                                         calledByXced = 1;
130                                         break;
131                                 case 's':
132                                         treemethod = 's';
133                                         break;
134                                 case 'x':
135                                         treemethod = 'x';
136                                         break;
137                                 case 'p':
138                                         treemethod = 'p';
139                                         break;
140                                 case 'a':
141                                         alg = 'a';
142                                         break;
143                                 case 'A':
144                                         alg = 'A';
145                                         break;
146                                 case 'S':
147                                         alg = 'S';
148                                         break;
149                                 case 'C':
150                                         alg = 'C';
151                                         break;
152                                 case 'F':
153                                         use_fft = 1;
154                                         break;
155                                 case 'v':
156                                         tbrweight = 3;
157                                         break;
158                                 case 'd':
159                                         disp = 1;
160                                         break;
161                                 case 'o':
162                                         outgap = 0;
163                                         break;
164 /* Modified 01/08/27, default: user tree */
165                                 case 'J':
166                                         tbutree = 0;
167                                         break;
168 /* modification end. */
169                                 case 'z':
170                                         fftThreshold = atoi( *++argv );
171                                         --argc; 
172                                         goto nextoption;
173                                 case 'w':
174                                         fftWinSize = atoi( *++argv );
175                                         --argc;
176                                         goto nextoption;
177                                 case 'Z':
178                                         checkC = 1;
179                                         break;
180                 default:
181                     fprintf( stderr, "illegal option %c\n", c );
182                     argc = 0;
183                     break;
184             }
185                 }
186                 nextoption:
187                         ;
188         }
189     if( argc == 1 )
190     {
191         cut = atof( (*argv) );
192         argc--;
193     }
194     if( argc != 0 ) 
195     {
196         fprintf( stderr, "options: Check source file !\n" );
197         exit( 1 );
198     }
199         if( tbitr == 1 && outgap == 0 )
200         {
201                 fprintf( stderr, "conflicting options : o, m or u\n" );
202                 exit( 1 );
203         }
204         if( alg == 'C' && outgap == 0 )
205         {
206                 fprintf( stderr, "conflicting options : C, o\n" );
207                 exit( 1 );
208         }
209 }
210
211
212
213 static void WriteOptions( FILE *fp )
214 {
215
216         if( dorp == 'd' ) fprintf( fp, "DNA\n" );
217         else
218         {
219                 if     ( scoremtx ==  0 ) fprintf( fp, "JTT %dPAM\n", pamN );
220                 else if( scoremtx ==  1 ) fprintf( fp, "BLOSUM %d\n", nblosum );
221                 else if( scoremtx ==  2 ) fprintf( fp, "M-Y\n" );
222         }
223     fprintf( stderr, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
224     if( use_fft ) fprintf( fp, "FFT on\n" );
225
226         fprintf( fp, "tree-base method\n" );
227         if( tbrweight == 0 ) fprintf( fp, "unweighted\n" );
228         else if( tbrweight == 3 ) fprintf( fp, "clustalw-like weighting\n" );
229         if( tbitr || tbweight ) 
230         {
231                 fprintf( fp, "iterate at each step\n" );
232                 if( tbitr && tbrweight == 0 ) fprintf( fp, "  unweighted\n" ); 
233                 if( tbitr && tbrweight == 3 ) fprintf( fp, "  reversely weighted\n" ); 
234                 if( tbweight ) fprintf( fp, "  weighted\n" ); 
235                 fprintf( fp, "\n" );
236         }
237
238          fprintf( fp, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
239
240         if( alg == 'a' )
241                 fprintf( fp, "Algorithm A\n" );
242         else if( alg == 'A' ) 
243                 fprintf( fp, "Algorithm A+\n" );
244         else if( alg == 'S' ) 
245                 fprintf( fp, "Apgorithm S\n" );
246         else if( alg == 'C' ) 
247                 fprintf( fp, "Apgorithm A+/C\n" );
248         else
249                 fprintf( fp, "Unknown algorithm\n" );
250
251         if( treemethod == 'x' )
252                 fprintf( fp, "Tree = UPGMA (3).\n" );
253         else if( treemethod == 's' )
254                 fprintf( fp, "Tree = UPGMA (2).\n" );
255         else if( treemethod == 'p' )
256                 fprintf( fp, "Tree = UPGMA (1).\n" );
257         else
258                 fprintf( fp, "Unknown tree.\n" );
259
260     if( use_fft )
261     {
262         fprintf( fp, "FFT on\n" );
263         if( dorp == 'd' )
264             fprintf( fp, "Basis : 4 nucleotides\n" );
265         else
266         {
267             if( fftscore )
268                 fprintf( fp, "Basis : Polarity and Volume\n" );
269             else
270                 fprintf( fp, "Basis : 20 amino acids\n" );
271         }
272         fprintf( fp, "Threshold   of anchors = %d%%\n", fftThreshold );
273         fprintf( fp, "window size of anchors = %dsites\n", fftWinSize );
274     }
275         else
276         fprintf( fp, "FFT off\n" );
277         fflush( fp );
278 }
279          
280
281 int main( int argc, char *argv[] )
282 {
283         static int  nlen[M];    
284         static char **name, **seq;
285         static char **oseq;
286         static double **pscore;
287         static double *eff;
288         static double **node0, **node1;
289         static double *gapc;
290         static double *avgap;
291         double tmpavgap;
292         int i, j, m, goffset;
293         static int ***topol;
294         static double **len;
295         FILE *prep;
296         char c;
297         int corestart, coreend;
298         int alloclen;
299         int winsize;
300         char *pt, *ot;
301         double gapmin;
302
303         arguments( argc, argv );
304
305         getnumlen( stdin );
306         rewind( stdin );
307
308         if( njob < 2 )
309         {
310                 fprintf( stderr, "At least 2 sequences should be input!\n"
311                                                  "Only %d sequence found.\n", njob ); 
312                 exit( 1 );
313         }
314
315         seq = AllocateCharMtx( njob, nlenmax*9+1 );
316         name = AllocateCharMtx( njob, B+1 );
317         oseq = AllocateCharMtx( njob, nlenmax*9+1 );
318         alloclen = nlenmax*9;
319
320         topol = AllocateIntCub( njob, 2, njob );
321         len = AllocateDoubleMtx( njob, 2 );
322         pscore = AllocateDoubleMtx( njob, njob );
323         eff = AllocateDoubleVec( njob );
324         node0 = AllocateDoubleMtx( njob, njob );
325         node1 = AllocateDoubleMtx( njob, njob );
326         gapc = AllocateDoubleVec( alloclen );
327         avgap = AllocateDoubleVec( alloclen );
328
329 #if 0
330         Read( name, nlen, seq );
331 #else
332         readData_pointer( stdin, name, nlen, seq );
333 #endif
334
335         constants( njob, seq );
336
337 #if 0
338         fprintf( stderr, "params = %d, %d, %d\n", penalty, penalty_ex, offset );
339 #endif
340
341         initSignalSM();
342
343         initFiles();
344
345         WriteOptions( trap_g );
346
347         c = seqcheck( seq );
348         if( c )
349         {
350                 fprintf( stderr, "Illeagal character %c\n", c );
351                 exit( 1 );
352         }
353
354         writePre( njob, name, nlen, seq, 0 );
355
356         if( tbutree == 0 )
357         {
358                 for( i=1; i<njob; i++ ) 
359                 {
360                         if( nlen[i] != nlen[0] ) 
361                         {
362                                 fprintf( stderr, "Input pre-aligned seqences or make hat2.\n" );
363                                 exit( 1 );
364                         }
365                 }
366                 for( i=0; i<njob-1; i++ ) for( j=i+1; j<njob; j++ ) 
367                 {
368                 /*
369                         pscore[i][j] = (double)score_calc1( seq[i], seq[j] );
370                 */
371                         pscore[i][j] = (double)substitution_hosei( seq[i], seq[j] );
372                 }
373         }
374         else
375         {
376                 fprintf( stderr, "Loading 'hat2' ... " );
377                 prep = fopen( "hat2", "r" );
378                 if( prep == NULL ) ErrorExit( "Make hat2." );
379                 readhat2_pointer( prep, njob, name, pscore );
380                 fclose( prep );
381                 fprintf( stderr, "done.\n" );
382
383 #if 0
384                 prep = fopen( "hat2_check", "w" );
385                 WriteHat2( prep, njob, name, pscore );
386                 fclose( prep );
387 #endif
388
389         }
390
391         fprintf( stderr, "Constructing dendrogram ... " );
392         if( treemethod == 'x' )
393                 supg( njob, pscore, topol, len );
394         else if( treemethod == 's' )
395                 spg( njob, pscore, topol, len );
396         else if( treemethod == 'p' )
397                 upg2( njob, pscore, topol, len );
398         else 
399                 ErrorExit( "Incorrect tree\n" );
400         fprintf( stderr, "done.\n" );
401
402         countnode( njob, topol, node0 );
403         if( tbrweight )
404         {
405                 weight = 3; 
406 #if 0
407                 utree = 0; counteff( njob, topol, len, eff ); utree = 1;
408 #else
409                 counteff_simple( njob, topol, len, eff );
410 #endif
411         }
412         else
413         {
414                 for( i=0; i<njob; i++ ) eff[i] = 1.0;
415         }
416
417
418         for( i=0; i<nlenmax; i++ )
419         {
420                 gapc[i] = 0.0;
421                 for( j=0; j<njob; j++ )
422                 {
423                         if( seq[j][i] == '-' ) gapc[i] += eff[j];
424                 }
425         }
426
427         gapmin = 1.0;
428         winsize = fftWinSize;
429         goffset = winsize/2;
430         tmpavgap = 0.0;
431         corestart = coreend = -1;
432         for( i=0; i<winsize; i++ )
433         {
434                 tmpavgap += gapc[i];
435         }
436         for( i=winsize; i<nlenmax; i++ )
437         {
438                 m = i - goffset;
439                 avgap[m] = tmpavgap / winsize;
440 //              fprintf( stdout, "%d %f %f\n", m, avgap[m], gapc[i] );
441                 if( avgap[m] < corethr )
442                 {
443                         if( corestart == -1 )
444                                 corestart = i - winsize;
445 //                      fprintf( stdout, "ok, gapmin = %f, corestart = %d, coreend = %d\n", gapmin, corestart, coreend );
446                         if( avgap[m] < gapmin )
447                         { 
448                                 gapmin = avgap[m];
449                         }
450                         coreend = i;
451                 }
452                 tmpavgap -= gapc[i-winsize];
453                 tmpavgap += gapc[i];
454         }
455         if( corestart == -1 || coreend == -1 )
456         {
457                 corestart = 0;
458                 coreend = nlenmax-1;
459         }
460
461         for( i=0; i<njob; i++ )
462         {
463                 pt = oseq[i];
464                 m = winsize;
465                 while( m-- ) *pt++ = '-';
466                 for( j=corestart; j<=coreend; j++ )
467                         *pt++ = seq[i][j];
468                 m = winsize;
469                 while( m-- ) *pt++ = '-';
470                 *pt = 0;
471
472                 ot = oseq[i]+winsize-1;
473                 pt = seq[i]+corestart-1;
474                 if( coreext ) m = winsize;
475                 else m = 0;
476                 while( m && --pt > seq[i] )
477                         if( *pt != '-' )
478                         {
479                                 *ot-- = *pt;
480                                 m--;
481                         }
482
483                 ot = oseq[i]+winsize+coreend-corestart+1;
484                 pt = seq[i]+coreend;
485                 if( coreext ) m = winsize;
486                 else m = 0;
487                 while( m && *(++pt) )
488                 {
489                         if( *pt != '-' ) 
490                         {
491                                 *ot++ = *pt;
492                                 m--;
493                         }
494                 }
495                 fprintf( stdout, ">%s\n", name[i] );
496                 fprintf( stdout, "%s\n", oseq[i] );
497         }
498
499         exit( 1 );
500
501         SHOWVERSION;
502         return( 0 );
503 }