new mafft v 6.857 with extensions
[jabaws.git] / binaries / src / mafft / core / mafft-distance.c
1 #include "mltaln.h"
2 #include "mtxutl.h"
3
4 #define DEBUG 0
5 #define TEST  0
6
7 #define END_OF_VEC -1
8
9 static int maxl;
10 static int tsize;
11 static char outputformat;
12 static float lenfaca, lenfacb, lenfacc, lenfacd;
13 #define PLENFACA 0.01
14 #define PLENFACB 10000
15 #define PLENFACC 10000
16 #define PLENFACD 0.1
17 #define DLENFACA 0.01
18 #define DLENFACB 2500
19 #define DLENFACC 2500
20 #define DLENFACD 0.1
21
22 void arguments( int argc, char *argv[] )
23 {
24         int c;
25
26         inputfile = NULL;
27         outputformat = 's';
28         scoremtx = 1;
29         nblosum = 62;
30         dorp = NOTSPECIFIED;
31         alg = 'X';
32
33     while( --argc > 0 && (*++argv)[0] == '-' )
34         {
35         while ( (c = *++argv[0]) )
36                 {
37             switch( c )
38             {
39                                 case 'i':
40                                         inputfile = *++argv;
41                                         fprintf( stderr, "inputfile = %s\n", inputfile );
42                                         --argc;
43                                         goto nextoption;
44                                 case 'p':
45                                         outputformat = 'p';
46                                         break;
47                                 case 'D':
48                                         dorp = 'd';
49                                         break;
50                                 case 'P':
51                                         dorp = 'p';
52                                         break;
53                 default:
54                     fprintf( stderr, "illegal option %c\n", c );
55                     argc = 0;
56                     break;
57             }
58                 }
59                 nextoption:
60                         ;
61         }
62         if( inputfile == NULL )
63         {
64                 argc--;
65                 inputfile = *argv;
66                 fprintf( stderr, "inputfile = %s\n", inputfile );
67         }
68     if( argc != 0 )
69     {
70         fprintf( stderr, "Usage: mafft-distance [-PD] [-i inputfile] inputfile > outputfile\n" );
71         exit( 1 );
72     }
73 }
74
75 void seq_grp_nuc( int *grp, char *seq )
76 {
77         int tmp;
78         int *grpbk = grp;
79         while( *seq )
80         {
81                 tmp = amino_grp[(int)*seq++];
82                 if( tmp < 4 )
83                         *grp++ = tmp;
84                 else
85                         fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
86         }
87         *grp = END_OF_VEC;
88         if( grp - grpbk < 6 )
89         {
90                 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
91 //              exit( 1 );
92                 *grpbk = -1;
93         }
94 }
95
96 void seq_grp( int *grp, char *seq )
97 {
98         int tmp;
99         int *grpbk = grp;
100         while( *seq )
101         {
102                 tmp = amino_grp[(int)*seq++];
103                 if( tmp < 6 )
104                         *grp++ = tmp;
105                 else
106                         fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
107         }
108         *grp = END_OF_VEC;
109         if( grp - grpbk < 6 )
110         {
111                 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
112 //              exit( 1 );
113                 *grpbk = -1;
114         }
115 }
116
117 void makecompositiontable_p( short *table, int *pointt )
118 {
119         int point;
120
121         while( ( point = *pointt++ ) != END_OF_VEC )
122                 table[point]++;
123 }
124
125 int commonsextet_p( short *table, int *pointt )
126 {
127         int value = 0;
128         short tmp;
129         int point;
130         static short *memo = NULL;
131         static int *ct = NULL;
132         static int *cp;
133
134         if( *pointt == -1 )
135                 return( 0 );
136
137         if( !memo )
138         {
139                 memo = (short *)calloc( tsize, sizeof( short ) );
140                 if( !memo ) ErrorExit( "Cannot allocate memo\n" );
141                 ct = (int *)calloc( MIN( maxl, tsize)+1, sizeof( int ) );
142                 if( !ct ) ErrorExit( "Cannot allocate memo\n" );
143         }
144
145         cp = ct;
146         while( ( point = *pointt++ ) != END_OF_VEC )
147         {
148                 tmp = memo[point]++;
149                 if( tmp < table[point] )
150                         value++;
151                 if( tmp == 0 ) *cp++ = point;
152 //              fprintf( stderr, "cp - ct = %d (tsize = %d)\n", cp - ct, tsize );
153         }
154         *cp = END_OF_VEC;
155         
156         cp =  ct;
157         while( *cp != END_OF_VEC )
158                 memo[*cp++] = 0;
159
160         return( value );
161 }
162
163 void makepointtable_nuc( int *pointt, int *n )
164 {
165         int point;
166         register int *p;
167
168         if( *n == -1 )
169         {
170                 *pointt = -1;
171                 return;
172         }
173
174         p = n;
175         point  = *n++ *  1024;
176         point += *n++ *   256;
177         point += *n++ *    64;
178         point += *n++ *    16;
179         point += *n++ *     4;
180         point += *n++;
181         *pointt++ = point;
182
183         while( *n != END_OF_VEC )
184         {
185                 point -= *p++ * 1024;
186                 point *= 4;
187                 point += *n++;
188                 *pointt++ = point;
189         }
190         *pointt = END_OF_VEC;
191 }
192
193 void makepointtable( int *pointt, int *n )
194 {
195         int point;
196         register int *p;
197
198         if( *n == -1 )
199         {
200                 *pointt = -1;
201                 return;
202         }
203
204         p = n;
205         point  = *n++ *  7776;
206         point += *n++ *  1296;
207         point += *n++ *   216;
208         point += *n++ *    36;
209         point += *n++ *     6;
210         point += *n++;
211         *pointt++ = point;
212
213         while( *n != END_OF_VEC )
214         {
215                 point -= *p++ * 7776;
216                 point *= 6;
217                 point += *n++;
218                 *pointt++ = point;
219         }
220         *pointt = END_OF_VEC;
221 }
222
223 int main( int argc, char **argv )
224 {
225         int i, j, initj;
226         FILE *infp;
227         char **seq;
228         int *grpseq;
229         char *tmpseq;
230         int  **pointt;
231         static char **name;
232         static int *nlen;
233         double *mtxself;
234         float score;
235         static short *table1;
236         float longer, shorter;
237         float lenfac;
238         float bunbo;
239
240         arguments( argc, argv );
241
242         if( inputfile )
243         {
244                 infp = fopen( inputfile, "r" );
245                 if( !infp )
246                 {
247                         fprintf( stderr, "Cannot open %s\n", inputfile );
248                         exit( 1 );
249                 }
250         }
251         else
252                 infp = stdin;
253
254 #if 0
255         PreRead( stdin, &njob, &nlenmax );
256 #else
257         getnumlen( infp );
258 #endif
259         rewind( infp );
260         if( njob < 2 )
261         {
262                 fprintf( stderr, "At least 2 sequences should be input!\n"
263                                                  "Only %d sequence found.\n", njob );
264                 exit( 1 );
265         }
266
267         tmpseq = AllocateCharVec( nlenmax+1 );
268         seq = AllocateCharMtx( njob, nlenmax+1 );
269         grpseq = AllocateIntVec( nlenmax+1 );
270         pointt = AllocateIntMtx( njob, nlenmax+1 );
271         mtxself = AllocateDoubleVec( njob );
272         pamN = NOTSPECIFIED;
273         name = AllocateCharMtx( njob, B );
274         nlen = AllocateIntVec( njob );
275
276 #if 0
277         FRead( infp, name, nlen, seq );
278 #else
279         readData_pointer( infp, name, nlen, seq );
280 #endif
281
282         fclose( infp );
283
284         constants( njob, seq );
285
286         if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
287         else              tsize = (int)pow( 6, 6 );
288
289         if( dorp == 'd' )
290         {
291                 lenfaca = DLENFACA;
292                 lenfacb = DLENFACB;
293                 lenfacc = DLENFACC;
294                 lenfacd = DLENFACD;
295         }
296         else    
297         {
298                 lenfaca = PLENFACA;
299                 lenfacb = PLENFACB;
300                 lenfacc = PLENFACC;
301                 lenfacd = PLENFACD;
302         }
303
304         maxl = 0;
305         for( i=0; i<njob; i++ ) 
306         {
307                 gappick0( tmpseq, seq[i] );
308                 nlen[i] = strlen( tmpseq );
309 //              if( nlen[i] < 6 )
310 //              {
311 //                      fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] );
312 //                      exit( 1 );
313 //              }
314                 if( nlen[i] > maxl ) maxl = nlen[i];
315                 if( dorp == 'd' ) /* nuc */
316                 {
317                         seq_grp_nuc( grpseq, tmpseq );
318                         makepointtable_nuc( pointt[i], grpseq );
319                 }
320                 else                 /* amino */
321                 {
322                         seq_grp( grpseq, tmpseq );
323                         makepointtable( pointt[i], grpseq );
324                 }
325         }
326         fprintf( stderr, "\nCalculating i-i scores ... " );
327         for( i=0; i<njob; i++ )
328         {
329                 table1 = (short *)calloc( tsize, sizeof( short ) );
330                 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
331                 makecompositiontable_p( table1, pointt[i] );
332
333                 score = commonsextet_p( table1, pointt[i] );
334                 mtxself[i] = score;
335                 free( table1 );
336         }
337
338         fprintf( stderr, "done.\n" );
339         fprintf( stderr, "\nCalculating i-j scores ... \n" );
340         if( outputformat == 'p' ) fprintf( stdout, "%-5d", njob );
341         for( i=0; i<njob; i++ )
342         {
343                 if( outputformat == 'p' ) fprintf( stdout, "\n%-9d ", i+1 );
344                 table1 = (short *)calloc( tsize, sizeof( short ) );
345                 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
346                 if( i % 10 == 0 )
347                 {
348                         fprintf( stderr, "%4d / %4d\r", i+1, njob );
349                 }
350                 makecompositiontable_p( table1, pointt[i] );
351
352
353                 if( outputformat == 'p' ) initj = 0;
354                 else                      initj = i+1;
355                 for( j=initj; j<njob; j++ ) 
356                 {
357                         if( nlen[i] > nlen[j] )
358                         {
359                                 longer=(float)nlen[i];
360                                 shorter=(float)nlen[j];
361                         }
362                         else
363                         {
364                                 longer=(float)nlen[j];
365                                 shorter=(float)nlen[i];
366                         }
367 //                      lenfac = 3.0 / ( LENFACA + LENFACB / ( longer + LENFACC ) + shorter / longer * LENFACD );
368                         lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
369 //                      lenfac = 1.0;
370 //                      fprintf( stderr, "lenfac = %f (%.0f,%.0f)\n", lenfac, longer, shorter );
371                         score = commonsextet_p( table1, pointt[j] );
372                         bunbo = MIN( mtxself[i], mtxself[j] );
373                         if( outputformat == 'p' )
374                         {
375                                 if( bunbo == 0.0 )
376                                         fprintf( stdout, " %8.6f", 1.0 );
377                                 else
378                                         fprintf( stdout, " %8.6f", ( 1.0 - score / bunbo ) * lenfac );
379                                 if( j % 7 == 6 ) fprintf( stdout, "\n" );
380                         }
381                         else
382                         {
383                                 if( bunbo == 0.0 )
384                                         fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, 1.0, nlen[i], nlen[j] );
385                                 else
386                                         fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / bunbo ) * lenfac, nlen[i], nlen[j] );
387                         }
388 //                      fprintf( stderr, "##### mtx = %f, mtx[i][0]=%f, mtx[j][0]=%f, bunbo=%f\n", mtx[i][j-i], mtx[i][0], mtx[j][0], bunbo );
389 //          score = (double)commonsextet_p( table1, pointt[j] );
390 //                      fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / MIN( mtxself[i], mtxself[j] ) ) * 3, nlen[i], nlen[j] );
391
392
393                 } 
394                 free( table1 );
395         }
396                 
397         fprintf( stderr, "\n" );
398         if( outputformat == 'p' ) fprintf( stdout, "\n" );
399         SHOWVERSION;
400         exit( 0 );
401 }