todo update
[jabaws.git] / binaries / src / mafft / core / mafft-distance.c
1 #include "mltaln.h"
2 #include "mtxutl.h"
3
4 #define DEBUG 0
5 #define TEST  0
6
7 #define END_OF_VEC -1
8
9 static int maxl;
10 static int tsize;
11 static float lenfaca, lenfacb, lenfacc, lenfacd;
12 #define PLENFACA 0.01
13 #define PLENFACB 10000
14 #define PLENFACC 10000
15 #define PLENFACD 0.1
16 #define DLENFACA 0.01
17 #define DLENFACB 2500
18 #define DLENFACC 2500
19 #define DLENFACD 0.1
20
21 void arguments( int argc, char *argv[] )
22 {
23         int c;
24
25         inputfile = NULL;
26         scoremtx = 1;
27         nblosum = 62;
28         dorp = NOTSPECIFIED;
29         alg = 'X';
30
31     while( --argc > 0 && (*++argv)[0] == '-' )
32         {
33         while ( (c = *++argv[0]) )
34                 {
35             switch( c )
36             {
37                                 case 'i':
38                                         inputfile = *++argv;
39                                         fprintf( stderr, "inputfile = %s\n", inputfile );
40                                         --argc;
41                                         goto nextoption;
42                                 case 'D':
43                                         dorp = 'd';
44                                         break;
45                                 case 'P':
46                                         dorp = 'p';
47                                         break;
48                 default:
49                     fprintf( stderr, "illegal option %c\n", c );
50                     argc = 0;
51                     break;
52             }
53                 }
54                 nextoption:
55                         ;
56         }
57         if( inputfile == NULL )
58         {
59                 argc--;
60                 inputfile = *argv;
61                 fprintf( stderr, "inputfile = %s\n", inputfile );
62         }
63     if( argc != 0 )
64     {
65         fprintf( stderr, "Usage: mafft-distance [-PD] [-i inputfile] inputfile > outputfile\n" );
66         exit( 1 );
67     }
68 }
69
70 void seq_grp_nuc( int *grp, char *seq )
71 {
72         int tmp;
73         int *grpbk = grp;
74         while( *seq )
75         {
76                 tmp = amino_grp[(int)*seq++];
77                 if( tmp < 4 )
78                         *grp++ = tmp;
79                 else
80                         fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
81         }
82         *grp = END_OF_VEC;
83         if( grp - grpbk < 6 )
84         {
85                 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
86 //              exit( 1 );
87                 *grpbk = -1;
88         }
89 }
90
91 void seq_grp( int *grp, char *seq )
92 {
93         int tmp;
94         int *grpbk = grp;
95         while( *seq )
96         {
97                 tmp = amino_grp[(int)*seq++];
98                 if( tmp < 6 )
99                         *grp++ = tmp;
100                 else
101                         fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
102         }
103         *grp = END_OF_VEC;
104         if( grp - grpbk < 6 )
105         {
106                 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
107 //              exit( 1 );
108                 *grpbk = -1;
109         }
110 }
111
112 void makecompositiontable_p( short *table, int *pointt )
113 {
114         int point;
115
116         while( ( point = *pointt++ ) != END_OF_VEC )
117                 table[point]++;
118 }
119
120 int commonsextet_p( short *table, int *pointt )
121 {
122         int value = 0;
123         short tmp;
124         int point;
125         static short *memo = NULL;
126         static int *ct = NULL;
127         static int *cp;
128
129         if( *pointt == -1 )
130                 return( 0 );
131
132         if( !memo )
133         {
134                 memo = (short *)calloc( tsize, sizeof( short ) );
135                 if( !memo ) ErrorExit( "Cannot allocate memo\n" );
136                 ct = (int *)calloc( MIN( maxl, tsize)+1, sizeof( int ) );
137                 if( !ct ) ErrorExit( "Cannot allocate memo\n" );
138         }
139
140         cp = ct;
141         while( ( point = *pointt++ ) != END_OF_VEC )
142         {
143                 tmp = memo[point]++;
144                 if( tmp < table[point] )
145                         value++;
146                 if( tmp == 0 ) *cp++ = point;
147 //              fprintf( stderr, "cp - ct = %d (tsize = %d)\n", cp - ct, tsize );
148         }
149         *cp = END_OF_VEC;
150         
151         cp =  ct;
152         while( *cp != END_OF_VEC )
153                 memo[*cp++] = 0;
154
155         return( value );
156 }
157
158 void makepointtable_nuc( int *pointt, int *n )
159 {
160         int point;
161         register int *p;
162
163         if( *n == -1 )
164         {
165                 *pointt = -1;
166                 return;
167         }
168
169         p = n;
170         point  = *n++ *  1024;
171         point += *n++ *   256;
172         point += *n++ *    64;
173         point += *n++ *    16;
174         point += *n++ *     4;
175         point += *n++;
176         *pointt++ = point;
177
178         while( *n != END_OF_VEC )
179         {
180                 point -= *p++ * 1024;
181                 point *= 4;
182                 point += *n++;
183                 *pointt++ = point;
184         }
185         *pointt = END_OF_VEC;
186 }
187
188 void makepointtable( int *pointt, int *n )
189 {
190         int point;
191         register int *p;
192
193         if( *n == -1 )
194         {
195                 *pointt = -1;
196                 return;
197         }
198
199         p = n;
200         point  = *n++ *  7776;
201         point += *n++ *  1296;
202         point += *n++ *   216;
203         point += *n++ *    36;
204         point += *n++ *     6;
205         point += *n++;
206         *pointt++ = point;
207
208         while( *n != END_OF_VEC )
209         {
210                 point -= *p++ * 7776;
211                 point *= 6;
212                 point += *n++;
213                 *pointt++ = point;
214         }
215         *pointt = END_OF_VEC;
216 }
217
218 int main( int argc, char **argv )
219 {
220         int i, j;
221         FILE *infp;
222         char **seq;
223         int *grpseq;
224         char *tmpseq;
225         int  **pointt;
226         static char **name;
227         static int *nlen;
228         double *mtxself;
229         float score;
230         static short *table1;
231         float longer, shorter;
232         float lenfac;
233         float bunbo;
234
235         arguments( argc, argv );
236
237         if( inputfile )
238         {
239                 infp = fopen( inputfile, "r" );
240                 if( !infp )
241                 {
242                         fprintf( stderr, "Cannot open %s\n", inputfile );
243                         exit( 1 );
244                 }
245         }
246         else
247                 infp = stdin;
248
249 #if 0
250         PreRead( stdin, &njob, &nlenmax );
251 #else
252         getnumlen( infp );
253 #endif
254         rewind( infp );
255         if( njob < 2 )
256         {
257                 fprintf( stderr, "At least 2 sequences should be input!\n"
258                                                  "Only %d sequence found.\n", njob );
259                 exit( 1 );
260         }
261
262         tmpseq = AllocateCharVec( nlenmax+1 );
263         seq = AllocateCharMtx( njob, nlenmax+1 );
264         grpseq = AllocateIntVec( nlenmax+1 );
265         pointt = AllocateIntMtx( njob, nlenmax+1 );
266         mtxself = AllocateDoubleVec( njob );
267         pamN = NOTSPECIFIED;
268         name = AllocateCharMtx( njob, B );
269         nlen = AllocateIntVec( njob );
270
271 #if 0
272         FRead( infp, name, nlen, seq );
273 #else
274         readData_pointer( infp, name, nlen, seq );
275 #endif
276
277         fclose( infp );
278
279         constants( njob, seq );
280
281         if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
282         else              tsize = (int)pow( 6, 6 );
283
284         if( dorp == 'd' )
285         {
286                 lenfaca = DLENFACA;
287                 lenfacb = DLENFACB;
288                 lenfacc = DLENFACC;
289                 lenfacd = DLENFACD;
290         }
291         else    
292         {
293                 lenfaca = PLENFACA;
294                 lenfacb = PLENFACB;
295                 lenfacc = PLENFACC;
296                 lenfacd = PLENFACD;
297         }
298
299         maxl = 0;
300         for( i=0; i<njob; i++ ) 
301         {
302                 gappick0( tmpseq, seq[i] );
303                 nlen[i] = strlen( tmpseq );
304 //              if( nlen[i] < 6 )
305 //              {
306 //                      fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] );
307 //                      exit( 1 );
308 //              }
309                 if( nlen[i] > maxl ) maxl = nlen[i];
310                 if( dorp == 'd' ) /* nuc */
311                 {
312                         seq_grp_nuc( grpseq, tmpseq );
313                         makepointtable_nuc( pointt[i], grpseq );
314                 }
315                 else                 /* amino */
316                 {
317                         seq_grp( grpseq, tmpseq );
318                         makepointtable( pointt[i], grpseq );
319                 }
320         }
321         fprintf( stderr, "\nCalculating i-i scores ... " );
322         for( i=0; i<njob; i++ )
323         {
324                 table1 = (short *)calloc( tsize, sizeof( short ) );
325                 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
326                 makecompositiontable_p( table1, pointt[i] );
327
328                 score = commonsextet_p( table1, pointt[i] );
329                 mtxself[i] = score;
330                 free( table1 );
331         }
332
333         fprintf( stderr, "done.\n" );
334         fprintf( stderr, "\nCalculating i-j scores ... \n" );
335         for( i=0; i<njob; i++ )
336         {
337                 table1 = (short *)calloc( tsize, sizeof( short ) );
338                 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
339                 if( i % 10 == 0 )
340                 {
341                         fprintf( stderr, "%4d / %4d\r", i+1, njob );
342                 }
343                 makecompositiontable_p( table1, pointt[i] );
344
345
346         for( j=i+1; j<njob; j++ ) 
347         {
348                         if( nlen[i] > nlen[j] )
349                         {
350                                 longer=(float)nlen[i];
351                                 shorter=(float)nlen[j];
352                         }
353                         else
354                         {
355                                 longer=(float)nlen[j];
356                                 shorter=(float)nlen[i];
357                         }
358 //                      lenfac = 3.0 / ( LENFACA + LENFACB / ( longer + LENFACC ) + shorter / longer * LENFACD );
359                         lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
360 //                      lenfac = 1.0;
361 //                      fprintf( stderr, "lenfac = %f (%.0f,%.0f)\n", lenfac, longer, shorter );
362                         score = commonsextet_p( table1, pointt[j] );
363                         bunbo = MIN( mtxself[i], mtxself[j] );
364                         if( bunbo == 0.0 )
365                                 fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, 1.0, nlen[i], nlen[j] );
366                         else
367                                 fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / bunbo ) * lenfac, nlen[i], nlen[j] );
368 //                      fprintf( stderr, "##### mtx = %f, mtx[i][0]=%f, mtx[j][0]=%f, bunbo=%f\n", mtx[i][j-i], mtx[i][0], mtx[j][0], bunbo );
369 //          score = (double)commonsextet_p( table1, pointt[j] );
370 //                      fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / MIN( mtxself[i], mtxself[j] ) ) * 3, nlen[i], nlen[j] );
371
372
373                 } 
374                 free( table1 );
375         }
376                 
377         fprintf( stderr, "\n" );
378         SHOWVERSION;
379         exit( 0 );
380 }