6 #include "StrEdit_CostMatrix.h"
12 PUBLIC float **read_distance_matrix(char type[]);
13 PUBLIC char **read_sequence_list(int *n_of_seqs, char *mask);
14 PUBLIC float **Hamming_Distance_Matrix(char **seqs, int n_of_seqs);
15 PUBLIC float **StrEdit_SimpleDistMatrix(char **seqs, int n_of_seqs);
16 PUBLIC float **StrEdit_GotohDistMatrix(char **seqs, int n_of_seqs);
17 PUBLIC void free_distance_matrix(float **x);
18 PUBLIC void printf_distance_matrix(float **x);
19 PUBLIC void printf_taxa_list(void);
20 PUBLIC char *get_taxon_label(int whoami);
21 PUBLIC float StrEdit_SimpleDist(char *str1, char *str2);
22 PUBLIC float StrEdit_GotohDist(char *str1, char *str2);
23 PUBLIC void Set_StrEdit_CostMatrix(char type);
24 PUBLIC void Set_StrEdit_GapCosts(float per_digit, float per_gap);
26 /* NOTE: x[0][0] = (float)size_of_matrix; */
28 PRIVATE void read_taxa_list(void);
29 PRIVATE int string_consists_of(char line[],char *mask);
30 PRIVATE float StrEditCost( int i, int j, char *T1, char *T2);
31 PRIVATE int decode(char id);
33 PRIVATE char Taxa_List[MAXSEQS][50];
34 PRIVATE int Taxa_Numbers[MAXSEQS];
35 PRIVATE int N_of_named_taxa=0;
36 PRIVATE char *file_name;
37 PRIVATE char N_of_infiles=0;
38 PRIVATE float **StrEdit_CostMatrix;
39 PRIVATE char *StrEdit_ValidAlphabet;
40 PRIVATE float StrEdit_GapCost = 1.;
41 PRIVATE float StrEdit_GotohAlpha = 1.;
42 PRIVATE float StrEdit_GotohBeta = 1.;
46 PUBLIC float **read_distance_matrix(char type[])
57 if ((line = get_line(stdin))==NULL) return NULL;
58 if (*line =='@') return NULL;
61 if(file_name) free(file_name);
64 file_name = (char *) space(sizeof(char)*strlen(line));
65 sscanf(line,"*%s",file_name);
67 file_name = (char *) space(10);
68 sprintf(file_name,"%d",N_of_infiles);
72 else if (*line=='>') {
75 r = sscanf(line,"> %1s%*[ ] %d", type, &size);
76 fprintf(stderr, "%d ", r);
77 if (r==EOF) return NULL;
78 if((r==2)&&(size>1)) {
79 D=(float **)space((size+1)*sizeof(float *));
80 for(i=0; i<=size; i++)
81 D[i] = (float *)space((size+1)*sizeof(float));
82 D[0][0] = (float)size;
84 for(i=2; i<= size; i++) {
87 if (scanf("%f", &tmp)!=1) {
88 for(i=0;i<=size;i++) free(D[i]);
98 else printf("%s\n",line);
100 else printf(" %s\n", line);
105 /* ------------------------------------------------------------------------- */
107 PUBLIC char **read_sequence_list(int *n_of_seqs, char *mask)
117 if ((line = get_line(stdin))==NULL) break;
119 if(line[0]=='\0') break;
120 if(line[0]=='%') break;
121 if(line[0]=='#') break;
122 if(line[0]=='@') break;
125 if(file_name) free(file_name);
127 file_name = (char *) space(sizeof(char)*strlen(line));
128 sscanf(line,"*%s",file_name);
130 file_name = (char *) space(10);
131 sprintf(file_name,"%d",N_of_infiles);
139 if(string_consists_of(line,mask)){
142 if(isalpha(line[i])) line[i]=toupper(line[i]);
146 switch(toupper(line[i])){
147 case 'G' : case 'A' : case 'X' :
148 line[i] = 'R'; break;
149 case 'U' : case 'C' : case 'K' : case 'T' :
150 line[i] = 'Y'; break;
156 tt[*n_of_seqs] = (char *)space((len+1)*sizeof(char));
157 sscanf(line,"%s",tt[*n_of_seqs]);
163 if(*n_of_seqs == 0) return NULL;
165 sl = (char **) space((*n_of_seqs)*sizeof(char *));
166 for(i=0;i<*n_of_seqs; i++) sl[i] = tt[i];
171 /* -------------------------------------------------------------------------- */
173 PRIVATE int string_consists_of(char *line, char *mask)
176 if((len=strlen(line))==0) return 0; /* empty string ! */
178 for(i=1;i<strlen(mask);i++){ /* mask[0] marks toupper/not_toupper ! */
179 if(mask[i]==line[j]) break;
181 if(i==(strlen(mask))) break; /* letter j no in mask */
183 if(j!=len) return 0; /* loop left before last char -> false */
184 return 1; /* loop left after last char -> true */
186 /* -------------------------------------------------------------------------- */
188 PUBLIC void free_distance_matrix(float **x)
192 for(i=0;i<=n;i++) free(x[i]);
197 /* -------------------------------------------------------------------------- */
199 PUBLIC void printf_distance_matrix(float **x)
203 printf("> X %d\n",n);
206 for(j=1;j<i;j++) printf("%g ",x[i][j]);
212 /* -------------------------------------------------------------------------- */
214 PUBLIC float **Hamming_Distance_Matrix(char **seqs, int n_of_seqs)
218 D = (float **) space((n_of_seqs+1)*sizeof(float *));
219 for(i=0;i<=n_of_seqs;i++)
220 D[i] = (float *) space((n_of_seqs+1)*sizeof(float));
221 D[0][0] = (float) n_of_seqs;
223 for(i=1; i<n_of_seqs; i++) {
226 if(strlen(seqs[i])!=strlen(seqs[j]))
227 nrerror("Unequal Seqence Length for Hamming Distance.");
229 for(k=0;k<strlen(seqs[i]);k++)
230 D[i+1][j+1] += StrEditCost(k+1,k+1,seqs[i],seqs[j]);
231 /* was : (float)(seqs[i][k]!=seqs[j][k]); */
232 D[j+1][i+1] = D[i+1][j+1];
234 D[n_of_seqs][n_of_seqs] = 0.;
239 /* -------------------------------------------------------------------------- */
241 PUBLIC float **StrEdit_SimpleDistMatrix(char **seqs, int n_of_seqs)
245 D = (float **) space((n_of_seqs+1)*sizeof(float *));
246 for(i=0;i<=n_of_seqs;i++)
247 D[i] = (float *) space((n_of_seqs+1)*sizeof(float));
248 D[0][0] = (float) n_of_seqs;
250 for(i=1; i<n_of_seqs; i++) {
253 D[i+1][j+1] = StrEdit_SimpleDist(seqs[i],seqs[j]);
254 D[j+1][i+1] = D[i+1][j+1];
256 D[n_of_seqs][n_of_seqs] = 0.;
261 /* -------------------------------------------------------------------------- */
263 PUBLIC float **StrEdit_GotohDistMatrix(char **seqs, int n_of_seqs)
267 D = (float **) space((n_of_seqs+1)*sizeof(float *));
268 for(i=0;i<=n_of_seqs;i++)
269 D[i] = (float *) space((n_of_seqs+1)*sizeof(float));
270 D[0][0] = (float) n_of_seqs;
272 for(i=1; i<n_of_seqs; i++) {
275 D[i+1][j+1] = StrEdit_GotohDist(seqs[i],seqs[j]);
276 D[j+1][i+1] = D[i+1][j+1];
278 D[n_of_seqs][n_of_seqs] = 0.;
283 /* -------------------------------------------------------------------------- */
285 PRIVATE void read_taxa_list(void)
291 if ((line = get_line(stdin))==NULL) return;
294 sscanf(line,"#%d", &i);
295 if(i<=1) N_of_named_taxa = 0;
298 if ((line = get_line(stdin))==NULL) return;
299 } else N_of_named_taxa = 0;
302 if(line[0]=='\0') break;
303 if(line[0]=='%') break;
304 if(line[0]=='#') break;
305 if(line[0]=='@') break;
306 if(line[0]=='*') break;
307 *Taxa_List[N_of_named_taxa]='\0';
308 sscanf(line,"%d :%49s", &i, Taxa_List[N_of_named_taxa]);
309 if(*Taxa_List[N_of_named_taxa]) {
310 Taxa_Numbers[N_of_named_taxa]=i+add_it;
314 } while ((line = get_line(stdin))!=NULL);
315 if (line!=NULL) free(line);
319 /* -------------------------------------------------------------------------- */
321 PUBLIC void printf_taxa_list(void)
324 if(N_of_named_taxa>0){
325 printf("* List of Taxa: %s\n", file_name);
326 for(i=0;i<N_of_named_taxa;i++)
327 printf("%3d : %s\n",Taxa_Numbers[i],Taxa_List[i]);
328 printf("* End of Taxa List\n");
332 /* -------------------------------------------------------------------------- */
334 PUBLIC char *get_taxon_label(int whoami)
340 if(whoami<0) { /* negative arguments return the identifier of the data set */
341 if(!file_name) return NULL;
342 label = (char *) space(sizeof(char)*(strlen(file_name)+1));
343 strcpy(label,file_name);
346 for(i=0;i<N_of_named_taxa;i++) {
347 if(whoami==Taxa_Numbers[i]) {
348 label = (char *) space(sizeof(char)*(strlen(Taxa_List[i])+1));
349 strcpy(label,Taxa_List[i]);
353 sprintf(tmp,"%d",whoami);
355 label = (char *) space(sizeof(char)*(strlen(tmp)+1));
360 /* -------------------------------------------------------------------------- */
362 PUBLIC float StrEdit_SimpleDist(char *str1, char *str2 )
367 int i, j, length1,length2;
368 float minus, plus, change, temp;
370 length1 = strlen(str1);
371 length2 = strlen(str2);
373 distance = (float **) space((length1 +1)*sizeof(float *));
374 for(i=0; i<= length1; i++)
375 distance[i] = (float *) space( (length2+1)*sizeof(float));
377 for(i = 1; i <= length1; i++)
378 distance[i][0] = distance[i-1][0]+StrEditCost(i,0,str1,str2);
379 for(j = 1; j <= length2; j++)
380 distance[0][j] = distance[0][j-1]+StrEditCost(0,j,str1,str2);
382 for (i = 1; i <= length1; i++) {
383 for (j = 1; j <= length2 ; j++) {
384 minus = distance[i-1][j] + StrEditCost(i,0,str1,str2);
385 plus = distance[i][j-1] + StrEditCost(0,j,str1,str2);
386 change = distance[i-1][j-1]+ StrEditCost(i,j,str1,str2);
388 distance[i][j] = MIN3(minus, plus, change);
391 temp = distance[length1][length2];
392 for(i=0;i<=length1;i++) free(distance[i]);
398 /* -------------------------------------------------------------------------- */
400 PUBLIC float StrEdit_GotohDist(char *str1, char *str2 )
405 int i, j, length1,length2;
408 length1 = strlen(str1);
409 length2 = strlen(str2);
411 D = space((length1+1)*sizeof(float *));
412 for(i=0;i<=length1;i++) D[i] = space((length2+1)*sizeof(float));
413 E = space((length1+1)*sizeof(float *));
414 for(i=0;i<=length1;i++) E[i] = space((length2+1)*sizeof(float));
415 F = space((length1+1)*sizeof(float *));
416 for(i=0;i<=length1;i++) F[i] = space((length2+1)*sizeof(float));
418 D[0][0] = 0.; E[0][0] = 0.; F[0][0] = 0.;
420 for(i=1;i<=length1;i++) {
421 D[i][0] = StrEdit_GotohAlpha + StrEdit_GotohBeta*((float)(i-1));
422 E[i][0] = StrEdit_GotohAlpha + StrEdit_GotohBeta*((float)(i-1));
425 for(j=1;j<=length2;j++) {
426 D[0][j] = StrEdit_GotohAlpha + StrEdit_GotohBeta*((float)(j-1));
428 F[0][j] = StrEdit_GotohAlpha + StrEdit_GotohBeta*((float)(j-1));
430 for(i=1;i<=length1;i++) {
431 for(j=1;j<=length2;j++) {
432 E[i][j] = MIN2( (D[i][j-1]+StrEdit_GotohAlpha),
433 (E[i][j-1]+StrEdit_GotohBeta) );
434 F[i][j] = MIN2( (D[i-1][j]+StrEdit_GotohAlpha),
435 (F[i-1][j]+StrEdit_GotohBeta) );
436 D[i][j] = MIN3( E[i][j], F[i][j],
437 (D[i-1][j-1]+StrEditCost(i,j,str1,str2)) );
440 temp = D[length1][length2];
441 for(i=0;i<=length1;i++) {
442 free(D[i]); free(E[i]); free(F[i]);
444 free(D); free(E); free(F);
449 /* -------------------------------------------------------------------------- */
451 PRIVATE float StrEditCost(int i, int j, char *T1, char *T2)
453 /* positions i,j from [1..length]; i,j=0 implies Gap */
455 if((i==0)&&(j==0)) nrerror("Edit Cost: Aligned gap characters !!!");
456 if(i>0) i1 = decode(T1[i-1]); else i1 = 0;
457 if(j>0) j1 = decode(T2[j-1]); else j1 = 0;
458 if(StrEdit_CostMatrix==NULL) {
459 if(i&&j) return (float)(i1!=j1);
460 else return (float) StrEdit_GapCost;
462 else return (float) StrEdit_CostMatrix[i1][j1];
465 /* -------------------------------------------------------------------------- */
467 PRIVATE int decode(char id)
471 if (!StrEdit_ValidAlphabet) return (int)id;
472 alen = strlen(StrEdit_ValidAlphabet);
473 if(!alen) return (int)id;
475 for(n=0;n<alen;n++) {
476 if(id==StrEdit_ValidAlphabet[n]) return n;
478 fprintf(stderr,"Warning: Invalid character in DECODE -> set to ~gap~\n");
482 /* -------------------------------------------------------------------------- */
484 PUBLIC void Set_StrEdit_CostMatrix(char type)
487 if(StrEdit_ValidAlphabet) {
488 free(StrEdit_ValidAlphabet);
489 StrEdit_ValidAlphabet = NULL;
491 if(StrEdit_CostMatrix) {
492 free(StrEdit_CostMatrix);
493 StrEdit_CostMatrix = NULL;
497 StrEdit_ValidAlphabet = (char*) space((20+2)*sizeof(char));
498 strcpy(StrEdit_ValidAlphabet,StrEdit_DayhoffA);
499 StrEdit_CostMatrix = (float**) space((20+1)*sizeof(float*));
501 StrEdit_CostMatrix[i] = (float*)space((20+1)*sizeof(float));
504 StrEdit_CostMatrix[i][j] =
505 MAX2(StrEdit_DayhoffM[i][i],StrEdit_DayhoffM[j][j])-
506 StrEdit_DayhoffM[i][j];
508 StrEdit_CostMatrix[i][0] = StrEdit_DayhoffM[i][i];
509 StrEdit_CostMatrix[0][i] = StrEdit_DayhoffM[i][i];
511 StrEdit_CostMatrix[0][0] = StrEdit_DayhoffM[0][0];
514 StrEdit_ValidAlphabet = (char*) space((20+2)*sizeof(char));
515 strcpy(StrEdit_ValidAlphabet,StrEdit_GLHA);
516 StrEdit_CostMatrix = (float**) space((20+1)*sizeof(float*));
518 StrEdit_CostMatrix[i] = (float*)space((20+1)*sizeof(float));
521 StrEdit_CostMatrix[i][j] = StrEdit_GLHM[i][j];
522 StrEdit_CostMatrix[i][0] = StrEdit_CostMatrix[0][i] =
525 StrEdit_CostMatrix[0][0] = StrEdit_GLHM[0][0];
528 StrEdit_ValidAlphabet = space((4+2)*sizeof(char));
529 strcpy(StrEdit_ValidAlphabet,StrEdit_BinCodeA);
530 StrEdit_CostMatrix = (float**) space((4+1)*sizeof(float*));
532 StrEdit_CostMatrix[i] = (float*)space((4+1)*sizeof(float));
535 StrEdit_CostMatrix[i][j] = StrEdit_BinCodeM[i][j];
538 StrEdit_ValidAlphabet = space((4+2)*sizeof(char));
539 strcpy(StrEdit_ValidAlphabet,StrEdit_HogewegA);
540 StrEdit_CostMatrix = (float**) space((4+1)*sizeof(float*));
542 StrEdit_CostMatrix[i] = (float*)space((4+1)*sizeof(float));
545 StrEdit_CostMatrix[i][j] = StrEdit_HogewegM[i][j];
546 StrEdit_GotohAlpha = 3.;
547 StrEdit_GotohBeta = 0.;
550 if(!StrEdit_GapCost) StrEdit_GapCost = 1.; /* This is the simple distance */
554 /* -------------------------------------------------------------------------- */
556 PUBLIC void Set_StrEdit_GapCosts(float per_digit, float per_gap)
558 if(per_gap==0.) per_gap = per_digit;
559 if(per_digit<0) nrerror("Gap Costs invalid.");
560 if(per_digit>per_gap) nrerror("Gap Costs invalid.");
562 StrEdit_GapCost = per_digit;
563 StrEdit_GotohAlpha = per_digit; /* Gotoh gap function g(k) = a + b(k-1) */
564 StrEdit_GotohBeta = per_gap;
568 /* -------------------------------------------------------------------------- */