12 #include "svm_utils.h"
14 #include "model_avg.inc" /* defines avg_model_string */
15 #include "model_sd.inc" /* defines sd_model_string */
18 PRIVATE svm_model *avg_model;
19 PRIVATE svm_model *sd_model;
21 PRIVATE void freeFields(char** fields);
22 PRIVATE char** splitFields(char* string);
23 PRIVATE char** splitLines(char* string);
25 PUBLIC float get_z(char *sequence, double energy) {
26 double average_free_energy;
27 double sd_free_energy;
31 short *S = encode_sequence(sequence, 0);
33 int *AUGC = get_seq_composition(S, 1, length);
34 avg_model = svm_load_model_string(avg_model_string);
35 sd_model = svm_load_model_string(sd_model_string);
36 average_free_energy = avg_regression(AUGC[0],AUGC[1],AUGC[2],AUGC[3],AUGC[4], avg_model, &info_avg);
39 double difference = (energy/* /100*/) - average_free_energy;
40 sd_free_energy = sd_regression(AUGC[0], AUGC[1], AUGC[2], AUGC[3], AUGC[4], sd_model);
41 my_z = difference / sd_free_energy;
44 fprintf(stderr,"warning: sequence out of bounds\n");
46 my_z = shuffle_score(sequence, energy);
51 svm_destroy_model(avg_model);
52 svm_destroy_model(sd_model);
56 PUBLIC int *get_seq_composition(short *S, unsigned int start, unsigned int stop){
58 int *ret = (int *)space(sizeof(int) * 6);
60 for (i=MAX2(start, 1); i <= MIN2(stop, S[0]); i++){
61 if(S[i] > 4) ret[0]++;
64 ret[5] = -1; /* indicate last entry */
68 PUBLIC double sd_regression(int N, int A, int C, int G, int T, svm_model *sd_model){
69 double sd_free_energy = 0.0;
70 int length = A + C + G + T + N;
71 double GC_content = (double) (G + C)/length;
72 double AT_ratio = (double) A/(A+T);
73 double CG_ratio = (double) C/(C+G);
74 double norm_length = (double) (length-50)/350.0;
75 struct svm_node node_mono[5];
77 node_mono[0].index = 1; node_mono[0].value = GC_content;
78 node_mono[1].index = 2; node_mono[1].value = AT_ratio;
79 node_mono[2].index = 3; node_mono[2].value = CG_ratio;
80 node_mono[3].index = 4; node_mono[3].value = norm_length;
81 node_mono[4].index =-1;
83 sd_free_energy = svm_predict(sd_model,node_mono);
85 sd_free_energy = (double) sd_free_energy * sqrt(length);
87 return sd_free_energy;
90 PUBLIC double avg_regression(int N, int A, int C, int G, int T, struct svm_model *avg_model, int *info ){
91 double average_free_energy = 0.0;
93 int length = A + C + G + T + N;
94 double N_fraction = (double) N/length;
95 double GC_content = (double) (G + C)/length;
96 double AT_ratio = (double) A/(A+T);
97 double CG_ratio = (double) C/(C+G);
99 double norm_length = (double) (length-50)/350.0;
101 struct svm_node node_mono[5];
103 if ( length < 50 || length > 400 ) {
107 if ( N_fraction > 0.05 ) {
111 if ( GC_content < 0.20 || GC_content > 0.80 ) {
115 if ( AT_ratio < 0.20 || AT_ratio > 0.80 ) {
119 if ( CG_ratio < 0.20 || CG_ratio > 0.80 ) {
124 node_mono[0].index = 1; node_mono[0].value = GC_content;
125 node_mono[1].index = 2; node_mono[1].value = AT_ratio;
126 node_mono[2].index = 3; node_mono[2].value = CG_ratio;
127 node_mono[3].index = 4; node_mono[3].value = norm_length;
128 node_mono[4].index =-1;
130 average_free_energy = svm_predict(avg_model,node_mono);
132 average_free_energy = (double) average_free_energy * length;
134 return average_free_energy;
137 PUBLIC double minimal_sd(int N, int A, int C, int G, int T ){
138 int length = A + C + G + T + N;
139 if ( length < 60 ) return 0.450324;
140 if ( length < 70 ) return 0.749771;
141 if ( length < 80 ) return 1.029421;
142 if ( length < 90 ) return 1.027517;
143 if ( length < 100 ) return 1.347283;
144 if ( length < 120 ) return 1.112086;
145 if ( length < 150 ) return 1.574339;
146 if ( length < 170 ) return 1.779043;
147 if ( length < 200 ) return 1.922908;
148 if ( length < 250 ) return 2.226856;
149 if ( length < 300 ) return 2.349300;
150 if ( length < 350 ) return 2.589703;
151 if ( length < 400 ) return 2.791215;
156 PUBLIC svm_model *svm_load_model_string(char *modelString){
158 /* redefinition from svm.cpp */
159 char *svm_type_table[]={"c_svc","nu_svc","one_class","epsilon_svr","nu_svr",NULL};
160 char *kernel_type_table[]={"linear","polynomial","rbf","sigmoid",NULL};
162 struct svm_model *model;
163 char **lines, **fields;
165 char *key, *value, *field;
167 int dataStart, elements;
169 struct svm_node *x_space=NULL;
171 model = (struct svm_model*)space(sizeof(struct svm_model));
180 /* Read header until support vectors start */
181 lines=splitLines(modelString);
183 while (strcmp(lines[i],"SV")!=0){
184 fields=splitFields(lines[i]);
188 if(strcmp(key,"svm_type")==0){
190 for(j=0;svm_type_table[j];j++){
191 if(strcmp(svm_type_table[j],value)==0){
192 model->param.svm_type=j;
196 if(svm_type_table[i] == NULL){
197 fprintf(stderr,"unknown svm type.\n");
206 if(strcmp(key,"kernel_type")==0){
208 for(j=0;kernel_type_table[j];j++){
209 if(strcmp(kernel_type_table[j],value)==0){
210 model->param.kernel_type=j;
214 if(kernel_type_table[i] == NULL){
215 fprintf(stderr,"unknown kernel type.\n");
224 if (strcmp(key,"gamma")==0){
226 sscanf(value,"%lf",&model->param.gamma);
229 if (strcmp(key,"degree")==0){
231 sscanf(value,"%d",&model->param.degree);
234 if (strcmp(key,"coef0")==0){
236 sscanf(value,"%lf",&model->param.coef0);
238 if (strcmp(key,"nr_class")==0){
240 sscanf(value,"%d",&model->nr_class);
242 if (strcmp(key,"total_sv")==0){
244 sscanf(value,"%d",&model->l);
247 if (strcmp(key,"rho")==0){
248 int n = model->nr_class * (model->nr_class-1)/2;
249 model->rho = (double*)space(sizeof(double)*n);
251 sscanf(fields[j+1],"%lf",&model->rho[j]);
255 if (strcmp(key,"nr_sv")==0){
256 int n = model->nr_class;
257 model->nSV = (int*)space(sizeof(int)*n);
259 sscanf(fields[j+1],"%d",&model->nSV[j]);
263 if (strcmp(key,"label")==0){
264 int n = model->nr_class;
265 model->label = (int*)space(sizeof(int)*n);
267 sscanf(fields[j+1],"%d",&model->label[j]);
271 if (strcmp(key,"probA")==0){
272 int n = model->nr_class * (model->nr_class-1)/2;
273 model->probA = (double*)space(sizeof(double)*n);
275 sscanf(fields[j+1],"%lf",&model->probA[j]);
279 if (strcmp(key,"probB")==0){
280 int n = model->nr_class * (model->nr_class-1)/2;
281 model->probB = (double*)space(sizeof(double)*n);
283 sscanf(fields[j+1],"%lf",&model->probB[j]);
293 /* Count number of nodes (by counting colons) in advance to allocate
294 memory in one block */
295 while (lines[i]!=NULL){
297 while ((c=lines[i][j])!='\0'){
307 /* allocate memory for SVs and coefficients */
308 m = model->nr_class - 1;
310 model->sv_coef = (double**)space(sizeof(double*)*m);
312 model->sv_coef[i] = (double*)space(sizeof(double)*l);
314 model->SV = (struct svm_node**)space(sizeof(struct svm_node*)*l);
317 x_space = (struct svm_node*)space(sizeof(struct svm_node)*(elements));
321 /* parse support vector data */
324 fields=splitFields(lines[dataStart+i]);
325 model->SV[i] = &x_space[j];
327 while ((field=fields[k])!=NULL){
329 sscanf(fields[k],"%lf",&model->sv_coef[k][i]);
331 sscanf(fields[k],"%d:%lf",&(x_space[j].index),&(x_space[j].value));
336 x_space[j++].index = -1;
346 PRIVATE char **splitFields(char* string){
356 if (strlen(string)==0 || string==NULL){
360 /* First find all characters which are whitespaces and store the
361 positions in the array seps */
363 seps=(int *)space(sizeof(int));
367 while ((c=string[i])!='\0' && (c!='\n')){
369 seps=(int*)xrealloc(seps,sizeof(int)*(nSep+1));
375 seps=(int*)xrealloc(seps,sizeof(int)*(nSep+1));
376 seps[nSep]=strlen(string);
379 /* Then go through all intervals in between of two whitespaces (or
380 end or start of string) and store the fields in the array
381 "output"; if there are two adjacent whitespaces this is ignored
382 resulting in a behaviour like "split /\s+/" in perl */
384 for (i=0;i<nSep;i++){
388 int length=(stop-start);
392 currField=(char *)space(sizeof(char)*(length+1));
393 strncpy(currField,string+start+1,length-1);
394 currField[length]='\0';
396 /* check if field is not only whitespace */
399 while ((c=currField[j])!='\0'){
407 output=(char**)xrealloc(output,sizeof(char**)*(nField+1));
408 output[nField++]=currField;
415 /* printf("%s|\n",output[nField-1]); */
423 output=(char**)xrealloc(output,sizeof(char**)*(nField+1));
431 PRIVATE char **splitLines(char* string){
440 while ((c=string[i])!='\0'){
443 output=(char**)xrealloc(output,sizeof(char**)*(lineN+1));
444 currLine=(char*)xrealloc(currLine,sizeof(char)*(currLength+1));
445 currLine[currLength]='\0';
446 output[lineN]=currLine;
452 currLine=(char*)xrealloc(currLine,sizeof(char)*(currLength+1));
453 currLine[currLength]=c;
459 output=(char**)xrealloc(output,sizeof(char**)*(lineN+1));
466 /* for both splitLines and splitFields */
467 void freeFields(char** fields){
470 while (fields[i]!=NULL){