1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhhmm-C.h 316 2016-12-16 16:14:39Z fabian $
26 #include <iostream> // cin, cout, cerr
27 #include <fstream> // ofstream, ifstream
28 #include <stdio.h> // printf
35 #include <stdlib.h> // exit
36 #include <string> // strcmp, strstr
37 #include <math.h> // sqrt, pow
38 #include <limits.h> // INT_MIN
39 #include <float.h> // FLT_MIN
40 #include <time.h> // clock
41 #include <ctype.h> // islower, isdigit etc
42 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
43 #include "list.h" // list data structure
44 #include "hash.h" // hash data structure
46 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
48 //#include "new_new.h" /* memory tracking */
52 // #include "wnconj.h" // Will Naylor's wnlib for optimization in C
55 //////////////////////////////////////////////////////////////////////////////
57 //////////////////////////////////////////////////////////////////////////////
59 //////////////////////////////////////////////////////////////////////////////
61 //////////////////////////////////////////////////////////////////////////////
62 HMM::HMM(int maxseqdis, int maxres)
64 sname = new char*[maxseqdis](); // names of stored sequences
65 for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;}
66 seq = new char*[maxseqdis](); // residues of stored sequences (first at pos 1!)
67 for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;}
68 Neff_M = new float[maxres](); // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
69 Neff_I = new float[maxres](); // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
70 Neff_D = new float[maxres](); // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
71 longname = new char[DESCLEN](); // Full name of first sequence of original alignment (NAME field)
72 ss_dssp = new char[maxres](); // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B
73 sa_dssp = new char[maxres](); // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed)
74 ss_pred = new char[maxres](); // predicted secondary structure 0:- 1:H 2:E 3:C
75 ss_conf = new char[maxres](); // confidence value of prediction 0:- 1:0 ... 10:9
76 Xcons = NULL; // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
77 l = new int[maxres](); // l[i] = pos. of j'th match state in aligment
78 /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */
79 f = new float*[maxres+1](); f[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
80 g = new float*[maxres+1](); g[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
81 p = new float*[maxres+1](); p[maxres] = NULL; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
82 tr = new float*[maxres+1](); tr[maxres] = NULL; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
83 // tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
84 for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3])();}
85 for (int i=0; i<maxres; i++) {g[i]=new(float[NAA])();}
86 for (int i=0; i<maxres; i++) {p[i]=new(float[NAA])();}
87 for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS])();}
88 // for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]);
91 n_display=N_in=N_filtered=0;
92 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
93 // lamda_hash.New(37,0.0); // Set size and NULL element for hash
94 // mu_hash.New(37,0.0); // Set size and NULL element for hash
96 name[0]=longname[0]=fam[0]='\0';
97 trans_lin=0; // transition probs in log space
101 //////////////////////////////////////////////////////////////////////////////
103 //////////////////////////////////////////////////////////////////////////////
106 //Delete name and seq matrices
108 for (int k=0; (k < n_display) && (NULL != sname[k]); k++){
109 delete[] sname[k]; sname[k] = NULL;
111 delete[] sname; sname = NULL;
114 for (int k=0; (k < n_display) && (NULL != seq[k]); k++){
115 delete[] seq[k]; seq[k] = NULL;
117 delete[] seq; seq = NULL;
119 delete[] Neff_M; Neff_M = NULL;
120 delete[] Neff_D; Neff_D = NULL;
121 delete[] Neff_I; Neff_I = NULL;
122 delete[] longname; longname = NULL;
123 delete[] ss_dssp; ss_dssp = NULL;
124 delete[] sa_dssp; sa_dssp = NULL;
125 delete[] ss_pred; ss_pred = NULL;
126 delete[] ss_conf; ss_conf = NULL;
127 delete[] Xcons; Xcons = NULL;
128 delete[] l; l = NULL;
129 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
131 delete[] f[i]; f[i] = NULL;
135 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
137 delete[] g[i]; g[i] = NULL;
141 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
143 delete[] p[i]; p[i] = NULL;
147 for (int i=0; i</*MAXRES*/par.maxResLen; i++){
149 delete[] tr[i]; tr[i] = NULL;
153 // for (int i=0; i</*MAXRES*/par.maxResLen; i++) if (tr_lin[i]) delete[] tr_lin[i]; else break;
154 delete[] f; f = NULL;
155 delete[] g; g = NULL;
156 delete[] p; p = NULL;
157 delete[] tr; tr = NULL;
161 //////////////////////////////////////////////////////////////////////////////
162 // Deep-copy constructor
163 //////////////////////////////////////////////////////////////////////////////
164 HMM& HMM::operator=(HMM& q)
167 for (int i=0; i<=L+1; ++i)
169 for (int a=0; a<NAA; ++a)
175 for (int a=0; a<NTRANS; ++a)
177 ss_dssp[i]=q.ss_dssp[i];
178 sa_dssp[i]=q.sa_dssp[i];
179 ss_pred[i]=q.ss_pred[i];
180 ss_conf[i]=q.ss_conf[i];
184 for (int i=0; i<=L+1; ++i)
185 Xcons[i] =q.Xcons[i];
187 n_display=q.n_display;
188 for (int k=0; k<n_display; k++) {
189 sname[k]=new char[strlen(q.sname[k])+1];
190 if (!sname[k]) MemoryError("array of names for sequences to display");
191 strcpy(sname[k],q.sname[k]);
193 for (int k=0; k<n_display; k++) {
194 seq[k]=new char[strlen(q.seq[k])+1];
195 if (!seq[k]) MemoryError("array of names for sequences to display");
196 strcpy(seq[k],q.seq[k]);
205 for (int i=0; i<=L+1; ++i) Neff_M[i]=q.Neff_M[i];
206 for (int i=0; i<=L+1; ++i) Neff_I[i]=q.Neff_I[i];
207 for (int i=0; i<=L+1; ++i) Neff_D[i]=q.Neff_D[i];
210 strcpy(longname,q.longname);
221 for (int a=0; a<NAA; ++a) pav[a]=q.pav[a];
223 N_filtered=q.N_filtered;
224 trans_lin=q.trans_lin;
225 return (HMM&) (*this);
229 ///////////////////////////////////////////////////////////////////////////////
231 * @brief Read an HMM from an HHsearch .hhm file; return 0 at end of file
234 HMM::Read(FILE* dbf, char* path)
236 char line[LINELEN]=""; // input line
237 char str3[8]="",str4[8]=""; // first 3 and 4 letters of input line
238 char* ptr; // pointer for string manipulation
239 int i=0; // index for match state (first=1)
240 int a; // amino acid index
246 n_display=N_in=N_filtered=0;
247 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
249 trans_lin=0; // transition probs in log space
250 name[0]=longname[0]=fam[0]='\0';
251 //If at the end of while-loop L is still 0 then we have reached end of db file
253 //Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
255 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
258 if (strscn(line)==NULL) continue; // skip lines that contain only white space
259 substr(str3,line,0,2); // copy the first three characters into str3
260 substr(str4,line,0,3); // copy the first four characters into str4
262 if (!strncmp("HH",line,2)) continue;
264 if (!strcmp("NAME",str4))
266 ptr=strscn(line+4); //advance to first non-white-space character
269 strncpy(longname,ptr,DESCLEN-1); //copy full name to longname
270 longname[DESCLEN-1]='\0';
271 strncpy(name,ptr,NAMELEN-1); //copy longname to name...
272 strcut(name); //...cut after first word...
276 strcpy(longname,"undefined");
277 strcpy(name,"undefined");
279 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
282 else if (!strcmp("FAM",str3))
284 ptr=strscn(line+3); //advance to first non-white-space character
285 if (ptr) strncpy(fam,ptr,IDLEN-1); else strcpy(fam,""); //copy family name to basename
286 ScopID(cl,fold,sfam,fam); //get scop classification from basename (e.g. a.1.2.3.4)
289 else if (!strcmp("FILE",str4))
291 if (path) strncpy(file,path,NAMELEN-1); else *file='\0'; // copy path to file variable
292 ptr=strscn(line+4); //advance to first non-white-space character
294 strncat(file,ptr,NAMELEN-1-strlen(file)); // append file name read from file to path
295 else strcat(file,"*");
298 else if (!strcmp("LENG",str4))
301 L=strint(ptr); //read next integer (number of match states)
303 else if (!strcmp("FILT",str4) || !strcmp("NSEQ",str4))
306 N_filtered=strint(ptr); //read next integer: number of sequences after filtering
307 N_in=strint(ptr); //read next integer: number of sequences in alignment
310 else if (!strcmp("NEFF",str4) || !strcmp("NAA",str3)) sscanf(line+6,"%f",&Neff_HMM);
312 else if (!strcmp("EVD",str3))
315 sscanf(line+6,"%f %f",&lamda,&mu);
316 // sscanf(line+22,"%s",key);
317 // lamda_hash.Add(key,lamda);
318 // mu_hash.Add(key,mu);
321 else if (!strcmp("DESC",str4)) continue;
322 else if (!strcmp("COM",str3)) continue;
323 else if (!strcmp("DATE",str4)) continue;
325 /////////////////////////////////////////////////////////////////////////////////////
326 // Read template sequences that should get displayed in output alignments
327 else if (!strcmp("SEQ",str3))
329 //char cur_seq[MAXCOL]=""; //Sequence currently read in
330 char *cur_seq = new char[par.maxColCnt]; //Sequence currently read in
331 int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n
332 int h; // index for character in input line
333 int l=1; // index of character in sequence seq[k]
334 int i=1; // index of match states in ss_dssp[i] and ss_pred[i] sequence
335 int n_seq=0; // number of sequences to be displayed EXCLUDING ss sequences
336 cur_seq[0]='-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
338 while (fgetline(line,LINELEN-1,dbf) && line[0]!='#')
340 if (v>=4) cout<<"Read from file:"<<line<<"\n"; //DEBUG
341 if (line[0]=='>') //line contains sequence name
343 if (k>=MAXSEQDIS-1) //maximum number of allowable sequences exceeded
344 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); break;}
346 if (!strncmp(line,">ss_dssp",8)) nss_dssp=k;
347 else if (!strncmp(line,">sa_dssp",8)) nsa_dssp=k;
348 else if (!strncmp(line,">ss_pred",8)) nss_pred=k;
349 else if (!strncmp(line,">ss_conf",8)) nss_conf=k;
350 else if (!strncmp(line,">Cons-",6) || !strncmp(line,">Consensus",10)) ncons=k;
353 if (nfirst==-1) nfirst=k;
354 if (n_seq>=par.nseqdis)
355 {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;}
359 //If this is not the first sequence then store residues of previous sequence
361 seq[k-1]=new char[strlen(cur_seq)+1];
362 if (!seq[k-1]) MemoryError("array of sequences to display");
363 strcpy(seq[k-1],cur_seq);
366 // store sequence name
367 strcut(line+1); //find next white-space character and overwrite it with end-of-string character
368 sname[k] = new char[strlen(line+1)+1]; //+1 for terminating '\0'
369 if (!sname[k]) MemoryError("array of names for sequences to display");
370 strcpy(sname[k],line+1); //store sequence name in **name
373 else //line contains sequence residues
377 cerr<<endl<<"WARNING: Ignoring following line while reading HMM"<<name<<":\n\'"<<line<<"\'\n";
381 h=0; //counts characters in current line
383 // Check whether all characters are correct; store into cur_seq
384 if (k==nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
386 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
388 if (ss2i(line[h])>=0 && line[h]!='.')
390 char c=ss2ss(line[h]);
392 if (c!='.' && !(c>='a' && c<='z')) ss_dssp[i++]=ss2i(c);
395 else if (v && ss2i(line[h])==-2)
396 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
400 if (k==nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E)
402 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
404 if (sa2i(line[h])>=0)
408 if (c!='.' && !(c>='a' && c<='z')) sa_dssp[i++]=sa2i(c);
411 else if (v && sa2i(line[h])==-2)
412 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
416 else if (k==nss_pred) // lines with predicted secondary structure (. - H E C)
418 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
420 if (ss2i(line[h])>=0 && ss2i(line[h])<=3 && line[h]!='.')
422 char c=ss2ss(line[h]);
424 if (c!='.' && !(c>='a' && c<='z')) ss_pred[i++]=ss2i(c);
427 else if (v && ss2i(line[h])==-2)
428 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
432 else if (k==nss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
434 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
436 if (line[h]=='-' || (line[h]>='0' && line[h]<='9'))
439 ss_conf[l]=cf2i(line[h]);
442 else if (v && cf2i(line[h])==-2)
443 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
447 else // normal line containing residues
449 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
451 if (aa2i(line[h])>=0 && line[h]!='.') // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1)
452 {cur_seq[l]=line[h]; l++;}
453 else if (aa2i(line[h])==-2 && v)
454 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line '"<<line<<"' of HMM "<<name<<"\n";
458 cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
462 //If this is not the first sequence some residues have already been read in
464 seq[k]=new char[strlen(cur_seq)+1];
465 if (!seq[k]) MemoryError("array of sequences to display");
466 strcpy(seq[k],cur_seq);
473 printf("nss_dssp=%i nsa_dssp=%i nss_pred=%i nss_conf=%i nfirst=%i\n",nss_dssp,nsa_dssp,nss_pred,nss_conf,nfirst);
474 for (k=0; k<n_display; k++)
477 cout<<">"<<sname[k]<<"(k="<<k<<")\n";
478 if (k==nss_dssp) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_dssp[j]));}
479 else if (k==nsa_dssp) {for (j=1; j<=L; j++) cout<<char(i2sa(sa_dssp[j]));}
480 else if (k==nss_pred) {for (j=1; j<=L; j++) cout<<char(i2ss(ss_pred[j]));}
481 else if (k==nss_conf) {for (j=1; j<=L; j++) cout<<int(ss_conf[j]-1);}
482 else {for (j=1; j<=L; j++) cout<<seq[k][j];}
489 /////////////////////////////////////////////////////////////////////////////////////
490 // Read average amino acid frequencies for HMM
491 else if (!strcmp("FREQ",str4))
493 fprintf(stderr,"Error: hhm file has obsolete format.\n");
494 fprintf(stderr,"Please use hhmake version > 1.1 to generate hhm files.\n");
498 else if (!strcmp("AVER",str4)) {} // AVER line scrapped
499 else if (!strcmp("NULL",str4))
502 for (a=0; a<20 && ptr; ++a)
503 //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
504 pb[s2a[a]] = (float) fpow2(float(-strinta(ptr))/HMMSCALE);
505 if (!ptr) return Warning(dbf,line,name);
509 for (a=0; a<20; ++a) printf("%5.1f ",100.*pb[s2a[a]]);
514 /////////////////////////////////////////////////////////////////////////////////////
515 // Read transition probabilities from start state
516 else if (!strcmp("HMM",str3))
518 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
519 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
521 for (a=0; a<=D2D && ptr; ++a)
522 tr[0][a] = float(-strinta(ptr))/HMMSCALE; //store transition probabilites as log2 values
523 // strinta returns next integer in string and puts ptr to first char
524 // after the integer. Returns -99999 if '*' is found.
525 // ptr is set to 0 if no integer is found after ptr.
526 Neff_M[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition
527 Neff_I[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition
528 Neff_D[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition
529 if (!ptr) return Warning(dbf,line,name);
531 /////////////////////////////////////////////////////////////////////////////////////
532 // Read columns of HMM
533 int next_i=0; // index of next column
534 while (fgetline(line,LINELEN-2,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
536 if (strscn(line)==NULL) continue; // skip lines that contain only white space
538 // Read in AA probabilities
541 next_i = strint(ptr); ++i;
542 if (v && next_i!=prev_i+1)
545 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
546 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
550 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<". Skipping HMM\n";
553 if (i>=/*MAXRES*/par.maxResLen-2)
555 fgetline(line,LINELEN-1,dbf); // Skip line
559 for (a=0; a<20 && ptr; ++a)
560 // f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE);
561 f[i][s2a[a]] = fpow2(float(-strinta(ptr))/HMMSCALE); // speed-up ~5 s for 10000 SCOP domains
563 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
565 if (!ptr) return Warning(dbf,line,name);
570 for (a=0; a<20; ++a) printf("%5.1f ",100*f[i][s2a[a]]);
575 // Read transition probabilities
576 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
577 if (line[0]!=' ' && line[0]!='\t') return Warning(dbf,line,name);
579 for (a=0; a<=D2D && ptr; ++a)
580 tr[i][a] = float(-strinta(ptr))/HMMSCALE; //store transition prob's as log2-values
581 Neff_M[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition
582 Neff_I[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition
583 Neff_D[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition
584 if (!ptr) return Warning(dbf,line,name);
588 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
589 printf("%5.1f %5.1f %5.1f \n",Neff_M[i],Neff_I[i],Neff_D[i]);
592 if (line[0]=='/' && line[1]=='/') break;
594 else if (v) cerr<<endl<<"WARNING: Ignoring line\n\'"<<line<<"\'\nin HMM "<<name<<"\n";
598 if (L==0) return 0; //End of db file -> stop reading in
600 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
601 // lamda = lamda_hash.Show(par.Key());
602 // mu = mu_hash.Show(par.Key());
603 if (lamda && v>=3) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
605 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
606 if (v && i>/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
607 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
608 if (v>=2) cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
611 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
612 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
614 Neff_I[L+1]=Neff_D[L+1]=0.0f;
616 return 1; //return status: ok
618 } /* this is the end of HMM::Read() */
621 /////////////////////////////////////////////////////////////////////////////////////
623 * @brief Read an HMM from a HMMer .hmm file; return 0 at end of file
626 HMM::ReadHMMer(FILE* dbf, char* filestr)
628 char line[LINELEN]=""; // input line
629 char desc[DESCLEN]=""; // description of family
630 char str4[5]=""; // first 4 letters of input line
631 char* ptr; // pointer for string manipulation
632 int i=0; // index for match state (first=1)
633 int a; // amino acid index
634 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
635 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
636 int k=0; // index for seq[k]
637 static char ignore_hmmer_cal = 0;
638 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
639 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
640 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
642 int iAlpha = 20; /* size of alphabet, default is protein = 20 */
643 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
648 n_display=N_in=N_filtered=0;
649 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
651 trans_lin=0; // transition probs in log space
652 name[0]=longname[0]=desc[0]=fam[0]='\0';
653 //If at the end of while-loop L is still 0 then we have reached end of db file
655 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
657 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
660 if (strscn(line)==NULL) continue; // skip lines that contain only white space
661 if (!strncmp("HMMER",line,5)) continue;
663 substr(str4,line,0,3); // copy the first four characters into str4
665 if (!strcmp("NAME",str4) && name[0]=='\0')
667 ptr=strscn(line+4); // advance to first non-white-space character
668 strncpy(name,ptr,NAMELEN-1); // copy full name to name
669 strcut(name); // ...cut after first word...
670 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
673 else if (!strcmp("ACC ",str4))
675 ptr=strscn(line+4); // advance to first non-white-space character
676 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
679 else if (!strcmp("DESC",str4))
681 ptr=strscn(line+4); // advance to first non-white-space character
684 strncpy(desc,ptr,DESCLEN-1); // copy description to name...
685 desc[DESCLEN-1]='\0';
686 strcut(ptr); // ...cut after first word...
688 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
691 else if (!strcmp("LENG",str4))
694 L=strint(ptr); //read next integer (number of match states)
697 else if (!strcmp("ALPH",str4)) {
701 if (0 == strcmp(ptr, "Amino")){
704 else if (0 == strcmp(ptr, "Nucleic")){
706 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
707 __FUNCTION__, __FILE__, __LINE__);
710 return Warning(dbf,line,name);
712 dAlphaInv = 1.00 / (double)(iAlpha);
715 else if (!strcmp("RF ",str4)) continue;
716 else if (!strcmp("CS ",str4)) continue;
717 else if (!strcmp("MAP ",str4)) continue;
718 else if (!strcmp("COM ",str4)) continue;
719 else if (!strcmp("NSEQ",str4))
722 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering
725 else if (!strcmp("DATE",str4)) continue;
726 else if (!strncmp("CKSUM ",line,5)) continue;
727 else if (!strcmp("GA ",str4)) continue;
728 else if (!strcmp("TC ",str4)) continue;
729 else if (!strcmp("NC ",str4)) continue;
731 else if (!strncmp("SADSS",line,5))
736 seq[nsa_dssp] = new char[/*MAXRES*/par.maxResLen+2];
737 sname[nsa_dssp] = new(char[15]);
738 strcpy(seq[nsa_dssp]," ");
739 strcpy(sname[nsa_dssp],"sa_dssp");
746 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
747 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
748 else strcat(seq[nsa_dssp],ptr);
752 else if (!strncmp("SSPRD",line,5))
757 seq[nss_pred] = new char[/*MAXRES*/par.maxResLen+2];
758 sname[nss_pred] = new(char[15]);
759 strcpy(seq[nss_pred]," ");
760 strcpy(sname[nss_pred],"ss_pred");
767 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
768 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
769 else strcat(seq[nss_pred],ptr);
773 else if (!strncmp("SSCON",line,5))
778 seq[nss_conf] = new char[/*MAXRES*/par.maxResLen+2];
779 sname[nss_conf] = new(char[15]);
780 strcpy(seq[nss_conf]," ");
781 strcpy(sname[nss_conf],"ss_conf");
787 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
788 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
789 else strcat(seq[nss_conf],ptr);
793 else if (!strncmp("SSCIT",line,5)) continue;
794 else if (!strcmp("XT ",str4)) continue;
795 else if (!strcmp("NULT",str4)) continue;
797 else if (!strcmp("NULE",str4))
800 for (a=0; (a < iAlpha) && ptr; ++a){
801 /* FIXME: FS introduced alphabet size (was '20')
802 and dAlphaInv (was '0.05' = 1/20) */
803 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
804 pb[s2a[a]] = (float) dAlphaInv * fpow2(float(strinta(ptr,-99999))/HMMSCALE); /* dAlphaInv */
806 for (a = iAlpha; a < 20; a++){
809 if (!ptr) return Warning(dbf,line,name);
813 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
814 printf("%5.1f ",100.*pb[s2a[a]]);
820 else if (!strcmp("EVD ",str4))
824 sscanf(ptr,"%f",&lamda);
826 sscanf(ptr,"%f",&mu);
829 if (v>=2 && ignore_hmmer_cal==0)
830 cerr<<endl<<"Warning: some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n";
836 /////////////////////////////////////////////////////////////////////////////////////
837 // Read transition probabilities from start state
838 else if (!strncmp("HMM",line,3))
840 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
841 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
843 for (a=0; a<=M2D && ptr; ++a)
844 tr[0][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition probabilites as log2 values
845 // strinta returns next integer in string and puts ptr to first char
846 // after the integer. Returns -99999 if '*' is found.
847 // ptr is set to 0 if no integer is found after ptr.
848 tr[0][I2M] = tr[0][D2M] = 0.0;
849 tr[0][I2I] = tr[0][D2D] = -99999.0;
850 if (!ptr) return Warning(dbf,line,name);
854 for (a=0; a<=D2D && ptr; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
858 // Prepare to store DSSP states (if there are none, delete afterwards)
860 seq[nss_dssp] = new char[/*MAXRES*/par.maxResLen+2];
861 sname[nss_dssp] = new(char[15]);
862 strcpy(sname[nss_dssp],"ss_dssp");
864 /////////////////////////////////////////////////////////////////////////////////////
865 // Read columns of HMM
866 int next_i=0; // index of next column
867 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
869 if (strscn(line)==NULL) continue; // skip lines that contain only white space
871 // Read in AA probabilities
874 next_i = strint(ptr); ++i;
875 if (v && next_i!=prev_i+1)
878 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
879 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
883 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
887 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
888 if (i>=/*MAXRES*/par.maxResLen-2)
890 fgetline(line,LINELEN-1,dbf); // Skip two lines
891 fgetline(line,LINELEN-1,dbf);
895 for (a=0; (a<iAlpha) && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
896 f[i][s2a[a]] = (float) pb[s2a[a]]*fpow2(float(strinta(ptr,-99999))/HMMSCALE);
897 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
899 for (a = iAlpha; a < 20; a++){
902 if (!ptr) return Warning(dbf,line,name);
906 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
907 printf("%5.1f ",100*f[i][s2a[a]]);
912 // Read insert emission line
913 fgetline(line,LINELEN-1,dbf);
915 if (!ptr) return Warning(dbf,line,name);
916 annotchr[i]=uprchr(*ptr);
917 if (*ptr!='-' && *ptr!=' ') annot=1;
919 // Read annotation character and seven transition probabilities
920 fgetline(line,LINELEN-1,dbf);
926 seq[nss_dssp][i]=*ptr;
931 seq[nss_dssp][i]=*ptr;
936 seq[nss_dssp][i]=*ptr;
941 seq[nss_dssp][i]=*ptr;
946 seq[nss_dssp][i]=*ptr;
951 seq[nss_dssp][i]=*ptr;
956 seq[nss_dssp][i]=*ptr;
963 seq[nss_dssp][i]=*ptr;
968 seq[nss_dssp][i]=*ptr;
974 for (a=0; a<=D2D && ptr; ++a)
975 tr[i][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition prob's as log2-values
976 if (!ptr) return Warning(dbf,line,name);
980 for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a]));
985 if (line[0]=='/' && line[1]=='/') break;
987 } /* strncmp("HMM") */
991 if (L==0) return 0; //End of db file -> stop reading in
993 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
994 // lamda = lamda_hash.Show(par.Key());
995 // mu = mu_hash.Show(par.Key());
996 if (lamda && v>=2) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu);
998 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
999 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
1000 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
1003 if (strlen(longname)>0) strcat(longname," ");
1004 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC
1005 if (strlen(name)>0) strcat(longname," ");
1006 strncat(longname,desc,DESCLEN-strlen(longname)-1);
1007 longname[DESCLEN-1]='\0';
1008 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
1009 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
1011 // Secondary structure
1014 // remove dssp sequence
1015 // memory that had been allocated in case ss_dssp was given needs to be freed
1016 delete[] seq[nss_dssp]; seq[nss_dssp] = NULL;
1017 // memory that had been allocated in case ss_dssp was given needs to be freed
1018 delete[] sname[nss_dssp]; sname[nss_dssp] = NULL;
1024 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
1026 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
1028 for (i=1; i<=L; ++i) ss_conf[i] = 5;
1031 // Copy query (first sequence) and consensus residues?
1034 sname[k]=new(char[10]);
1035 strcpy(sname[k],"Consensus");
1036 sname[k+1]=new char[strlen(longname)+1];
1037 strcpy(sname[k+1],longname);
1038 seq[k]=new char[L+2];
1041 seq[k+1]=new char[L+2];
1044 for (i=1; i<=L; ++i)
1048 for (a=0; a<NAA; ++a)
1049 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1050 if (pmax>0.6) seq[k][i]=i2aa(amax);
1051 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
1053 seq[k+1][i]=i2aa(amax);
1055 ncons=k++; // nfirst is set later!
1059 sname[k]=new char[strlen(longname)+1];
1060 /* FIXME valgrind says bytes get lost here during hmm iteration --
1061 fixed in HMM::ClobberGlobal(), I (FS) think */
1062 strcpy(sname[k],longname);
1063 seq[k]=new char[L+2];
1068 if (annot) // read in some annotation characters?
1072 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
1074 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1076 /* FIXME: FS set ncons=k
1077 don't understand why it is not set but seem to need it */
1079 for (i=1; i<=L; ++i)
1083 for (a=0; a<NAA; ++a)
1084 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1085 seq[k][i]=i2aa(amax);
1088 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
1093 // Calculate overall Neff_HMM
1095 for (i=1; i<=L; ++i)
1098 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
1099 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
1101 Neff_HMM+=(float) fpow2(S);
1104 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1106 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
1108 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1109 for (a=0; a<iAlpha; ++a) { /* FIXME: FS introduced iAlpha, was '20' */
1110 f[0][a]=f[L+1][a]=pb[a];
1112 delete[] annotchr; annotchr = NULL;
1114 return 1; //return status: ok
1116 } /* this is the end of HMM::ReadHMMer() */
1120 /////////////////////////////////////////////////////////////////////////////////////
1122 * @brief Read an HMM from a HMMER3 .hmm file; return 0 at end of file
1125 HMM::ReadHMMer3(FILE* dbf, char* filestr)
1127 char line[LINELEN]=""; // input line
1128 char desc[DESCLEN]=""; // description of family
1129 char str4[5]=""; // first 4 letters of input line
1130 char* ptr; // pointer for string manipulation
1131 int i=0; // index for match state (first=1)
1132 int a; // amino acid index
1133 char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
1134 char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
1135 int k=0; // index for seq[k]
1136 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1137 //annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1138 annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1140 int iAlpha = 20; /* size of alphabet, default is protein = 20 */
1141 double dAlphaInv = 1.00 / (double)(iAlpha); /* weight of AA */
1146 n_seqs=n_display=N_in=N_filtered=0;
1147 nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
1149 trans_lin=0; // transition probs in log space
1150 name[0]=longname[0]=desc[0]=fam[0]='\0';
1151 //If at the end of while-loop L is still 0 then we have reached end of db file
1153 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
1156 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/'))
1159 if (strscn(line)==NULL) continue; // skip lines that contain only white space
1160 if (!strncmp("HMMER",line,5)) continue;
1162 substr(str4,line,0,3); // copy the first four characters into str4
1164 if (!strcmp("NAME",str4) && name[0]=='\0')
1166 ptr=strscn(line+4); // advance to first non-white-space character
1167 strncpy(name,ptr,NAMELEN-1); // copy full name to name
1168 strcut(name); // ...cut after first word...
1169 if (v>=4) cout<<"Reading in HMM "<<name<<":\n";
1172 else if (!strcmp("ACC ",str4))
1174 ptr=strscn(line+4); // advance to first non-white-space character
1175 strncpy(longname,ptr,DESCLEN-1); // copy Accession id to longname...
1178 else if (!strcmp("DESC",str4))
1180 ptr=strscn(line+4); // advance to first non-white-space character
1183 strncpy(desc,ptr,DESCLEN-1); // copy description to name...
1184 desc[DESCLEN-1]='\0';
1185 strcut(ptr); // ...cut after first word...
1187 if (!ptr || ptr[1]!='.' || strchr(ptr+3,'.')==NULL) strcpy(fam,""); else strcpy(fam,ptr); // could not find two '.' in name?
1190 else if (!strcmp("LENG",str4))
1193 L=strint(ptr); //read next integer (number of match states)
1196 else if (!strcmp("ALPH",str4)) {
1200 if (0 == strcmp(ptr, "amino")){
1203 else if (0 == strcmp(ptr, "Nucleic")){
1205 printf("%s:%s:%d: WARNING: HMM reading does not work for DNA/RNA\n",
1206 __FUNCTION__, __FILE__, __LINE__);
1209 return Warning(dbf,line,name);
1211 dAlphaInv = 1.00 / (double)(iAlpha);
1214 else if (!strcmp("RF ",str4)) continue;
1215 else if (!strcmp("CS ",str4)) continue;
1216 else if (!strcmp("MAP ",str4)) continue;
1217 else if (!strcmp("COM ",str4)) continue;
1218 else if (!strcmp("NSEQ",str4))
1221 N_in=N_filtered=strint(ptr); //read next integer: number of sequences after filtering
1224 else if (!strcmp("DATE",str4)) continue;
1225 else if (!strncmp("CKSUM ",line,5)) continue;
1226 else if (!strcmp("GA ",str4)) continue;
1227 else if (!strcmp("TC ",str4)) continue;
1228 else if (!strcmp("NC ",str4)) continue;
1230 //////////////////////////////////////////////////////////////////////////////////////////////////////
1233 else if (!strncmp("SADSS",line,5))
1238 seq[nsa_dssp] = new char[/*MAXRES*/par.maxResLen+2];
1239 sname[nsa_dssp] = new(char[15]);
1240 strcpy(seq[nsa_dssp]," ");
1241 strcpy(sname[nsa_dssp],"sa_dssp");
1248 if (strlen(seq[nsa_dssp])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1249 printf("\nWARNING: HMM %s has SADSS records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1250 else strcat(seq[nsa_dssp],ptr);
1254 else if (!strncmp("SSPRD",line,5))
1259 seq[nss_pred] = new char[/*MAXRES*/par.maxResLen+2];
1260 sname[nss_pred] = new(char[15]);
1261 strcpy(seq[nss_pred]," ");
1262 strcpy(sname[nss_pred],"ss_pred");
1269 if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1270 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1271 else strcat(seq[nss_pred],ptr);
1275 else if (!strncmp("SSCON",line,5))
1280 seq[nss_conf] = new char[/*MAXRES*/par.maxResLen+2];
1281 sname[nss_conf] = new(char[15]);
1282 strcpy(seq[nss_conf]," ");
1283 strcpy(sname[nss_conf],"ss_conf");
1289 if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(/*MAXRES*/par.maxResLen))
1290 printf("\nWARNING: HMM %s has SSPRD records with more than %i residues.\n",name,/*MAXRES*/par.maxResLen);
1291 else strcat(seq[nss_conf],ptr);
1295 else if (!strncmp("SSCIT",line,5)) continue;
1296 else if (!strcmp("XT ",str4)) continue;
1297 //////////////////////////////////////////////////////////////////////////////////////////////////////
1298 else if (!strncmp("STATS LOCAL",line,11)) continue;
1300 else if (!strcmp("EFFN",str4))
1303 float effn = strflt(ptr);
1304 // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d (fitted with scop25 dataset)
1305 Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5) - 0.2885410 * effn - 1.108568;
1308 /////////////////////////////////////////////////////////////////////////////////////
1309 // Read transition probabilities from start state
1310 else if (!strncmp("HMM",line,3))
1312 fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels
1313 fgetline(line,LINELEN-1,dbf); // Skip line with transition labels
1316 if (!strncmp("COMPO",ptr,5))
1319 for (a=0; a<20 && ptr; ++a)
1320 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1321 pb[s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
1322 if (!ptr) return Warning(dbf,line,name);
1326 for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]);
1329 fgetline(line,LINELEN-1,dbf); // Read next line
1332 fgetline(line,LINELEN-1,dbf); // Skip line with 0-states insert probabilities
1335 for (a=0; a<=D2D && ptr; ++a)
1336 tr[0][a] = Log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values
1337 // strinta returns next integer in string and puts ptr to first char
1338 // after the integer. Returns -99999 if '*' is found.
1339 // ptr is set to 0 if no integer is found after ptr.
1340 if (!ptr) return Warning(dbf,line,name);
1344 for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
1348 // Prepare to store DSSP states (if there are none, delete afterwards)
1350 seq[nss_dssp] = new char[/*MAXRES*/par.maxResLen+2];
1351 sname[nss_dssp] = new(char[15]);
1352 strcpy(sname[nss_dssp],"ss_dssp");
1354 /////////////////////////////////////////////////////////////////////////////////////
1355 // Read columns of HMM
1356 int next_i=0; // index of next column
1357 while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#')
1359 if (strscn(line)==NULL) continue; // skip lines that contain only white space
1361 // Read in AA probabilities
1363 int prev_i = next_i;
1364 next_i = strint(ptr); ++i;
1365 if (v && next_i!=prev_i+1)
1368 cerr<<endl<<"WARNING: in HMM "<<name<<" state "<<prev_i<<" is followed by state "<<next_i<<"\n";
1369 if (warn==5) cerr<<endl<<"WARNING: further warnings while reading HMMs will be suppressed.\n";
1373 cerr<<endl<<"Error: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
1377 cerr<<endl<<"WARNING: in HMM "<<name<<" there are more columns than the stated length "<<L<<"\n";
1378 if (i>=/*MAXRES*/par.maxResLen-2)
1380 fgetline(line,LINELEN-1,dbf); // Skip two lines
1381 fgetline(line,LINELEN-1,dbf);
1385 for (a=0; a<iAlpha && ptr; ++a){ /* FIXME: FS introduced iAlpha, was '20' */
1386 f[i][s2a[a]] = (float) exp(-1.0*strflta(ptr,99999));
1387 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1389 for (a = iAlpha; a < 20; a++){
1392 if (!ptr) return Warning(dbf,line,name);
1396 for (a=0; a<iAlpha; ++a) printf("%6.3g ",100*f[i][s2a[a]]);
1400 // Ignore MAP annotation
1401 ptr = strscn(line); //find next word
1402 ptr = strscn_ws(line); // ignore word
1404 // Read RF and CS annotation
1406 if (!ptr) return Warning(dbf,line,name);
1407 annotchr[i]=uprchr(*ptr);
1408 if (*ptr!='-' && *ptr!=' ') annot=1;
1415 seq[nss_dssp][i]=*ptr;
1420 seq[nss_dssp][i]=*ptr;
1425 seq[nss_dssp][i]=*ptr;
1430 seq[nss_dssp][i]=*ptr;
1435 seq[nss_dssp][i]=*ptr;
1440 seq[nss_dssp][i]=*ptr;
1445 seq[nss_dssp][i]=*ptr;
1452 seq[nss_dssp][i]=*ptr;
1454 case '-': // no SS available from any template
1455 case '.': // no clear consensus SS structure
1456 case 'X': // no clear consensus SS structure
1458 seq[nss_dssp][i]='-';
1462 seq[nss_dssp][i]=*ptr;
1466 // Read insert emission line
1467 fgetline(line,LINELEN-1,dbf);
1469 // Read seven transition probabilities
1470 fgetline(line,LINELEN-1,dbf);
1473 for (a=0; a<=D2D && ptr; ++a)
1474 tr[i][a] = Log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values
1475 if (!ptr) return Warning(dbf,line,name);
1479 for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a]));
1484 if (line[0]=='/' && line[1]=='/') break;
1486 } /* strncmp("HMM") */
1490 if (L==0) return 0; //End of db file -> stop reading in
1492 if (v && i!=L) cerr<<endl<<"Warning: in HMM "<<name<<" there are only "<<i<<" columns while the stated length is "<<L<<"\n";
1493 if (v && i>=/*MAXRES*/par.maxResLen-2) {i=/*MAXRES*/par.maxResLen-2; cerr<<endl<<"WARNING: maximum number "<</*MAXRES*/par.maxResLen-2<<" of residues exceeded while reading HMM "<<name<<"\n";}
1494 if (v && !i) cerr<<endl<<"WARNING: HMM "<<name<<" contains no match states. Check the alignment that gave rise to this HMM.\n";
1497 if (strlen(longname)>0) strcat(longname," ");
1498 strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC
1499 if (strlen(name)>0) strcat(longname," ");
1500 strncat(longname,desc,DESCLEN-strlen(longname)-1);
1501 longname[DESCLEN-1]='\0';
1502 ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4)
1503 RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file'
1505 // Secondary structure
1508 // remove dssp sequence
1509 delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1510 delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1514 else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; }
1518 for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]);
1520 for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]);
1522 for (i=1; i<=L; ++i) ss_conf[i] = 5;
1525 // Copy query (first sequence) and consensus residues?
1528 sname[k]=new(char[10]);
1529 strcpy(sname[k],"Consensus");
1530 sname[k+1]=new char[strlen(longname)+1];
1531 strcpy(sname[k+1],longname);
1532 seq[k]=new char[L+2];
1535 seq[k+1]=new char[L+2];
1538 for (i=1; i<=L; ++i)
1542 for (a=0; a<NAA; ++a)
1543 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1544 if (pmax>0.6) seq[k][i]=i2aa(amax);
1545 else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax));
1547 seq[k+1][i]=i2aa(amax);
1549 ncons=k++; // nfirst is set later!
1553 sname[k]=new char[strlen(longname)+1];
1554 strcpy(sname[k],longname);
1555 seq[k]=new char[L+2];
1560 if (annot) // read in some annotation characters?
1564 strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters
1566 else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1568 for (i=1; i<=L; ++i)
1572 for (a=0; a<NAA; ++a)
1573 if (f[i][a]>pmax) {amax=a; pmax=f[i][a];}
1574 seq[k][i]=i2aa(amax);
1577 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
1583 // If no effektive number of sequences is given, calculate Neff_HMM by given profile
1584 if (Neff_HMM == 0) {
1585 for (i=1; i<=L; ++i)
1588 for (a=0; a<20; ++a)
1589 if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]);
1590 Neff_HMM+=(float) fpow2(S);
1595 for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1597 Neff_I[L+1]=Neff_D[L+1]=0.0f;
1600 cout<<"Read in HMM "<<name<<" with "<<L<<" match states and effective number of sequences = "<<Neff_HMM<<"\n";
1602 ///////////////////////////////////////////////////////////////////
1604 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1605 for (a=0; a<20; ++a) f[0][a]=f[L+1][a]=pb[a];
1608 has_pseudocounts=true;
1610 return 1; //return status: ok
1613 } /* this is the end of HMM::ReadHMMer3() */
1616 //////////////////////////////////////////////////////////////////////////////
1618 * @brief Add transition pseudocounts to HMM (and calculate lin-space transition probs)
1621 HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf, float gapg, float gaph, float gapi, float gapb)
1623 int i; //position in alignment
1625 float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M;
1627 if (par.gapb<=0) return;
1628 if (trans_lin==1) {fprintf(stderr,"Error: Adding transition pseudocounts to linear representation of %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
1629 if (trans_lin==2) {fprintf(stderr,"Error: Adding transition pseudocounts twice is %s not allowed. Please report this error to the HHsearch developers.\n",name); exit(6);}
1632 // Calculate pseudocount transition probabilities
1633 pM2D=pM2I=gapd*0.0286; //a-priori probability for inserts and deletions
1635 // gape=0 -> pI2I=0 gape=1 -> pI2I=0.75 gape=inf -> pI2I=1.
1636 pI2I=1.0*gape/(gape-1+1.0/0.75);
1638 // gape=0 -> pD2D=0 gape=1 -> pD2D=0.75 gape=inf -> pD2D=1.
1639 pD2D=1.0*gape/(gape-1+1.0/0.75);
1642 for (i=0; i<=L; ++i) //for all columns in HMM
1644 // Transitions from M state
1645 p0 = (Neff_M[i]-1)*fpow2(tr[i][M2M]) + gapb*pM2M;
1646 p1 = (Neff_M[i]-1)*fpow2(tr[i][M2D]) + gapb*pM2D;
1647 p2 = (Neff_M[i]-1)*fpow2(tr[i][M2I]) + gapb*pM2I;
1648 if (i==0) p1=p2=0; //from M(0) no transition to D(1) and I(0) possible
1649 if (i==L) p1=p2=0; //from M(L) no transition to D(L+1) and I(L+1) possible
1650 sum = p0+p1+p2+FLT_MIN;
1653 // p1 = pow(p1/sum,gapf);
1654 // p2 = pow(p2/sum,gapg);
1655 // sum = p0+p1+p2+FLT_MIN;
1656 // tr[i][M2M] = fast_log2(p0/sum);
1657 // tr[i][M2D] = fast_log2(p1/sum);
1658 // tr[i][M2I] = fast_log2(p2/sum);
1660 tr[i][M2M] = fast_log2(p0/sum);
1661 tr[i][M2D] = fast_log2(p1/sum)*gapf;
1662 tr[i][M2I] = fast_log2(p2/sum)*gapg;
1664 // Transitions from I state
1665 p0 = Neff_I[i]*fpow2(tr[i][I2M]) + gapb*pI2M;
1666 p1 = Neff_I[i]*fpow2(tr[i][I2I]) + gapb*pI2I;
1667 sum = p0+p1+FLT_MIN;
1669 // p0 = pow(p0/sum,gapg);
1670 // p1 = pow(p1/sum,gapi);
1671 // sum = p0+p1+FLT_MIN;
1672 // tr[i][I2M] = fast_log2(p0/sum);
1673 // tr[i][I2I] = fast_log2(p1/sum);
1675 tr[i][I2M] = fast_log2(p0/sum);
1676 tr[i][I2I] = fast_log2(p1/sum)*gapi;
1678 // Transitions from D state
1679 p0 = Neff_D[i]*fpow2(tr[i][D2M]) + gapb*pD2M;
1680 p1 = Neff_D[i]*fpow2(tr[i][D2D]) + gapb*pD2D;
1681 if (i==L) p1=0; //from D(L) no transition to D(L+1) possible
1682 sum = p0+p1+FLT_MIN;
1684 // p0 = pow(p0/sum,gapf);
1685 // p1 = pow(p1/sum,gaph);
1686 // sum = p0+p1+FLT_MIN;
1687 // tr[i][D2M] = fast_log2(p0/sum);
1688 // tr[i][D2D] = fast_log2(p1/sum);
1690 tr[i][D2M] = fast_log2(p0/sum);
1691 tr[i][D2D] = fast_log2(p1/sum)*gaph;
1693 // SS-dependent gap penalties
1694 tr[i][M2M_GAPOPEN]=tr[i][M2M];
1701 printf("\nPseudocount transition probabilities:\n");
1702 printf("pM2M=%4.1f%%, pM2I=%4.1f%%, pM2D=%4.1f%%, ",100*pM2M,100*pM2I,100*pM2D);
1703 printf("pI2M=%4.1f%%, pI2I=%4.1f%%, ",100*pI2M,100*pI2I);
1704 printf("pD2M=%4.1f%%, pD2D=%4.1f%% ",100*pD2M,100*pD2D);
1705 printf("tau = %4.1f%%\n\n",100.*gapb/(Neff_HMM-1+gapb));
1706 printf("Listing transition probabilities WITH pseudocounts:\n");
1707 printf(" i dssp pred sacc M->M M->I M->D I->M I->I D->M D->D\n");
1709 for (i=1; i<=L; ++i) //for all columns in HMM
1711 printf("%4i %1c %1c %1c %6.3f %6.3f %6.3f ",i,i2ss(ss_dssp[i]),i2ss(ss_pred[i]),i2sa(sa_dssp[i]),fpow2(tr[i][M2M]),fpow2(tr[i][M2I]),fpow2(tr[i][M2D]));
1712 printf("%6.3f %6.3f ",fpow2(tr[i][I2M]),fpow2(tr[i][I2I]));
1713 printf("%6.3f %6.3f ",fpow2(tr[i][D2M]),fpow2(tr[i][D2D]));
1714 printf("%1i %2i %1i\n",ss_pred[i],ss_conf[i],ss_dssp[i]);
1717 printf("nss_dssp=%i nss_pred=%i\n",nss_dssp,nss_pred);
1723 //////////////////////////////////////////////////////////////////////////////
1725 * @brief Use secondary structure-dependent gap penalties
1726 * on top of the HMM transition penalties
1729 HMM::UseSecStrucDependentGapPenalties()
1731 int i; // column in HMM
1733 //unsigned char iis[MAXRES]; // inside-integer array
1734 unsigned char iis[par.maxResLen]; // inside-integer array
1735 float d; // Additional penalty for opening gap whithin SS element
1736 float e; // Additional penalty for extending gap whithin SS element
1738 // Determine inside-integers:
1739 // CCSTCCCHHHHHHHHHHHCCCCCEEEEECCSBGGGCCCCEECC
1740 // 0000000123444432100000012210000000000001000
1742 for (i=0; i<=L; ++i) // forward run
1744 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
1746 } for (i=0; i<=L; ++i)
1749 for (i=L; i>=0; i--) // backward run
1751 if (ss_dssp[i]==1 || ss_dssp[i]==2) {ii+=(ii<par.ssgapi);} else ii=0;
1752 iis[i-1]=imin(ii,iis[i-1]);
1755 // Add SS-dependent gap penalties to HMM transition penalties
1756 for (i=0; i<=L; ++i) //for all columns in HMM
1758 d=-iis[i]*par.ssgapd;
1759 e=-iis[i]*par.ssgape;
1762 tr[i][M2M_GAPOPEN]+=d;
1773 printf("Col SS II\n");
1774 for (i=0; i<=L; ++i) printf("%3i %c %2i\n",i,i2ss(ss_dssp[i]),iis[i]);
1779 /////////////////////////////////////////////////////////////////////////////////////
1781 * @brief Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
1784 HMM::PreparePseudocounts()
1786 for (int i=0; i<=L+1; ++i)
1787 for (int a=0; a<20; ++a)
1788 g[i][a] = // produces fast code
1789 R[a][0]*f[i][0] +R[a][1]*f[i][1] +R[a][2]*f[i][2] +R[a][3]*f[i][3] +R[a][4]*f[i][4]
1790 +R[a][5]*f[i][5] +R[a][6]*f[i][6] +R[a][7]*f[i][7] +R[a][8]*f[i][8] +R[a][9]*f[i][9]
1791 +R[a][10]*f[i][10]+R[a][11]*f[i][11]+R[a][12]*f[i][12]+R[a][13]*f[i][13]+R[a][14]*f[i][14]
1792 +R[a][15]*f[i][15]+R[a][16]*f[i][16]+R[a][17]*f[i][17]+R[a][18]*f[i][18]+R[a][19]*f[i][19];
1795 /////////////////////////////////////////////////////////////////////////////////////
1797 * @brief Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a]
1798 * Pseudocounts: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1801 HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc)
1803 int i; //position in HMM
1804 int a; //amino acid (0..19)
1806 float tau; //tau = pseudocount admixture
1808 for (a=0; a<20; ++a) pav[a]=pb[a]*100.0f/Neff_HMM; // initialize vector of average aa freqs with pseudocounts
1810 // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a]
1813 case 0: //no pseudocounts whatsoever: tau=0
1814 for (i=1; i<=L; ++i)
1815 for (a=0; a<20; ++a)
1816 pav[a] += ( p[i][a]=f[i][a] );
1818 case 1: //constant pseudocounts (for optimization): tau = pca
1820 for (i=1; i<=L; ++i)
1821 for (a=0; a<20; ++a)
1822 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1824 case 2: //divergence-dependent pseudocounts
1825 case 4: //divergence-dependent pseudocounts and rate matrix rescaling
1827 for (i=1; i<=L; ++i)
1829 tau = fmin(1.0, pca/(1. + Neff_M[i]/pcb ) );
1830 for (a=0; a<20; ++a)
1831 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1834 for (i=1; i<=L; ++i)
1836 tau = fmin(1.0, pca/(1. + pow((Neff_M[i])/pcb,pcc)));
1837 for (a=0; a<20; ++a)
1838 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1841 case 3: // constant-divergence pseudocounts
1842 for (i=1; i<=L; ++i)
1844 float x = Neff_M[i]/pcb;
1845 pca = 0.793 + 0.048*(pcb-10.0);
1846 tau = fmax(0.0, pca*(1-x + pcc*x*(1-x)) );
1847 for (a=0; a<20; ++a)
1848 pav[a] += ( p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a] );
1850 if (v>=2) { printf("Divergence before / after addition of amino acid pseudocounts: %5.2f / %5.2f\n",Neff_HMM, CalcNeff()); }
1852 } //end switch (pcm)
1855 // Normalize vector of average aa frequencies pav[a]
1856 NormalizeTo1(pav,NAA);
1858 for (a=0; a<20; ++a)
1859 p[0][a] = p[L+1][a] = pav[a];
1867 cout<<"No pseudocounts added (-pcm 0)\n";
1870 cout<<"Adding constant AA pseudocount admixture of "<<pca<<" to HMM "<<name<<"\n";
1873 cout<<"Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of "
1874 <<pca/(1.+pow((Neff_HMM-1.)/pcb,pcc))<<" to HMM "<<name<<"\n";
1876 } //end switch (pcm)
1877 cout<<"\nAverage amino acid frequencies WITH pseudocounts in HMM: \nProf: ";
1878 for (a=0; a<20; ++a) printf("%4.1f ",100*pav[a]);
1882 cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n";
1883 for (i=1; i<=L; ++i)
1887 for (a=0; a<20; ++a)
1890 printf("%4.1f ",100*f[i][a]);
1892 printf(" sum=%5.3f\n",sum);
1894 cout<<"\nAmino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n";
1895 for (i=1; i<=L; ++i)
1899 for (a=0; a<20; ++a)
1902 printf("%4.1f ",100*p[i][a]);
1904 printf(" sum=%5.3f\n",sum);
1912 /////////////////////////////////////////////////////////////////////////////////////
1914 * @brief Factor Null model into HMM t
1917 HMM::IncludeNullModelInHMM(HMM& q, HMM& t)
1920 int i,j; //query and template match state indices
1921 int a; //amino acid index
1923 switch (par.columnscore)
1926 case 0: // Null model with background prob. from database
1927 for (a=0; a<20; ++a) pnul[a]=pb[a];
1930 case 1: // Null model with background prob. equal average from query and template
1931 for (a=0; a<20; ++a) pnul[a]=0.5*(q.pav[a]+t.pav[a]);
1934 case 2: // Null model with background prob. from template protein
1935 for (a=0; a<20; ++a) pnul[a]=t.pav[a];
1938 case 3: // Null model with background prob. from query protein
1939 for (a=0; a<20; ++a) pnul[a]=q.pav[a];
1942 case 4: // Null model with background prob. equal average from query and template
1943 for (a=0; a<20; ++a) pnul[a]=sqrt(q.pav[a]*t.pav[a]);
1946 case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??)
1947 for (i=0; i<=q.L+1; ++i)
1950 for (a=0; a<20; ++a) sum += pb[a]*q.p[i][a];
1951 sum = 1.0/sqrt(sum);
1952 for (a=0; a<20; ++a) q.p[i][a]*=sum;
1954 for (j=0; j<=t.L+1; j++)
1957 for (a=0; a<20; ++a) sum += pb[a]*t.p[j][a];
1958 sum = 1.0/sqrt(sum);
1959 for (a=0; a<20; ++a) t.p[j][a]*=sum;
1963 case 11: // log co-emission probability (no null model)
1964 for (a=0; a<20; ++a) pnul[a]=0.05;
1969 // !!!!! ATTENTION!!!!!!! after this t.p is not the same as after adding pseudocounts !!!
1970 //Introduce amino acid weights into template (for all but SOP scores)
1971 if (par.columnscore!=10)
1972 for (a=0; a<20; ++a)
1973 for (j=0; j<=t.L+1; j++)
1978 cout<<"\nAverage amino acid frequencies\n";
1979 cout<<" A R N D C Q E G H I L K M F P S T W Y V\n";
1981 for (a=0; a<20; ++a) printf("%4.1f ",100*q.pav[a]);
1983 for (a=0; a<20; ++a) printf("%4.1f ",100*t.pav[a]);
1985 for (a=0; a<20; ++a) printf("%4.1f ",100*pnul[a]);
1987 for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]);
1995 /////////////////////////////////////////////////////////////////////////////////////
1997 * @brief Write HMM to output file
2000 HMM::WriteToFile(char* outfile)
2002 const int SEQLEN=100; // number of residues per line for sequences to be displayed
2005 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
2008 if (strcmp(outfile,"stdout"))
2010 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
2011 if (!outf) OpenFileError(outfile);
2015 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
2017 // fprintf(outf,"HHsearch HHM format 1.5\n");
2018 fprintf(outf,"HHsearch 1.5\n"); // format specification
2019 fprintf(outf,"NAME %s\n",longname); // name of first sequence
2020 fprintf(outf,"FAM %s\n",fam); // family name
2021 char file_nopath[NAMELEN];
2022 RemovePath(file_nopath,file);
2023 fprintf(outf,"FILE %s\n",file_nopath); // base name of alignment file
2025 // Print command line
2026 fprintf(outf,"COM ");
2027 for (int i=0; i<par.argc; i++)
2028 if (strlen(par.argv[i])<=100)
2029 fprintf(outf,"%s ",par.argv[i]);
2031 fprintf(outf,"<%i characters> ",(int)strlen(par.argv[i]));
2034 // print out date stamp
2035 time_t* tp=new(time_t);
2037 fprintf(outf,"DATE %s",ctime(tp));
2038 delete tp; tp = NULL; /* really? FS */
2040 // Print out some statistics of alignment
2041 fprintf(outf,"LENG %i match states, %i columns in multiple alignment\n",L,l[L]);
2042 fprintf(outf,"FILT %i out of %i sequences passed filter (-id %i -cov %i -qid %i -qsc %.2f -diff %i)\n",N_filtered,N_in,par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2043 fprintf(outf,"NEFF %-4.1f\n",Neff_HMM);
2045 // Print selected sequences from alignment (including secondary structure and confidence values, if known)
2046 fprintf(outf,"SEQ\n");
2047 for (int n=0; n<n_display; n++)
2049 fprintf(outf,">%s\n",sname[n]);
2050 //first sequence character starts at 1; 0 not used.
2051 for(unsigned int j=0; j<strlen(seq[n]+1); j+=SEQLEN) fprintf(outf,"%-.*s\n",SEQLEN,seq[n]+1+j);
2053 fprintf(outf,"#\n");
2055 // print null model background probabilities from substitution matrix
2056 fprintf(outf,"NULL ");
2057 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(pb[s2a[a]])*HMMSCALE ));
2060 // print table header line with amino acids
2061 fprintf(outf,"HMM ");
2062 for (a=0; a<20; ++a) fprintf(outf,"%1c\t",i2aa(s2a[a]));
2065 // print table header line with state transitions
2066 fprintf(outf," M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D\n");
2068 // print out transition probabilities from begin state (virtual match state)
2070 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[0][a]*HMMSCALE));
2071 fout(outf,iround(Neff_M[0]*HMMSCALE));
2072 fout(outf,iround(Neff_I[0]*HMMSCALE));
2073 fout(outf,iround(Neff_D[0]*HMMSCALE));
2076 // Start loop for printing HMM columns
2078 for (i=1; i<=L; ++i)
2081 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
2082 fprintf(outf,"%1c %-4i ",seq[nfirst][h++],i);
2084 // Print emission probabilities for match state
2085 for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(p[i][s2a[a]])*HMMSCALE ));
2086 fprintf(outf,"%-i",l[i]);
2089 // Print transition probabilities
2091 for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[i][a]*HMMSCALE));
2092 fout(outf,iround(Neff_M[i]*HMMSCALE));
2093 fout(outf,iround(Neff_I[i]*HMMSCALE));
2094 fout(outf,iround(Neff_D[i]*HMMSCALE));
2095 fprintf(outf,"\n\n");
2096 } // end for(i)-loop for printing HMM columns
2098 fprintf(outf,"//\n");
2102 /////////////////////////////////////////////////////////////////////////////////////
2104 * @brief Write HMM to output file
2107 HMM::InsertCalibration(char* infile)
2109 char* line = new(char[LINELEN]); // input line
2110 char** lines = new char*[3*L+100000];
2113 char done=0; // inserted new 'EVD mu sigma' line?
2115 // Read from infile all lines and insert the EVD line with lamda and mu coefficients
2117 inf.open(infile, ios::in);
2118 if (!inf) OpenFileError(infile);
2119 if (v>=2) cout<<"Recording calibration coefficients in "<<infile<<"\n";
2121 while (inf.getline(line,LINELEN) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
2124 // Found an EVD lamda mu line? -> remove
2125 while (!done && !strncmp(line,"EVD ",3) && !(line[0]=='/' && line[1]=='/') && nline<2*/*MAXRES*/par.maxResLen)
2126 inf.getline(line,LINELEN);
2127 if ((line[0]=='/' && line[1]=='/') || nline>=2*/*MAXRES*/par.maxResLen)
2128 {fprintf(stderr,"Error: wrong format in %s. Expecting hhm format\n",infile); exit(1);}
2130 // Found the SEQ line? -> insert calibration before this line
2131 if (!done && (!strncmp("SEQ",line,3) || !strncmp("HMM",line,3)) && (isspace(line[3]) || line[3]=='\0'))
2134 lines[nline]=new(char[128]);
2135 if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
2136 sprintf(lines[nline],"EVD %-7.4f %-7.4f",lamda,mu);
2139 lines[nline]=new char[strlen(line)+1];
2140 if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
2141 strcpy (lines[nline],line);
2146 // Write to infile all lines
2147 FILE* infout=fopen(infile,"w");
2149 cerr<<endl<<"WARNING in "<<program_name<<": no calibration coefficients written to "<<infile<<":\n";
2150 cerr<<"Could not open file for writing.\n";
2153 for (l=0; l<nline; l++) {
2154 fprintf(infout,"%s\n",lines[l]);
2155 delete[] lines[l]; lines[l] = NULL;
2157 fprintf(infout,"//\n");
2159 delete[] line; line = NULL;
2160 delete[] lines; lines = NULL;
2164 /////////////////////////////////////////////////////////////////////////////////////
2166 * @brief Write HMM to output file in HMMER format
2169 HMM::WriteToFileHMMER(char* outfile)
2171 const int INTSCALE=1000; //scaling factor in HMMER files
2172 const float pBD=0.50;
2173 const int LOG2pBD=iround(fast_log2(pBD)*INTSCALE);
2174 const int LOG2pBM=iround(fast_log2(1-pBD)*INTSCALE);
2175 const float pJB=1.0/350;
2176 const int LOG2pJB=iround(fast_log2(pJB)*INTSCALE);
2177 const int LOG2pJJ=iround(fast_log2(1-pJB)*INTSCALE);
2178 const float pEJ=0.5;
2179 const int LOG2pEJ=iround(fast_log2(pEJ)*INTSCALE);
2180 const int LOG2pEC=iround(fast_log2(1-pEJ)*INTSCALE);
2184 if (trans_lin) {fprintf(stderr,"Error: Writing transition pseudocounts in linear representation not allowed. Please report this error to the HHsearch developers.\n"); exit(6);}
2187 if (strcmp(outfile,"stdout"))
2189 if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w");
2190 if (!outf) OpenFileError(outfile);
2194 if (v>=2) cout<<"Writing HMM to "<<outfile<<"\n";
2196 fprintf(outf,"HMMER2.0 [hhmake %s]\n",VERSION_AND_DATE);
2197 fprintf(outf,"NAME %s\n",file); // base name of alignment file
2198 fprintf(outf,"DESC %s\n",longname);
2199 fprintf(outf,"LENG %i\n",L);
2200 fprintf(outf,"ALPH Amino\n"); // amino acid seuqences (not DNA)
2201 fprintf(outf,"RF yes\n"); // reference annotation flag
2202 fprintf(outf,"CS yes\n"); // consensus structure annotation flag
2203 fprintf(outf,"MAP yes\n"); // write MA column number after each line of aa probabilities
2205 fprintf(outf,"COM "); // print out command line
2206 for (i=0; i<=par.argc-1; ++i) fprintf(outf,"%s ",par.argv[i]); fprintf(outf,"\n");
2208 fprintf(outf,"NSEQ %i\n",N_filtered); // print number of sequences after filtering
2211 time_t* tp=new(time_t);
2213 fprintf(outf,"DATE %s",ctime(tp));
2214 delete tp; tp = NULL; /* really? FS */
2216 // Print out secondary structure
2218 fprintf(outf,"SSDSS %s\n",seq[nss_dssp]);
2220 fprintf(outf,"SADSS %s\n",seq[nsa_dssp]);
2222 fprintf(outf,"SSPRD %s\n",seq[nss_pred]);
2224 fprintf(outf,"SSCNF %s\n",seq[nss_conf]);
2227 // Special Plan7 transitions that control repeated detection of profile HMM within sequence
2228 fprintf(outf,"XT %6i %6i %6i %6i %6i %6i %6i %6i\n",LOG2pJB,LOG2pJJ,LOG2pEC,LOG2pEJ,LOG2pJB,LOG2pJJ,LOG2pJB,LOG2pJJ);
2229 fprintf(outf,"NULT -4 -8455\n");
2232 // Null model background probabilities from substitution matrix
2233 fprintf(outf,"NULE ");
2234 for (a=0; a<20; ++a)
2236 float lg2=fast_log2(pb[s2a[a]]*20.0);
2237 if (lg2<-99.999) fprintf(outf," *"); else fprintf(outf," %6i",iround(lg2*INTSCALE));
2241 // Table header line with amino acids
2242 fprintf(outf,"HMM ");
2243 for (a=0; a<20; ++a) fprintf(outf," %1c ",i2aa(s2a[a]));
2246 // Table header line with state transitions
2247 fprintf(outf," m->m m->i m->d i->m i->i d->m d->d b->m m->e\n");
2249 // Transition probabilities from begin state
2250 fprintf(outf," %6i * %6i\n",LOG2pBM,LOG2pBD);
2252 // Start loop for printing HMM columns
2254 for (i=1; i<=L; ++i)
2257 // Emission probabilities for match state
2258 fprintf(outf," %5i",i);
2259 for (a=0; a<20; ++a) fprintf(outf," %6i",imax(-9999,iround(fast_log2(p[i][s2a[a]]/pb[s2a[a]])*INTSCALE)));
2260 fprintf(outf," %5i",l[i]);
2263 // Emission probabilities (relative to null model) for insert state
2264 while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++;
2266 fprintf(outf," %1c * * * * * * * * * * * * * * * * * * * *\n",seq[nfirst][h++]);
2268 fprintf(outf," %1c 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",seq[nfirst][h++]);
2270 // Transition probabilities
2273 while(islower(seq[nss_dssp][hss]) && seq[nss_dssp][hss]) hss++;
2274 c=seq[nss_dssp][hss++];
2277 fprintf(outf," %1c",c);
2280 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
2281 fprintf(outf," %6i *\n",LOG2pBM);
2285 for (a=0; a<=D2D; ++a) fprintf(outf," *");
2286 fprintf(outf," * 0\n");
2290 for (a=0; a<=D2D; ++a) fprintf(outf," %6i",imax(-9999,iround(tr[i][a]*INTSCALE)));
2291 fprintf(outf," * *\n");
2295 fprintf(outf,"//\n");
2300 /////////////////////////////////////////////////////////////////////////////////////
2302 * @brief Transform log to lin transition probs
2305 HMM::Log2LinTransitionProbs(float beta)
2307 if (trans_lin==1) return;
2309 for (int i=0; i<=L; ++i)
2311 for (int a=0; a<NTRANS; ++a)
2312 tr[i][a] = fpow2(beta*tr[i][a]);
2313 /* FIXME valgrind says: "Conditional jump or move depends on
2314 * uninitialised value(s)" when using hmm iteration
2321 * @brief Set query columns in His-tags etc to Null model distribution
2324 HMM::NeutralizeTags()
2326 char* qseq = seq[nfirst];
2334 // Neutralize His tag
2335 if ( (pt=strstr(qseq,"HHHHH")) )
2338 if (v>=2) printf("Neutralized His-tag at position %i\n",i0);
2339 for (i=imax(i0-5,1); i<i0; ++i) // neutralize leading 5 columns
2340 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2341 for (; (*pt)!='H'; ++i,++pt) // neutralize His columns
2342 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2344 for (; i<imin(i0+5,L+1); ++i) // neutralize trailing 5 columns
2345 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2346 if (v>=3) printf("start:%i end:%i\n",imax(i0-5,1),i-1);
2349 // Neutralize C-myc tag
2350 if ( (pt=strstr(qseq,"EQKLISEEDL")) )
2352 if (v>=2) printf("Neutralized C-myc-tag at position %i\n",int(pt-qseq)+1);
2353 for (i=pt-qseq+1; i<=pt-qseq+10; ++i)
2354 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2356 // Neutralize FLAG tag
2357 if ( (pt=strstr(qseq,"DYKDDDDK")) )
2359 if (v>=2) printf("Neutralized FLAG-tag at position %i\n",int(pt-qseq)+1);
2360 for (i=pt-qseq+1; i<=pt-qseq+8; ++i)
2361 for (a=0; a<NAA; ++a) p[i][a]=pb[a];
2367 /////////////////////////////////////////////////////////////////////////////////////
2369 * @brief Calculate effective number of sequences using profiles INCLUDING pseudocounts
2375 for (int i=1; i<=L; ++i)
2376 for (int a=0; a<20; ++a)
2377 if (p[i][a]>1E-10) Neff-=p[i][a]*fast_log2(p[i][a]);
2378 return fpow2(Neff/L);
2382 /////////////////////////////////////////////////////////////////////////////////////
2384 * @brief Calculate consensus of HMM (needed to merge HMMs later)
2387 HMM::CalculateConsensus()
2389 int i; // position in query
2390 int a; // amino acid
2391 if (!Xcons) Xcons = new char[/*MAXRES*/par.maxResLen+2];
2392 for (i=1; i<=L; ++i)
2394 float max=f[i][0]-pb[0];
2395 for (a=1; a<20; ++a)
2396 if (f[i][a]-pb[a]>max) Xcons[i]=a;
2398 Xcons[0]=Xcons[L+1]=ENDGAP;
2401 // /////////////////////////////////////////////////////////////////////////////////////
2402 // // Store linear transition probabilities
2403 // /////////////////////////////////////////////////////////////////////////////////////
2404 // void HMM::StoreLinearTransitionProbs()
2406 // int i; // position in query
2407 // for (i=0; i<=L+1; ++i) if (!tr_lin[i]) tr_lin[i] = new(float[NTRANS]);
2408 // for (i=0; i<=L+1; ++i)
2410 // tr_lin[i][M2M] = fpow2(tr[i][M2M]);
2411 // tr_lin[i][M2I] = fpow2(tr[i][M2I]);
2412 // tr_lin[i][M2D] = fpow2(tr[i][M2D]);
2413 // tr_lin[i][D2M] = fpow2(tr[i][M2D]);
2414 // tr_lin[i][D2D] = fpow2(tr[i][D2D]);
2415 // tr_lin[i][I2M] = fpow2(tr[i][I2M]);
2416 // tr_lin[i][I2I] = fpow2(tr[i][I2I]);
2421 // #define Weff(Neff) (1.0+par.neffa*(Neff-1.0)+(par.neffb-4.0*par.neffa)/16.0*(Neff-1.0)*(Neff-1.0))
2423 // /////////////////////////////////////////////////////////////////////////////////////
2424 // // Initialize f[i][a] with query HMM
2425 // /////////////////////////////////////////////////////////////////////////////////////
2426 // void HMM::MergeQueryHMM(HMM& q, float wk[])
2428 // int i; // position in query
2429 // int a; // amino acid
2430 // float Weff_M, Weff_D, Weff_I;
2431 // for (i=1; i<=L; i++)
2433 // Weff_M = Weff(q.Neff_M[i]-1.0);
2434 // Weff_D = Weff(q.Neff_D[i]-1.0);
2435 // Weff_I = Weff(q.Neff_I[i]-1.0);
2436 // for (a=0; a<20; a++) f[i][a] = q.f[i][a]*wk[i]*Weff_M;
2437 // tr_lin[i][M2M] = q.tr_lin[i][M2M]*wk[i]*Weff_M;
2438 // tr_lin[i][M2I] = q.tr_lin[i][M2I]*wk[i]*Weff_M;
2439 // tr_lin[i][M2D] = q.tr_lin[i][M2D]*wk[i]*Weff_M;
2440 // tr_lin[i][D2M] = q.tr_lin[i][D2M]*wk[i]*Weff_D;
2441 // tr_lin[i][D2D] = q.tr_lin[i][D2D]*wk[i]*Weff_D;
2442 // tr_lin[i][I2M] = q.tr_lin[i][I2M]*wk[i]*Weff_I;
2443 // tr_lin[i][I2I] = q.tr_lin[i][I2I]*wk[i]*Weff_I;
2449 // /////////////////////////////////////////////////////////////////////////////////////
2450 // // Normalize probabilities in total merged super-HMM
2451 // /////////////////////////////////////////////////////////////////////////////////////
2452 // void HMM::NormalizeHMMandTransitionsLin2Log()
2454 // int i; // position in query
2455 // int a; // amino acid
2456 // for (i=0; i<=L+1; i++)
2459 // for (a=0; a<20; a++) sum += f[i][a];
2460 // for (a=0; a<20; a++) f[i][a]/=sum;
2461 // sum = tr_lin[i][M2M] + tr_lin[i][M2I] + tr_lin[i][M2D];
2462 // tr_lin[i][M2M] /= sum;
2463 // tr_lin[i][M2I] /= sum;
2464 // tr_lin[i][M2D] /= sum;
2465 // tr[i][M2M] = fast_log2(tr_lin[i][M2M]);
2466 // tr[i][M2I] = fast_log2(tr_lin[i][M2I]);
2467 // tr[i][M2D] = fast_log2(tr_lin[i][M2D]);
2468 // sum = tr_lin[i][D2M] + tr_lin[i][D2D];
2469 // tr_lin[i][D2M] /= sum;
2470 // tr_lin[i][D2D] /= sum;
2471 // tr[i][D2M] = fast_log2(tr_lin[i][D2M]);
2472 // tr[i][D2D] = fast_log2(tr_lin[i][D2D]);
2473 // sum = tr_lin[i][I2M] + tr_lin[i][I2I];
2474 // tr_lin[i][I2M] /= sum;
2475 // tr_lin[i][I2I] /= sum;
2476 // tr[i][I2M] = fast_log2(tr_lin[i][I2M]);
2477 // tr[i][I2I] = fast_log2(tr_lin[i][I2I]);
2482 // UNCOMMENT TO ACTIVATE COMPOSITIONALLY BIASED PSEUDOCOUNTS BY RESCALING THE RATE MATRIX
2484 // /////////////////////////////////////////////////////////////////////////////////////
2485 // //// Function to minimize
2486 // /////////////////////////////////////////////////////////////////////////////////////
2487 // double RescaleMatrixFunc(double x[])
2490 // for (int a=0; a<20; ++a)
2493 // for (int b=0; b<20; ++b) za+=P[a][b]*x[b];
2494 // sum += (x[a]*za-qav[a])*(x[a]*za-qav[a]);
2499 // /////////////////////////////////////////////////////////////////////////////////////
2500 // //// Gradient of function to minimize
2501 // /////////////////////////////////////////////////////////////////////////////////////
2502 // void RescaleMatrixFuncGrad(double grad[], double x[])
2504 // double z[20] = {0.0};
2507 // for (int a=0; a<20; ++a)
2508 // for (int b=0; b<20; ++b) z[a] += P[a][b]*x[b];
2510 // for (int a=0; a<20; ++a) w[a] = x[a]*z[a]-qav[a];
2511 // for (int a=0; a<20; ++a)
2514 // for (int b=0; b<20; ++b) tmp += P[a][b]*x[b]*w[b];
2515 // grad[a] = 2.0*tmp;
2521 // /////////////////////////////////////////////////////////////////////////////////////
2522 // //// Rescale a substitution matrix to biased aa frequencies in global vector qav[a]
2523 // /////////////////////////////////////////////////////////////////////////////////////
2524 // void HMM::RescaleMatrix()
2528 // double x[21]; // scaling factor
2530 // const int len=20;
2531 // const int max_iterations=50;
2533 // if (v>=2) printf("Adjusting rate matrix to query amino acid composition ...\n");
2535 // // Put amino acid frequencies into global array (needed to call WNLIB's conjugate gradient method)
2536 // for (a=0; a<20; ++a) qav[a] = pav[a];
2538 // // Initialize scaling factors x[a]
2539 // for (a=0; a<20; ++a) x[a]=pow(qav[a]/pb[a],0.73); // Initialize
2541 // // Call conjugate gradient minimization method from WNLIB
2542 // wn_conj_gradient_method(&code,&val_min,x,len,&RescaleMatrixFunc,&RescaleMatrixFuncGrad,max_iterations);
2545 // // Calculate z[a] = sum_b Pab*xb
2546 // float sum_err=0.0f;
2547 // float sum = 0.0f;
2548 // for (a=0; a<20; ++a)
2550 // float za=0.0f; // za = sum_b Pab*xb
2551 // for (b=0; b<20; ++b) za+=P[a][b]*x[b];
2552 // sum_err += (x[a]*za/qav[a]-1)*(x[a]*za/qav[a]-1);
2555 // if (sum_err>1e-3 & v>=1) fprintf(stderr,"WARNING: adjusting rate matrix by CG resulted in residual error of %5.3f.\n",sum_err);
2557 // // Rescale rate matrix
2558 // for (a=0; a<20; ++a)
2559 // for (b=0; b<20; ++b)
2561 // P[a][b] *= x[a]*x[b]/sum;
2562 // R[a][b] = P[a][b]/qav[b];
2565 // // How well approximated?
2568 // // Calculate z[a] = sum_b Pab*xb
2570 // for (a=0; a<20; ++a)
2571 // for (z[a]=0.0, b=0; b<20; ++b) z[a]+=P[a][b];
2572 // printf("Adjust A R N D C Q E G H I L K M F P S T W Y V\nErr? ");
2573 // for (a=0; a<20; ++a) printf("%4.0f ",1000*z[a]/qav[a]);
2574 // cout<<endl<<"xa ";
2575 // for (a=0; a<20; ++a) fprintf(stdout,"%4.2f ",x[a]);
2579 // // Evaluate sequence identity underlying substitution matrix
2583 // float entropy=0.0f;
2584 // float entropy_qav=0.0f;
2585 // float mut_info=0.0f;
2586 // for (a=0; a<20; ++a) id += P[a][a];
2587 // for (a=0; a<20; ++a) entropy_qav-=qav[a]*fast_log2(qav[a]);
2588 // for (a=0; a<20; ++a)
2589 // for (b=0; b<20; ++b)
2591 // entropy-=P[a][b]*fast_log2(R[a][b]);
2592 // mut_info += P[a][b]*fast_log2(P[a][b]/qav[a]/qav[b]);
2595 // fprintf(stdout,"Rescaling rate matrix: sequence identity = %2.0f%%; entropy per column = %4.2f bits (out of %4.2f); mutual information = %4.2f bits\n",100*id,entropy,entropy_qav,mut_info);
2601 /* @* HMM::ClobberGlobal (eg, q,t)
2604 HMM::ClobberGlobal(void){
2606 for (int i = 0; i < n_display; i++){
2608 delete[] sname[i]; sname[i] = NULL;
2611 delete[] seq[i]; seq[i] = NULL;
2614 Neff_M[0] = Neff_I[0] = Neff_D[0] = 0.0;
2615 longname[0] = '\0'; file[0] = '\0';
2616 ss_dssp[0] = sa_dssp[0] = ss_pred[0] = ss_conf[0] = '\0';
2621 n_display = N_in = N_filtered = 0;
2622 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
2623 lamda = 0.0; mu = 0.0;
2624 name[0] = longname[0] = fam[0] = '\0';
2626 for (int i = 0; i < NAA; i++){
2633 } /* this is the end of ClobberGlobal() */