********************************************************************/
/*
- * RCS $Id: hhhmm-C.h 224 2011-03-23 12:13:33Z fabian $
+ * RCS $Id: hhhmm-C.h 316 2016-12-16 16:14:39Z fabian $
*/
#include "hhdecl-C.h"
#include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
#endif
+//#include "new_new.h" /* memory tracking */
// #ifndef WNLIB
// #define WNLIB
//////////////////////////////////////////////////////////////////////////////
HMM::HMM(int maxseqdis, int maxres)
{
- sname = new char*[maxseqdis]; // names of stored sequences
- for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;}
- seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!)
- for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;}
- Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
- Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
- Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
- longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field)
- ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B
- sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed)
- ss_pred = new char[maxres]; // predicted secondary structure 0:- 1:H 2:E 3:C
- ss_conf = new char[maxres]; // confidence value of prediction 0:- 1:0 ... 10:9
- Xcons = NULL; // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
- l = new int[maxres]; // l[i] = pos. of j'th match state in aligment
- /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */
- f = new float*[maxres+1]; f[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
- g = new float*[maxres+1]; g[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
- p = new float*[maxres+1]; p[maxres] = NULL; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
- tr = new float*[maxres+1]; tr[maxres] = NULL; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
-// tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
- for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3]);}
- for (int i=0; i<maxres; i++) {g[i]=new(float[NAA]);}
- for (int i=0; i<maxres; i++) {p[i]=new(float[NAA]);}
- for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS]);}
-// for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]);
- L=0;
- Neff_HMM=0;
- n_display=N_in=N_filtered=0;
- nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
-// lamda_hash.New(37,0.0); // Set size and NULL element for hash
-// mu_hash.New(37,0.0); // Set size and NULL element for hash
- lamda=0.0; mu=0.0;
- name[0]=longname[0]=fam[0]='\0';
- trans_lin=0; // transition probs in log space
+ sname = new char*[maxseqdis](); // names of stored sequences
+ for (int i = 0; i < maxseqdis; i++){ sname[i] = NULL;}
+ seq = new char*[maxseqdis](); // residues of stored sequences (first at pos 1!)
+ for (int i = 0; i < maxseqdis; i++){ seq[i] = NULL;}
+ Neff_M = new float[maxres](); // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
+ Neff_I = new float[maxres](); // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
+ Neff_D = new float[maxres](); // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
+ longname = new char[DESCLEN](); // Full name of first sequence of original alignment (NAME field)
+ ss_dssp = new char[maxres](); // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B
+ sa_dssp = new char[maxres](); // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed)
+ ss_pred = new char[maxres](); // predicted secondary structure 0:- 1:H 2:E 3:C
+ ss_conf = new char[maxres](); // confidence value of prediction 0:- 1:0 ... 10:9
+ Xcons = NULL; // create only when needed: consensus sequence in internal representation (A=0 R=1 N=2 D=3 ...)
+ l = new int[maxres](); // l[i] = pos. of j'th match state in aligment
+ /* FS introduced sentinel, NULL terminates loop in destructor, FS, r221->222 */
+ f = new float*[maxres+1](); f[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
+ g = new float*[maxres+1](); g[maxres] = NULL; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
+ p = new float*[maxres+1](); p[maxres] = NULL; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
+ tr = new float*[maxres+1](); tr[maxres] = NULL; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
+ // tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D M2M_GAPOPEN GAPOPEN GAPEXTD
+ for (int i=0; i<maxres; i++) {f[i]=new(float[NAA+3])();}
+ for (int i=0; i<maxres; i++) {g[i]=new(float[NAA])();}
+ for (int i=0; i<maxres; i++) {p[i]=new(float[NAA])();}
+ for (int i=0; i<maxres; i++) {tr[i]=new(float[NTRANS])();}
+ // for (int i=0; i<maxres; i++) tr_lin[i]=new(float[NTRANS]);
+ L=0;
+ Neff_HMM=0;
+ n_display=N_in=N_filtered=0;
+ nss_dssp=nsa_dssp=nss_pred=nss_conf=nfirst=ncons=-1;
+ // lamda_hash.New(37,0.0); // Set size and NULL element for hash
+ // mu_hash.New(37,0.0); // Set size and NULL element for hash
+ lamda=0.0; mu=0.0;
+ name[0]=longname[0]=fam[0]='\0';
+ trans_lin=0; // transition probs in log space
}
n_display=q.n_display;
for (int k=0; k<n_display; k++) {
- sname[k]=new(char[strlen(q.sname[k])+1]);
+ sname[k]=new char[strlen(q.sname[k])+1];
if (!sname[k]) MemoryError("array of names for sequences to display");
strcpy(sname[k],q.sname[k]);
}
for (int k=0; k<n_display; k++) {
- seq[k]=new(char[strlen(q.seq[k])+1]);
+ seq[k]=new char[strlen(q.seq[k])+1];
if (!seq[k]) MemoryError("array of names for sequences to display");
strcpy(seq[k],q.seq[k]);
}
else if (!strcmp("SEQ",str3))
{
//char cur_seq[MAXCOL]=""; //Sequence currently read in
- char *cur_seq = new(char[par.maxColCnt]); //Sequence currently read in
+ char *cur_seq = new char[par.maxColCnt]; //Sequence currently read in
int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n
int h; // index for character in input line
int l=1; // index of character in sequence seq[k]
//If this is not the first sequence then store residues of previous sequence
if (k>0) {
- seq[k-1]=new(char[strlen(cur_seq)+1]);
+ seq[k-1]=new char[strlen(cur_seq)+1];
if (!seq[k-1]) MemoryError("array of sequences to display");
strcpy(seq[k-1],cur_seq);
}
// store sequence name
strcut(line+1); //find next white-space character and overwrite it with end-of-string character
- sname[k] = new (char[strlen(line+1)+1]); //+1 for terminating '\0'
+ sname[k] = new char[strlen(line+1)+1]; //+1 for terminating '\0'
if (!sname[k]) MemoryError("array of names for sequences to display");
strcpy(sname[k],line+1); //store sequence name in **name
l=1; i=1;
} //while(getline)
//If this is not the first sequence some residues have already been read in
if (k>=0) {
- seq[k]=new(char[strlen(cur_seq)+1]);
+ seq[k]=new char[strlen(cur_seq)+1];
if (!seq[k]) MemoryError("array of sequences to display");
strcpy(seq[k],cur_seq);
}
if (nsa_dssp<0)
{
nsa_dssp=k++;
- seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nsa_dssp] = new char[/*MAXRES*/par.maxResLen+2];
sname[nsa_dssp] = new(char[15]);
strcpy(seq[nsa_dssp]," ");
strcpy(sname[nsa_dssp],"sa_dssp");
if (nss_pred<0)
{
nss_pred=k++;
- seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nss_pred] = new char[/*MAXRES*/par.maxResLen+2];
sname[nss_pred] = new(char[15]);
strcpy(seq[nss_pred]," ");
strcpy(sname[nss_pred],"ss_pred");
if (nss_conf<0)
{
nss_conf=k++;
- seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nss_conf] = new char[/*MAXRES*/par.maxResLen+2];
sname[nss_conf] = new(char[15]);
strcpy(seq[nss_conf]," ");
strcpy(sname[nss_conf],"ss_conf");
// Prepare to store DSSP states (if there are none, delete afterwards)
nss_dssp=k++;
- seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nss_dssp] = new char[/*MAXRES*/par.maxResLen+2];
sname[nss_dssp] = new(char[15]);
strcpy(sname[nss_dssp],"ss_dssp");
{
sname[k]=new(char[10]);
strcpy(sname[k],"Consensus");
- sname[k+1]=new(char[strlen(longname)+1]);
+ sname[k+1]=new char[strlen(longname)+1];
strcpy(sname[k+1],longname);
- seq[k]=new(char[L+2]);
+ seq[k]=new char[L+2];
seq[k][0]=' ';
seq[k][L+1]='\0';
- seq[k+1]=new(char[L+2]);
+ seq[k+1]=new char[L+2];
seq[k+1][0]=' ';
seq[k+1][L+1]='\0';
for (i=1; i<=L; ++i)
}
else
{
- sname[k]=new(char[strlen(longname)+1]);
+ sname[k]=new char[strlen(longname)+1];
/* FIXME valgrind says bytes get lost here during hmm iteration --
fixed in HMM::ClobberGlobal(), I (FS) think */
strcpy(sname[k],longname);
- seq[k]=new(char[L+2]);
+ seq[k]=new char[L+2];
seq[k][0]=' ';
seq[k][L+1]='\0';
}
int
HMM::ReadHMMer3(FILE* dbf, char* filestr)
{
- char line[LINELEN]=""; // input line
- char desc[DESCLEN]=""; // description of family
- char str4[5]=""; // first 4 letters of input line
- char* ptr; // pointer for string manipulation
- int i=0; // index for match state (first=1)
- int a; // amino acid index
- char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
- char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
- int k=0; // index for seq[k]
+ char line[LINELEN]=""; // input line
+ char desc[DESCLEN]=""; // description of family
+ char str4[5]=""; // first 4 letters of input line
+ char* ptr; // pointer for string manipulation
+ int i=0; // index for match state (first=1)
+ int a; // amino acid index
+ char dssp=0; // 1 if a consensus SS has been found in the transition prob lines
+ char annot=0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
+ int k=0; // index for seq[k]
char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
//annotchr = new char[MAXRES]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
annotchr = new char[par.maxResLen]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
if (nsa_dssp<0)
{
nsa_dssp=k++;
- seq[nsa_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nsa_dssp] = new char[/*MAXRES*/par.maxResLen+2];
sname[nsa_dssp] = new(char[15]);
strcpy(seq[nsa_dssp]," ");
strcpy(sname[nsa_dssp],"sa_dssp");
if (nss_pred<0)
{
nss_pred=k++;
- seq[nss_pred] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nss_pred] = new char[/*MAXRES*/par.maxResLen+2];
sname[nss_pred] = new(char[15]);
strcpy(seq[nss_pred]," ");
strcpy(sname[nss_pred],"ss_pred");
if (nss_conf<0)
{
nss_conf=k++;
- seq[nss_conf] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nss_conf] = new char[/*MAXRES*/par.maxResLen+2];
sname[nss_conf] = new(char[15]);
strcpy(seq[nss_conf]," ");
strcpy(sname[nss_conf],"ss_conf");
ptr = strscn(line);
for (a=0; a<=D2D && ptr; ++a)
- tr[0][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values
+ tr[0][a] = Log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values
// strinta returns next integer in string and puts ptr to first char
// after the integer. Returns -99999 if '*' is found.
// ptr is set to 0 if no integer is found after ptr.
// Prepare to store DSSP states (if there are none, delete afterwards)
nss_dssp=k++;
- seq[nss_dssp] = new(char[/*MAXRES*/par.maxResLen+2]);
+ seq[nss_dssp] = new char[/*MAXRES*/par.maxResLen+2];
sname[nss_dssp] = new(char[15]);
strcpy(sname[nss_dssp],"ss_dssp");
ptr+=2;
for (a=0; a<=D2D && ptr; ++a)
- tr[i][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values
+ tr[i][a] = Log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values
if (!ptr) return Warning(dbf,line,name);
if (v>=4)
{
{
sname[k]=new(char[10]);
strcpy(sname[k],"Consensus");
- sname[k+1]=new(char[strlen(longname)+1]);
+ sname[k+1]=new char[strlen(longname)+1];
strcpy(sname[k+1],longname);
- seq[k]=new(char[L+2]);
+ seq[k]=new char[L+2];
seq[k][0]=' ';
seq[k][L+1]='\0';
- seq[k+1]=new(char[L+2]);
+ seq[k+1]=new char[L+2];
seq[k+1][0]=' ';
seq[k+1][L+1]='\0';
for (i=1; i<=L; ++i)
}
else
{
- sname[k]=new(char[strlen(longname)+1]);
+ sname[k]=new char[strlen(longname)+1];
strcpy(sname[k],longname);
- seq[k]=new(char[L+2]);
+ seq[k]=new char[L+2];
seq[k][0]=' ';
seq[k][L+1]='\0';
}
HMM::InsertCalibration(char* infile)
{
char* line = new(char[LINELEN]); // input line
- char** lines = new(char*[3*L+100000]);
+ char** lines = new char*[3*L+100000];
int nline=0;
int l;
char done=0; // inserted new 'EVD mu sigma' line?
sprintf(lines[nline],"EVD %-7.4f %-7.4f",lamda,mu);
nline++;
}
- lines[nline]=new(char[strlen(line)+1]);
+ lines[nline]=new char[strlen(line)+1];
if (!lines[nline]) MemoryError("space to read in HHM file for calibration");
strcpy (lines[nline],line);
nline++;