1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhfunc-C.h 256 2011-06-23 13:57:28Z fabian $
22 * Changelog: Michael Remmert made changes to hhalign stand-alone code
23 * FS implemented some of the changes on 2010-11-11 -> MR1
25 * did not incorporate SS prediction PSIpred (yet); functions are:
34 * AddBackgroundFrequencies()
36 * @brief add background frequencies (derived from HMM) to
39 * @param[in,out] ppfFreq,
40 * [in] residue frequencies of sequence/profile,
41 * [out] overlayed with HMM background frequencies
42 * @param[in,out] ppfPseudoF,
43 * [in] residue frequencies+pseudocounts of sequence/profile,
44 * [out] overlayed with HMM background frequencies+pseudocounts
46 * length of sequence/profile (not aligned to HMM)
50 * sequences/profile to be 'softened'
51 * @param[in] pcPrealigned,
52 * sequence aligned to HMM, this is not quite a consensus,
53 * it is identical to 1st sequence but over-writes gap information,
54 * if other sequences in profile have (non-gap) residues
56 * number of sequences pre-aligned (pcPrealigned is 'consensus' of these sequences)
57 * @param[in] pcRepresent
58 * sequence representative of HMM, aligned to pcSeq0
61 AddBackgroundFrequencies(float **ppfFreq, float **ppfPseudoF, float **ppfPseudoTr,
62 int iSeqLen, hmm_light *prHMM,
63 char **ppcSeq, char *pcPrealigned, int iPreCnt, char *pcRepresent)
66 char *pcS = pcPrealigned; /* current residue in pre-aligned sequence */
67 char *pcH = pcRepresent; /* current residue in pre-aligned HMM */
68 int iS = 0; /* position in un-aligned sequence (corresponds to pcS) */
69 int iH = 0; /* position in un-aligned HMM (corresponds to pcH) */
70 int iA; /* residue iterator */
71 int iT; /* transition state iterator */
72 float fFWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud frequencies */ /* FIXME: tune value, 0.50 default */
73 //float fFWeight = 0.75;
74 float fFThgiew = UNITY - fFWeight; /* weight of 'true' frequencies */
75 float fGWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */
76 //float fGWeight = 0.50 /*/ (float)(iPreCnt)*/; /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */
77 float fGThgiew = UNITY - fGWeight; /* weight of 'true' transitions */
79 /* zf1SeqTrans[] are the (phenomenological) transition probabilities (+ pseudo-counts)
80 for a single sequence. It is almost certain (0.99) to stay in a match state, and very
81 unlikely (0.05) to go from match to insertion/deletion */
82 float zf1SeqTrans[] = {0.98, 0.01, 0.01, 0.25, 0.75, 0.25, 0.75};
83 float zf1SeqInit[] = {0.98, 0.01, 0.01, 0.99, 0.01, 0.99, 0.01};
84 float zf1SeqDel[] = {0.24, 0.01, 0.75, 0.01, 0.01, 0.01, 0.01};
85 float zf1SeqRevrt[] = {0.98, 0.01, 0.01, 0.01, 0.01, 0.99, 0.01};
87 if ( (NULL == pcPrealigned) || (NULL == pcRepresent) ){
88 /*printf("%s/%s:%d: WARNING HMM=NULL -- didn't think I would get here (carry on, no danger)\n",
89 __FUNCTION__, __FILE__, __LINE__);*/
93 if (NULL == prHMM->p){
94 printf("%s:%s:%d: WARNING -- Background Pseudocounts point to NULL\n"
95 "\tthis is not intended - don't add background but CONTINUE\n",
96 __FUNCTION__, __FILE__, __LINE__);
101 /* FIXME: should be 0 (FS thinks) but -1 gives better results */
102 iH = iS = 0/*-1*//*+1*/;
103 while ( ('\0' != *pcS) && ('\0' != *pcH) ){
105 if ( ('-' != *pcH) && ('-' != *pcS) ){
111 * - HMM had a gap in previous position (now closed)
112 * FIXME: this does not really work */
113 if ((iH > 0) && ('-' == *(pcH-1))){
115 for (iT = 0; iT < STATE_TRANSITIONS; iT++){
116 ppfPseudoTr[iS-1][iT] = log2f(zf1SeqRevrt[iT]);
122 /* do frequencies -- this is not really useful;
123 frequencies are derived from HMM
124 and contain already pseudocounts (PCs),
125 adding frequencies and then PCs will add PCs _twice_
126 results are better than not to add them,
127 but not as good as PCs */
128 for (iA = 0; iA < AMINOACIDS; iA++){
129 fAux = fFThgiew * ppfFreq[iS][iA] + fFWeight * prHMM->f[iH][iA];
130 ppfFreq[iS][iA] = fAux;
133 /* do pseudo-counts */
134 for (iA = 0; iA < AMINOACIDS; iA++){
136 fFThgiew * ppfPseudoF[iS][iA] + fFWeight * prHMM->p[iH][iA];
137 ppfPseudoF[iS][iA] = fAux;
139 #if 0 /* do state transitions */
140 for (iT = 0; iT < STATE_TRANSITIONS; iT++){
142 /* this is a very crude method,
143 which averages the logarithms of the transitions,
144 effectively performing a geometric mean -
145 this presumably violates normalisation.
146 however, results are surprisingly good */
148 fGThgiew * ppfPseudoTr[iS][iT] +
149 fGWeight * prHMM->tr[iH][iT];
150 ppfPseudoTr[iS][iT] = fAux;
151 #else /* crude averaging */
152 /* There are 2 things to consider:
153 (1) one should really blend the probabilities of
154 the transitions, however, by default we have
155 the logarithms thereof.
156 So must exponentiate, blend, and then take log again.
157 This is expensive, and does not seem to lead to better
158 results than blending the logarithms
159 (and violating normalisation)
160 (2) transition probabilities for a single sequence
161 are really easy, there are no insert/delete transitions.
162 However, there is a begin state that is different
164 But again, this does not seem to make a blind bit
169 fGThgiew * zf1SeqTrans[iT] +
170 fGWeight * prHMM->linTr[iH][iT];
171 ppfPseudoTr[iS][iT] = log2f(fAux);
175 fGThgiew * zf1SeqInit[iT] +
176 fGWeight * prHMM->linTr[iH][iT];
177 ppfPseudoTr[iS][iT] = log2f(fAux);
179 #endif /* mixing of linear/log */
180 } /* 0 <= iT < STATE_TRANSITIONS */
181 #endif /* did state transitions */
182 } /* was Match -- neither HMM nor sequence have gap */
184 else if ('-' == *pcH){
185 /* sequence opened up gap in HMM */
187 if ((iH > 0) && ('-' != *(pcH-1)) && (iS > 0)){
188 /* this is the first gap in HMM
189 * FIXME: this does not really work */
190 for (iT = 0; iT < STATE_TRANSITIONS; iT++){
191 ppfPseudoTr[iS-1][iT] = log2f(zf1SeqDel[iT]);
195 /* do nothing, keep single sequence values exactly as they are*/
200 else if ('-' == *pcS){
201 /* here the single sequence has a gap,
202 and the HMM (as a whole) does not. There may be individual gaps
203 in the HMM at this stage. By ignoring this we say that the HMM
204 dominates the overall behaviour - as in the M2M state as well
216 } /* this is the end of AddBackgroundFrequencies() */
223 * @brief Read HMM input file or transfer alignment
224 * and add pseudocounts etc.
226 * @param[in] iRnPtype
227 * type of read/prepare
228 * enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};
232 * number of seqs in alignment
233 * @param[in,out] prHMM,
234 * [in] if sequences read/prepared, [out] if HMM from file
235 * @param[in] pcPrealigned,
236 * (single) sequence aligned to background HMM
237 * @param[in] pcRepresent,
238 * sequence representing HMM aligned to individual sequence
239 * param[in] pdExWeights
240 * (external) sequence weights, derived from tree
242 * name of file with HMM info (formerly also alignment)
244 * HMM structure with transition probabilities, residue frequencies etc
246 * FIXME: what is qali?
248 * @return FAILURE on error, OK otherwise
251 ReadAndPrepare(int iRnPtype,
252 char **ppcProf, int iCnt, hmm_light *prHMM,
253 char *pcPrealigned, char *pcRepresent, double *pdExWeights,
254 char* infile, HMM& q, Alignment* qali=NULL) {
256 //#ifndef CLUSTALO_NOFILE
259 /* NOTE: there are different scenarios:
261 (i) ("" != infile) - read HMM from file,
262 transfer frequency/transition/pseudo-count (f/t/p) info into prHMM
264 (ii) ('\0' != ppcProf[0]) - transfer sequence/alignment into q/qali,
265 don't save f/t/p into prHMM,
266 on the contrary, if prior f/t/p available then add it to q/qali,
267 this is only done if (1==iCnt)
269 (iii) ('\0' == ppcProf[0]) - re-cycle old HMM information
270 recreate a HMM from previous data
273 /********************************/
274 /*** (o) Recycle internal HMM ***/
275 /********************************/
276 if ( (INTERN_ALN_2_HMM == iRnPtype) && (iCnt <= 0) ){
278 /* NOTE: here we are writing into a HMM structure/class;
279 memory has been allocated for this in hhalign.cpp;
280 however, as iCnt<=0, there may not be memory for
281 prHMM->n_display sequences/names.
282 But then, there doesn't have to be.
283 At this stage we are just copying one HMM into another HMM,
284 sequences are irrelevant. The only sequence of (marginal)
285 interest is the consensus sequence */
286 /* FIXME: check that prHMM is valid -- how? */
288 const int ciCons = 0;
289 const int ciNoof = ciCons+1;
290 const int ciInvalid = -1;
291 q.n_display = ciNoof; /* only consensus */
294 q.nfirst = ciCons;//prHMM->nfirst;
295 q.nss_dssp = ciInvalid;//prHMM->nss_dssp;
296 q.nsa_dssp = ciInvalid;//prHMM->nsa_dssp;
297 q.nss_pred = ciInvalid;//prHMM->nss_pred;
298 q.nss_conf = ciInvalid;//prHMM->nss_conf;
300 q.N_in = prHMM->N_in;
301 q.N_filtered = prHMM->N_filtered;
302 /* NOTE: I (FS) think that only ever 1 sequence will be transferred here,
303 that is, the consensus sequence. However, we might want to allow
304 (in the future) to transfer more sequences,
305 hence the awkward for() loop */
307 for (int i = prHMM->ncons+0; i < prHMM->ncons+q.n_display; i++){
308 /* NOTE: In the original hhalign code the first position
309 is kept open ('\0'). This makes it difficult to use
310 string functions like strlen/strdup etc.
311 Insert a temporary gap (.) to facilitate string operations */
312 char cHead = prHMM->seq[i][0];
313 prHMM->seq[i][0] = '.';
314 q.seq[i] = strdup(prHMM->seq[i]);
315 prHMM->seq[i][0] = q.seq[i][0] = cHead;
319 char cHead = prHMM->seq[prHMM->ncons][0];
320 prHMM->seq[prHMM->ncons][0] = '.';
321 q.seq[q.ncons] = strdup(prHMM->seq[prHMM->ncons]);
322 prHMM->seq[prHMM->ncons][0] = q.seq[q.ncons][0] = cHead;
325 const int NEFF_SCORE = 10; /* FIXME: Magic Number */
326 for (int i = 0; i < prHMM->L+1; i++){
327 q.Neff_M[i] = prHMM->Neff_M[i];
328 q.Neff_I[i] = prHMM->Neff_I[i];
329 q.Neff_D[i] = prHMM->Neff_D[i];
331 q.Neff_HMM = prHMM->Neff_HMM;
332 /* skip longname,name,file,fam,sfam,fold,cl */
333 q.lamda = prHMM->lamda;
336 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
337 rShadow.copyShadowToHMM(q, *prHMM);
339 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
340 /* pav already done in copyShadowToHMM */
346 } /* INTERN_ALN_2_HMM && iCnt<=0 */
348 /******************************/
349 /*** (i) Read HMM from file ***/
350 /******************************/
351 char line[LINELEN] = {0}; // input line
353 //if ( (0 != strcmp(infile,"")) /*&& (iCnt > 0)*/ )
354 if ( (READ_HMM_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype)) {
356 if (0 == strcmp(infile,"")){
358 "\texpected to re %s from file but no file specified\n"
360 , __FUNCTION__, __FILE__, __LINE__
361 , (READ_HMM_2_HMM==iRnPtype?"HMM":"alignment"));
364 inf = fopen(infile, "r");
365 if (!inf) OpenFileError(infile);
366 Pathname(path,infile);
371 //if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
375 fgetline(line,LINELEN-1,inf);
378 //if ( (0 != strcmp(infile,"")) && (iCnt > 0) )
379 if ( (READ_HMM_2_HMM == iRnPtype) ){
381 if (0 == strcmp(infile,"")){
382 printf("%s:%s:%d: expected to read HMM from file but no file-name\n",
383 __FUNCTION__, __FILE__, __LINE__);
386 // Is infile a HMMER3 file? /* MR1 */
387 if (!strncmp(line,"HMMER3",6))
390 cout<<"Query file is in HMMER3 format\n";
391 cout<<"WARNING! Use of HMMER3 format as input results in dramatically loss of sensitivity!\n";
394 // Read 'query HMMER file
396 q.ReadHMMer3(inf,path);
398 // Don't add transition pseudocounts to query!!
400 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
401 q.PreparePseudocounts();
403 // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a]
404 q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
406 /* further down there is a q.Log2LinTransitionProbs
407 but only if (iCnt>0), however, we still need it it here (i think),
408 there is no danger of doing this twice, as trans_lin is checked
409 FIXME (FS, 2011-01-12) */
410 /* further down there is a q.Log2LinTransitionProbs
411 but only if (iCnt>0), however, we still need it it here (i think),
412 there is no danger of doing this twice, as trans_lin is checked
413 FIXME (FS, 2011-01-12) */
414 //if (par.forward >= 1)
416 q.Log2LinTransitionProbs(1.0);
420 // ... or Is infile an old HMMER file?
421 else if (!strncmp(line,"HMMER",5)) {
423 cout<<"Query file is in HMMER format\n";
424 cout<<"WARNING! Use of HMMER format as input results in dramatically loss of sensitivity!\n";
427 // Read 'query HMMER file
428 q.ReadHMMer(inf,path);
429 if (v>=2 && q.Neff_HMM>11.0)
430 fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);
432 // Don't add transition pseudocounts to query!!
434 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
435 q.PreparePseudocounts();
437 // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a]
438 q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
440 /* further down there is a q.Log2LinTransitionProbs
441 but only if (iCnt>0), however, we still need it it here (i think),
442 there is no danger of doing this twice, as trans_lin is checked
443 FIXME (FS, 2011-01-12) */
444 //if (par.forward >= 1)
446 q.Log2LinTransitionProbs(1.0);
449 } /* it was a HMMer file */
451 // ... or is it an hhm file?
452 else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) {
454 if (v>=2) cout<<"Query file is in HHM format\n";
456 // Rewind to beginning of line and read query hhm file
459 if (v>=2 && q.Neff_HMM>11.0)
460 fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);
462 // Add transition pseudocounts to query -> q.p[i][a]
463 q.AddTransitionPseudocounts();
465 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
466 q.PreparePseudocounts();
468 // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
469 q.AddAminoAcidPseudocounts();
471 } /* it was a HHM file */
473 fprintf(stderr, "%s:%s:%d: Unknown HMM format in infile\n"
474 "infile=%s, #seq=%d\n"
475 , __FUNCTION__, __FILE__, __LINE__
482 /*** transfer class info to struct */
483 prHMM->n_display = q.n_display;
485 prHMM->seq = (char **)calloc((q.n_display+1), sizeof(char *));
486 /* FIXME valgrind says bytes get lost in the above calloc during
489 for (int i = 0; i < q.n_display; i++){
490 /* NOTE: In the original hhalign code the first position
491 is kept open ('\0'). This makes it difficult to use
492 string functions like strlen/strdup etc.
493 Insert a temporary gap (.) to facilitate string operations */
494 char cHead = q.seq[i][0];
496 prHMM->seq[i] = strdup(q.seq[i]);
497 q.seq[i][0] = prHMM->seq[i][0] = cHead;
499 prHMM->ncons = q.ncons;
500 prHMM->nfirst = q.nfirst;
501 prHMM->nss_dssp = q.nss_dssp;
502 prHMM->nsa_dssp = q.nsa_dssp;
503 prHMM->nss_pred = q.nss_pred;
504 prHMM->nss_conf = q.nss_conf;
506 prHMM->N_in = q.N_in;
507 prHMM->N_filtered = q.N_filtered;
508 const int NEFF_SCORE = 10; /* FIXME: Magic Number */
509 prHMM->Neff_M = (float *)calloc(prHMM->L+1, sizeof(float));
510 prHMM->Neff_I = (float *)calloc(prHMM->L+1, sizeof(float));
511 prHMM->Neff_D = (float *)calloc(prHMM->L+1, sizeof(float));
512 /*for (int i = 0; i < prHMM->L+1; i++){
513 prHMM->Neff_M[i] = prHMM->Neff_I[i] = prHMM->Neff_D[i] = NEFF_SCORE;
515 prHMM->Neff_HMM = q.Neff_HMM;
516 /* skip longname,name,file,fam,sfam,fold,cl */
517 prHMM->lamda = q.lamda;
520 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
521 rShadow.copyHMMtoShadow(q);
523 prHMM->f = (float **)calloc(prHMM->L+1, sizeof(float *));
524 prHMM->g = (float **)calloc(prHMM->L+1, sizeof(float *));
525 prHMM->p = (float **)calloc(prHMM->L+1, sizeof(float *));
526 prHMM->tr = (float **)calloc(prHMM->L+1, sizeof(float *));
527 prHMM->linTr = (float **)calloc(prHMM->L+1, sizeof(float *));
528 for (int i = 0; i < prHMM->L+1; i++){
529 prHMM->f[i] = (float *)calloc(AMINOACIDS, sizeof(float));
530 prHMM->g[i] = (float *)calloc(AMINOACIDS, sizeof(float));
531 prHMM->p[i] = (float *)calloc(AMINOACIDS, sizeof(float));
532 for (int j = 0; j < AMINOACIDS; j++){
533 prHMM->f[i][j] = (float)rShadow.f[i][j];
534 prHMM->g[i][j] = (float)rShadow.g[i][j];
535 prHMM->p[i][j] = (float)rShadow.p[i][j];
537 prHMM->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
538 prHMM->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
539 for (int j = 0; j< STATE_TRANSITIONS; j++){
540 prHMM->tr[i][j] = (float)rShadow.tr[i][j];
541 prHMM->linTr[i][j] = fpow2(rShadow.tr[i][j]);
543 } /*0 <= i < prHMM->L+1 */
544 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
545 for (int j = 0; j < AMINOACIDS; j++){
546 prHMM->pav[j] = (float)rShadow.pav[j];
550 } /* have read HHM from file */
551 /*else if ( ((NULL != ppcProf) && (iCnt > 0) && ('\0' != ppcProf[0][0])) ||
552 ( (0 != strcmp(infile,"") && (0 == iCnt) )) )*/
553 else if ( (INTERN_ALN_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype) ) {
555 if ( (INTERN_ALN_2_HMM == iRnPtype) &&
556 ( (NULL == ppcProf) || (iCnt <= 0) || ('\0' == ppcProf[0][0]) ) ){
557 printf("%s:%s:%d: was expecting internal alignment but\n"
558 "\tppcProf=%p, #seq=%d, ppcProf[0][0]=%c\n"
559 , __FUNCTION__, __FILE__, __LINE__
560 , ppcProf, iCnt, ppcProf[0][0]);
563 else if ( (READ_ALN_2_HMM == iRnPtype) &&
564 (0 == strcmp(infile,"")) ){
565 printf("%s:%s:%d: was expecting to read alignment from file but no filename\n"
566 , __FUNCTION__, __FILE__, __LINE__);
570 /*******************************/
571 /*** (ii) it is an alignment ***/
572 /*******************************/
573 /* transfer alignment information from clustal character array
574 into pali/q/t classes */
576 * NOTE that emissions in HMMer file format contain pseudo-counts.
577 * HHM file format does not contain emission pseudo-counts.
578 * the structure that stores background HMM information does contain pseudo-counts
582 pali=new Alignment(iCnt); /* FIXME: pass in iCnt to get rid of MAXSEQ */
589 printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
590 printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
594 if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";
596 /* Read alignment from infile into matrix X[k][l] as ASCII
597 (and supply first line as extra argument) */
599 if (INTERN_ALN_2_HMM == iRnPtype){
600 pali->Transfer(ppcProf, iCnt);
602 //else if (0 == iCnt)
603 else if (READ_ALN_2_HMM == iRnPtype){
604 pali->Read(inf, infile, line);
607 printf("%s:%s:%d: FATAL problem\n"
608 "infile = (%s), #seq = %d\n"
609 , __FUNCTION__, __FILE__, __LINE__
614 /* Convert ASCII to int (0-20), throw out all insert states,
615 record their number in I[k][i]
616 and store marked sequences in name[k] and seq[k] */
617 pali->Compress(infile);
619 /* Sort out the nseqdis most dissimilar sequences for display
620 in the output alignments */
621 pali->FilterForDisplay(par.max_seqid, par.coverage, par.qid,
622 par.qsc,par.nseqdis);
624 // Filter alignment for min score per column with core query profile, defined by coverage_core and qsc_core
625 //if (par.coresc>-10) pali->HomologyFilter(par.coverage_core, par.qsc_core, par.coresc);
627 /* Remove sequences with seq. identity larger than seqid percent
628 (remove the shorter of two) */
629 pali->N_filtered = pali->Filter(par.max_seqid, par.coverage,
630 par.qid, par.qsc, par.Ndiff);
632 /* Calculate pos-specific weights,
633 AA frequencies and transitions -> f[i][a], tr[i][a] */
634 pali->FrequenciesAndTransitions(q);
635 if (v>=2 && q.Neff_HMM>11.0)
636 fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);
638 /*printf("%d %d %f %d (N,Nf,Neff,L) %s:%s:%d\n"
639 , q.N_in, q.N_filtered, q.Neff_HMM, q.L, __FUNCTION__, __FILE__, __LINE__);*/
641 // Add transition pseudocounts to query -> p[i][a]
642 q.AddTransitionPseudocounts();
644 /* Generate an amino acid frequency matrix from f[i][a]
645 with full pseudocount admixture (tau=1) -> g[i][a] */
646 q.PreparePseudocounts();
648 /* Add amino acid pseudocounts to query:
649 p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] */
650 q.AddAminoAcidPseudocounts();
652 /* ****** add aligned background pseudocounts ***** */
653 HMMshadow rShadowQ = {0};
654 rShadowQ.copyHMMtoShadow(q);
656 AddBackgroundFrequencies(rShadowQ.f, rShadowQ.p, rShadowQ.tr,
658 q.seq, pcPrealigned, iCnt, pcRepresent);
662 delete(pali); pali = NULL;
665 } /* else if (NULL != ppcProf) // not hmmer/hhalign but alignment */
667 //else if ((prHMM->L > 0) && ('\0' == ppcProf[0][0]))
668 else if (INTERN_HMM_2_HMM == iRnPtype){
670 /******************************************/
671 /*** (iii) re-cycle old HMM information ***/
672 /******************************************/
675 printf("%s:%s:%d: was expecting to copy HMM structure but L=%d\n"
676 , __FUNCTION__, __FILE__, __LINE__, prHMM->L);
679 printf("%s:%s:%d: RE-CYCLING HMM\n", __FUNCTION__, __FILE__, __LINE__);
682 q.n_display = prHMM->n_display;
684 for (int i = 0; i < q.n_display; i++){
685 /* NOTE: In the original hhalign code the first position
686 is kept open ('\0'). This makes it difficult to use
687 string functions like strlen/strdup etc.
688 Insert a temporary gap (.) to facilitate string operations */
689 char cHead = prHMM->seq[i][0];
690 prHMM->seq[i][0] = '.';
691 q.seq[i] = strdup(prHMM->seq[i]);
692 q.seq[i][0] = prHMM->seq[i][0] = cHead;
694 q.nfirst = prHMM->nfirst;
698 char cHead = prHMM->seq[prHMM->nfirst][0];
699 prHMM->seq[prHMM->nfirst][0] = '.';
700 q.seq[0] = strdup(prHMM->seq[prHMM->nfirst]);
701 q.seq[q.n_display-1][0] = prHMM->seq[prHMM->nfirst][0] = cHead;
703 q.ncons = prHMM->ncons;
704 q.nss_dssp = prHMM->nss_dssp;
705 q.nsa_dssp = prHMM->nsa_dssp;
706 q.nss_pred = prHMM->nss_pred;
707 q.nss_conf = prHMM->nss_conf;
709 q.N_in = prHMM->N_in;
710 q.N_filtered = prHMM->N_filtered;
711 #define NEFF_SCORE 10 /* FIXME: Magic Number */
712 /*for (int i; i < prHMM->L+1; i++){
713 q.Neff_M[i] = q.Neff_I[i] = q.Neff_D[i] = NEFF_SCORE;
715 q.Neff_HMM = prHMM->Neff_HMM;
716 /* skip longname,name,file,fam,sfam,fold,cl */
717 q.lamda = prHMM->lamda;
720 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
721 rShadow.copyShadowToHMM(q, *prHMM);
726 if (par.forward>=1) q.Log2LinTransitionProbs(1.0);
735 } /*** end: ReadAndPrepare() ***/
745 FreeHMMstruct(hmm_light *prHMM)
749 if (NULL != prHMM->f){
750 for (i = 0; i < prHMM->L+1; i++){
751 if (NULL != prHMM->f[i]){
752 free(prHMM->f[i]); prHMM->f[i] = NULL;
754 } /* 0 <= i < prHMM->L+1 */
755 free(prHMM->f); prHMM->f = NULL;
757 if (NULL != prHMM->g){
758 for (i = 0; i < prHMM->L+1; i++){
759 if (NULL != prHMM->g[i]){
760 free(prHMM->g[i]); prHMM->g[i] = NULL;
762 } /* 0 <= i < prHMM->L+1 */
763 free(prHMM->g); prHMM->g = NULL;
765 if (NULL != prHMM->p){
766 for (i = 0; i < prHMM->L+1; i++){
767 if (NULL != prHMM->p[i]){
768 free(prHMM->p[i]); prHMM->p[i] = NULL;
770 } /* 0 <= i < prHMM->L+1 */
771 free(prHMM->p); prHMM->p = NULL;
773 if (NULL != prHMM->tr){
774 for (i = 0; i < prHMM->L+1; i++){
775 if (NULL != prHMM->tr[i]){
776 free(prHMM->tr[i]); prHMM->tr[i] = NULL;
778 } /* 0 <= i < prHMM->L+1 */
779 free(prHMM->tr); prHMM->tr = NULL;
781 if (NULL != prHMM->linTr){
782 for (i = 0; i < prHMM->L+1; i++){
783 if (NULL != prHMM->linTr[i]){
784 free(prHMM->linTr[i]); prHMM->linTr[i] = NULL;
786 } /* 0 <= i < prHMM->L+1 */
787 free(prHMM->linTr); prHMM->linTr = NULL;
790 if (NULL != prHMM->Neff_M){
791 free(prHMM->Neff_M); prHMM->Neff_M = NULL;
793 if (NULL != prHMM->Neff_I){
794 free(prHMM->Neff_I); prHMM->Neff_I = NULL;
796 if (NULL != prHMM->Neff_D){
797 free(prHMM->Neff_D); prHMM->Neff_D = NULL;
800 if (NULL != prHMM->seq){
801 for (i = 0; i < prHMM->n_display; i++){
802 if (NULL != prHMM->seq[i]){
803 free(prHMM->seq[i]); prHMM->seq[i] = NULL;
806 free(prHMM->seq); prHMM->seq = NULL;
809 memset(prHMM, 0, sizeof(hmm_light));
811 } /*** end: FreeHMMstruct() ***/
815 * @brief comparisin function for ascending qsort() of doubles
818 CompDblAsc(const void *pv1, const void *pv2){
820 double d1 = *(double *)pv1;
821 double d2 = *(double *)pv2;
823 if (d1 > d2) { return +1; }
824 else if (d1 < d2) { return -1; }
827 } /*** end: CompDblAsc() ***/
833 * @brief convert alignment into HMM (hhmake)
836 * structure with pseudocounts etc
837 * @param[in] pcHMM_input
838 * name of file with HMM
842 AlnToHMM2(hmm_light *rHMM_p,
843 char **ppcSeq, int iN){
847 SetSubstitutionMatrix();
852 const int ciGoodMeasureSeq = 10;
853 int iAuxLen = strlen(ppcSeq[0]);
854 par.maxResLen = iAuxLen;
855 par.maxResLen += ciGoodMeasureSeq;
856 par.maxColCnt = iAuxLen + ciGoodMeasureSeq;
857 par.max_seqid=DEFAULT_FILTER;
858 par.loc=0; par.mact=0;
859 /* some minor parameters, affecting alignment (i think) */
860 par.p = 0.0; /* minimum threshold for inclusion in hit list */
861 par.Z = 100; /* minimum threshold for inclusion in hit list and alignment listing */
862 par.z = 1; /* min number of lines in hit list */
863 par.B = 100; /* max number of alignments */
864 par.b = 1; /* min number of alignments */
865 par.E = 1e6; /* maximum threshold for inclusion in hit list and alignment listing */
866 par.altali=1;par.hitrank=0;par.showcons=1; par.showdssp=1;par.showpred=1;par.nseqdis=iN;par.cons=1;
869 const int ciGoodMeasure = 10;
870 int iLen = strlen(ppcSeq[0]) + ciGoodMeasure;
871 if (iLen > par.maxResLen){
872 par.maxResLen = par.maxColCnt = iLen;
874 HMM rTemp(iN+2, iLen); /* temporary template */
875 Alignment rTempAli(iN+2, iLen); /* temporary alignment */
876 int iParCons = par.cons;
878 /*printf(">>>>>>>>>>> %s:%s:%d: there are %d sequences\n", __FUNCTION__, __FILE__, __LINE__, iN);*/
881 if (OK != ReadAndPrepare(INTERN_ALN_2_HMM,
883 NULL/*prealigned*/, NULL/*representative*/, NULL/*weights*/, //YES/*transfer*/,
884 (char*)("")/*in-file*/, rTemp, &rTempAli)) {
890 /*** transfer class info to struct */
891 rHMM_p->n_display = rTemp.n_display;
892 rHMM_p->sname = NULL;
893 rHMM_p->seq = (char **)calloc((rTemp.n_display+1), sizeof(char *));
895 for (int i = 0; i < rTemp.n_display; i++){
896 /* NOTE: In the original hhalign code the first position
897 is kept open ('\0'). This makes it difficult to use
898 string functions like strlen/strdup etc.
899 Insert a temporary gap (.) to facilitate string operations */
900 char cHead = rTemp.seq[i][0];
901 rTemp.seq[i][0] = '.';
902 rHMM_p->seq[i] = strdup(rTemp.seq[i]);
903 rTemp.seq[i][0] = rHMM_p->seq[i][0] = cHead;
905 rHMM_p->ncons = rTemp.ncons;
906 rHMM_p->nfirst = rTemp.nfirst;
907 if (-1 == rHMM_p->ncons){
908 /* ncons needed later for alignment of
909 representative sequence and copy of profile.
910 ncons not always set (-1 default),
911 this will cause segmentation fault.
912 nfirst (probably) right index -
914 rHMM_p->ncons = rTemp.nfirst;
916 rHMM_p->nss_dssp = rTemp.nss_dssp;
917 rHMM_p->nsa_dssp = rTemp.nsa_dssp;
918 rHMM_p->nss_pred = rTemp.nss_pred;
919 rHMM_p->nss_conf = rTemp.nss_conf;
921 rHMM_p->N_in = rTemp.N_in;
922 rHMM_p->N_filtered = rTemp.N_filtered;
923 #define NEFF_SCORE 10 /* FIXME: Magic Number */ //// get rid of that, FS, 2010-12-22
924 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float));
925 rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float));
926 rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float));
927 for (int i = 0; i < rHMM_p->L+1; i++){
928 rHMM_p->Neff_M[i] = rHMM_p->Neff_I[i] = rHMM_p->Neff_D[i] = NEFF_SCORE; //// get rid of that, FS, 2010-12-22
930 rHMM_p->Neff_HMM = rTemp.Neff_HMM;
931 /* skip longname,name,file,fam,sfam,fold,cl */
932 rHMM_p->lamda = rTemp.lamda;
933 rHMM_p->mu = rTemp.mu;
935 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
936 rShadow.copyHMMtoShadow(rTemp);
938 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float));
939 rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float));
940 rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float));
941 rHMM_p->f = (float **)calloc(rHMM_p->L+1, sizeof(float *));
942 rHMM_p->g = (float **)calloc(rHMM_p->L+1, sizeof(float *));
943 rHMM_p->p = (float **)calloc(rHMM_p->L+1, sizeof(float *));
944 rHMM_p->tr = (float **)calloc(rHMM_p->L+1, sizeof(float *));
945 rHMM_p->linTr = (float **)calloc(rHMM_p->L+1, sizeof(float *));
947 for (int i = 0; i < rHMM_p->L+1; i++){
948 rHMM_p->Neff_M[i] = (float)rShadow.Neff_M[i];
949 rHMM_p->Neff_I[i] = (float)rShadow.Neff_I[i];
950 rHMM_p->Neff_D[i] = (float)rShadow.Neff_D[i];
951 rHMM_p->f[i] = (float *)calloc(AMINOACIDS, sizeof(float));
952 rHMM_p->g[i] = (float *)calloc(AMINOACIDS, sizeof(float));
953 rHMM_p->p[i] = (float *)calloc(AMINOACIDS, sizeof(float));
954 for (int j = 0; j < AMINOACIDS; j++){
955 rHMM_p->f[i][j] = (float)rShadow.f[i][j];
956 rHMM_p->g[i][j] = (float)rShadow.g[i][j];
957 rHMM_p->p[i][j] = (float)rShadow.p[i][j];
959 rHMM_p->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
960 rHMM_p->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
961 for (int j = 0; j< STATE_TRANSITIONS; j++){
962 rHMM_p->tr[i][j] = (float)rShadow.tr[i][j];
963 rHMM_p->linTr[i][j] = fpow2(rShadow.tr[i][j]);
965 } /*0 <= i < prHMM->L+1 */
966 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
967 for (int j = 0; j < AMINOACIDS; j++){
968 rHMM_p->pav[j] = (float)rShadow.pav[j];
976 rTemp.ClobberGlobal();
977 rTempAli.ClobberGlobal();
981 } /*** end of AlnToHMM2() ***/
987 * @brief turn alignment (from file) into HHM (HMM) on file
989 * @param[out] hmm_out
990 * name of file with HMM info corresponding to tmp_aln
992 * name of file with alignment, to be turned into HMM (HHM)
996 HHMake_Wrapper(char *tmp_aln, char *hmm_out)
999 HMM rTemp; /* temporary template */
1000 Alignment rTempAli; /* temporary alignment */
1001 hmm_light *rHMM_p = NULL;
1004 this is a wrapper for a stand-alone program hhmake.
1005 hhmake uses a different set of parameters than hhalign.
1006 However, all parameters are GLOBAL.
1007 So at this point we register the hhalign settings,
1008 reset them to hhmake settings and set them back
1009 at the end of the function
1012 /* save settings from hhalign */
1013 int iParShowcons=par.showcons;
1014 int iParAppend=par.append;
1015 int iParNseqdis=par.nseqdis;
1016 int iParMark=par.mark;
1017 int iParMaxSeqid=par.max_seqid;
1018 int iParQid=par.qid;
1019 double dParQsc=par.qsc;
1020 int iParCoverage=par.coverage;
1021 int iParNdiff=par.Ndiff;
1022 int iParCoverageCore=par.coverage_core;
1023 double dParQscCore=par.qsc_core;
1024 double dParCoresc=par.coresc;
1026 int iParMgaps=par.Mgaps;
1027 int iParMatrix=par.matrix;
1028 int iParPcm=par.pcm;
1029 double dParPcw=par.pcw;
1030 double dParGapb=par.gapb;
1033 /* these are settings suitable for hhmake */
1034 par.showcons=1; // write consensus sequence into hhm file
1035 par.append=0; // overwrite output file
1036 par.nseqdis=10; // maximum number of query or template sequences to be recoreded in HMM and diplayed in output alignments
1037 par.mark=0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed
1038 par.max_seqid=90; // default for maximum sequence identity threshold
1039 par.qid=0; // default for maximum sequence identity threshold
1040 par.qsc=-20.0f; // default for minimum score per column with query
1041 par.coverage=0; // default for minimum coverage threshold
1042 par.Ndiff=100; // pick Ndiff most different sequences from alignment
1043 par.coverage_core=30; // Minimum coverage for sequences in core alignment
1044 par.qsc_core=0.3f; // Minimum score per column with core alignment (HMM)
1045 par.coresc=-20.0f; // Minimum score per column with core alignment (HMM)
1046 par.M=1; // match state assignment is by A2M/A3M
1047 par.Mgaps=50; // above this percentage of gaps, columns are assigned to insert states
1048 par.matrix=0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62
1049 par.pcm=0; // no amino acid and transition pseudocounts added
1050 par.pcw=0; // wc>0 weighs columns according to their intra-clomun similarity
1051 par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts!
1052 par.wg=0; // 0: use local sequence weights 1: use local ones
1055 if (OK != ReadAndPrepare(READ_ALN_2_HMM,
1056 NULL, 0, rHMM_p, NULL, NULL, NULL,
1057 tmp_aln, rTemp, &rTempAli)) {
1061 rTemp.WriteToFile(hmm_out);
1065 /* restore settings from hhalign */
1066 par.showcons=iParShowcons;
1067 par.append=iParAppend;
1068 par.nseqdis=iParNseqdis;
1070 par.max_seqid=iParMaxSeqid;
1073 par.coverage=iParCoverage;
1074 par.Ndiff=iParNdiff;
1075 par.coverage_core=iParCoverageCore;
1076 par.qsc_core=dParQscCore;
1077 par.coresc=dParCoresc;
1079 par.Mgaps=iParMgaps;
1080 par.matrix=iParMatrix;
1088 rTemp.ClobberGlobal();
1089 rTempAli.ClobberGlobal();
1097 * @brief read HMM from file, copy (relevant) info into struct
1100 * structure with pseudocounts etc, probably uninitialised on entry
1101 * @param[in] pcHMM_input
1102 * name of file with HMM
1106 readHMMWrapper(hmm_light *rHMM_p,
1109 par.maxResLen = 15002;
1110 HMM rTemp(1000,par.maxResLen); /* temporary template */
1111 Alignment rTempAli; /* temporary alignment */
1112 /* AW changed init from {0} to 0 because it failed to compile with
1113 * gcc 4.3.3 with the following error:
1114 * error: braces around initializer for non-aggregate type
1116 /* FS taken out initialiser alltogether */
1118 /* 0th arg of RnP is the type of RnP, ie,
1119 enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};*/
1120 /* 1st arg of ReadAndPrepare() is ppcSeqs, 2nd is #seq */
1121 /* FIXME: at this stage the 3rd arg, rHMM_p, only acts as a dummy.
1122 this is rather silly, as it is the actual struct that will
1123 carry the HMM info at the end.
1124 If we write it already in ReadAndPrepare() then we could
1125 dispense with all this friend-class nonsense */
1126 if (OK != ReadAndPrepare(READ_HMM_2_HMM,
1127 NULL, 0, rHMM_p, NULL, NULL, NULL,
1128 pcHMM_input, rTemp, &rTempAli)) {
1132 /* an important piece of information I want to get out of here
1133 is the consenssus sequence. there are however certain
1134 Pfam HMMs that don't trigger consensus calculation.
1135 at the moment I (FS) don't understand why this is
1136 (or rather why this is not). The proper place to do this
1137 should be inside ReadAndPrepare():ReadHMMer(), but
1138 I have not quite penetrated the logic there.
1139 Therefore I try to catch this condition at this point (here)
1142 if (-1 == rHMM_p->ncons){
1144 rHMM_p->ncons = rHMM_p->nfirst;
1146 for (i = 0; i < rHMM_p->L; i++){
1147 double dPmax = 0.00;
1149 for (iA = 0; iA < AMINOACIDS; iA++){
1150 if (rHMM_p->f[i][iA] > dPmax){
1152 dPmax = rHMM_p->f[i][iA];
1154 } /* (0 <= iA < AMINOACIDS) */
1155 rHMM_p->seq[rHMM_p->ncons][i] = i2aa(iAmax);
1156 } /* (0 <= i < rHMM_p->L) */
1158 } /* ncons not set */
1161 rTemp.ClobberGlobal();
1162 rTempAli.ClobberGlobal();
1166 } /*** end: readHMMWrapper() ***/
1173 /////////////////////////////////////////////////////////////////////////////
1175 * @brief Do precalculations for q and t to prepare comparison
1178 PrepareTemplate(HMM& q, HMM& t, int format)
1180 if (format==0) // HHM format
1182 // Add transition pseudocounts to template
1183 t.AddTransitionPseudocounts();
1185 // Generate an amino acid frequency matrix from t.f[i][a] with max pseudocounts (tau=1) -> t.g[i][a]
1186 t.PreparePseudocounts();
1188 // Add amino acid pseudocounts to template: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1189 t.AddAminoAcidPseudocounts();
1191 else // HHMER format
1193 // Don't add transition pseudocounts to template
1194 // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);
1196 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1197 t.PreparePseudocounts();
1199 // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a]
1200 t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
1203 // Modify transition probabilities to include SS-dependent penalties
1204 if (par.ssgap) t.UseSecStrucDependentGapPenalties();
1206 if (par.forward>=1) t.Log2LinTransitionProbs(1.0);
1208 // Factor Null model into HMM t
1209 // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p
1210 t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template
1216 /*** end of hhfunc-C.h ***/