1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhfunc-C.h 309 2016-06-13 14:10:11Z fabian $
22 * Changelog: Michael Remmert made changes to hhalign stand-alone code
23 * FS implemented some of the changes on 2010-11-11 -> MR1
25 * did not incorporate SS prediction PSIpred (yet); functions are:
32 //#include "new_new.h" /* memory tracking */
35 * AddBackgroundFrequencies()
37 * @brief add background frequencies (derived from HMM) to
40 * @param[in,out] ppfFreq,
41 * [in] residue frequencies of sequence/profile,
42 * [out] overlayed with HMM background frequencies
43 * @param[in,out] ppfPseudoF,
44 * [in] residue frequencies+pseudocounts of sequence/profile,
45 * [out] overlayed with HMM background frequencies+pseudocounts
47 * length of sequence/profile (not aligned to HMM)
51 * sequences/profile to be 'softened'
52 * @param[in] pcPrealigned,
53 * sequence aligned to HMM, this is not quite a consensus,
54 * it is identical to 1st sequence but over-writes gap information,
55 * if other sequences in profile have (non-gap) residues
57 * number of sequences pre-aligned (pcPrealigned is 'consensus' of these sequences)
58 * @param[in] pcRepresent
59 * sequence representative of HMM, aligned to pcSeq0
62 AddBackgroundFrequencies(float **ppfFreq, float **ppfPseudoF, float **ppfPseudoTr,
63 int iSeqLen, hmm_light *prHMM,
64 char **ppcSeq, char *pcPrealigned, int iPreCnt, char *pcRepresent)
67 char *pcS = pcPrealigned; /* current residue in pre-aligned sequence */
68 char *pcH = pcRepresent; /* current residue in pre-aligned HMM */
69 int iS = 0; /* position in un-aligned sequence (corresponds to pcS) */
70 int iH = 0; /* position in un-aligned HMM (corresponds to pcH) */
71 int iA; /* residue iterator */
72 //int iT; /* transition state iterator */
73 float fFWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud frequencies */ /* FIXME: tune value, 0.50 default */
74 //float fFWeight = 0.75;
75 float fFThgiew = UNITY - fFWeight; /* weight of 'true' frequencies */
76 //float fGWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */
77 //float fGWeight = 0.50 /*/ (float)(iPreCnt)*/; /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */
78 //float fGThgiew = UNITY - fGWeight; /* weight of 'true' transitions */
81 if ( (NULL == pcPrealigned) || (NULL == pcRepresent) ){
82 /*printf("%s/%s:%d: WARNING HMM=NULL -- didn't think I would get here (carry on, no danger)\n",
83 __FUNCTION__, __FILE__, __LINE__);*/
87 if (NULL == prHMM->p){
88 printf("%s:%s:%d: WARNING -- Background Pseudocounts point to NULL\n"
89 "\tthis is not intended - don't add background but CONTINUE\n",
90 __FUNCTION__, __FILE__, __LINE__);
95 /* FIXME: should be 0 (FS thinks) but -1 gives better results */
96 iH = iS = 0/*-1*//*+1*/;
97 while ( ('\0' != *pcS) && ('\0' != *pcH) ){
99 if ( ('-' != *pcH) && ('-' != *pcS) ){
105 * - HMM had a gap in previous position (now closed)
106 * FIXME: this does not really work */
107 if ((iH > 0) && ('-' == *(pcH-1))){
109 for (iT = 0; iT < STATE_TRANSITIONS; iT++){
110 ppfPseudoTr[iS-1][iT] = log2f(zf1SeqRevrt[iT]);
116 /* do frequencies -- this is not really useful;
117 frequencies are derived from HMM
118 and contain already pseudocounts (PCs),
119 adding frequencies and then PCs will add PCs _twice_
120 results are better than not to add them,
121 but not as good as PCs */
122 for (iA = 0; iA < AMINOACIDS; iA++){
123 fAux = fFThgiew * ppfFreq[iS][iA] + fFWeight * prHMM->f[iH][iA];
124 ppfFreq[iS][iA] = fAux;
127 /* do pseudo-counts */
128 for (iA = 0; iA < AMINOACIDS; iA++){
130 fFThgiew * ppfPseudoF[iS][iA] + fFWeight * prHMM->p[iH][iA];
131 ppfPseudoF[iS][iA] = fAux;
133 #if 0 /* do state transitions */
134 for (iT = 0; iT < STATE_TRANSITIONS; iT++){
136 /* this is a very crude method,
137 which averages the logarithms of the transitions,
138 effectively performing a geometric mean -
139 this presumably violates normalisation.
140 however, results are surprisingly good */
142 fGThgiew * ppfPseudoTr[iS][iT] +
143 fGWeight * prHMM->tr[iH][iT];
144 ppfPseudoTr[iS][iT] = fAux;
145 #else /* crude averaging */
146 /* There are 2 things to consider:
147 (1) one should really blend the probabilities of
148 the transitions, however, by default we have
149 the logarithms thereof.
150 So must exponentiate, blend, and then take log again.
151 This is expensive, and does not seem to lead to better
152 results than blending the logarithms
153 (and violating normalisation)
154 (2) transition probabilities for a single sequence
155 are really easy, there are no insert/delete transitions.
156 However, there is a begin state that is different
158 But again, this does not seem to make a blind bit
163 fGThgiew * zf1SeqTrans[iT] +
164 fGWeight * prHMM->linTr[iH][iT];
165 ppfPseudoTr[iS][iT] = log2f(fAux);
169 fGThgiew * zf1SeqInit[iT] +
170 fGWeight * prHMM->linTr[iH][iT];
171 ppfPseudoTr[iS][iT] = log2f(fAux);
173 #endif /* mixing of linear/log */
174 } /* 0 <= iT < STATE_TRANSITIONS */
175 #endif /* did state transitions */
176 } /* was Match -- neither HMM nor sequence have gap */
178 else if ('-' == *pcH){
179 /* sequence opened up gap in HMM */
181 if ((iH > 0) && ('-' != *(pcH-1)) && (iS > 0)){
182 /* this is the first gap in HMM
183 * FIXME: this does not really work */
184 for (iT = 0; iT < STATE_TRANSITIONS; iT++){
185 ppfPseudoTr[iS-1][iT] = log2f(zf1SeqDel[iT]);
189 /* do nothing, keep single sequence values exactly as they are*/
194 else if ('-' == *pcS){
195 /* here the single sequence has a gap,
196 and the HMM (as a whole) does not. There may be individual gaps
197 in the HMM at this stage. By ignoring this we say that the HMM
198 dominates the overall behaviour - as in the M2M state as well
210 } /* this is the end of AddBackgroundFrequencies() */
217 * @brief Read HMM input file or transfer alignment
218 * and add pseudocounts etc.
220 * @param[in] iRnPtype
221 * type of read/prepare
222 * enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM, INTERN_HMM_2_HMM};
226 * number of seqs in alignment
227 * @param[in,out] prHMM,
228 * [in] if sequences read/prepared, [out] if HMM from file
229 * @param[in] pcPrealigned,
230 * (single) sequence aligned to background HMM
231 * @param[in] pcRepresent,
232 * sequence representing HMM aligned to individual sequence
233 * param[in] pdExWeights
234 * (external) sequence weights, derived from tree
236 * name of file with HMM info (formerly also alignment)
238 * HMM structure with transition probabilities, residue frequencies etc
240 * FIXME: what is qali?
242 * @return FAILURE on error, OK otherwise
245 ReadAndPrepare(int iRnPtype,
246 char **ppcProf, int iCnt, hmm_light *prHMM,
247 char *pcPrealigned, char *pcRepresent, double *pdExWeights,
248 char* infile, HMM& q, Alignment* qali=NULL) {
250 //#ifndef CLUSTALO_NOFILE
253 /* NOTE: there are different scenarios:
255 (i) ("" != infile) - read HMM from file,
256 transfer frequency/transition/pseudo-count (f/t/p) info into prHMM
258 (ii) ('\0' != ppcProf[0]) - transfer sequence/alignment into q/qali,
259 don't save f/t/p into prHMM,
260 on the contrary, if prior f/t/p available then add it to q/qali,
261 this is only done if (1==iCnt)
263 (iii) ('\0' == ppcProf[0]) - re-cycle old HMM information
264 recreate a HMM from previous data
267 /********************************/
268 /*** (o) Recycle internal HMM ***/
269 /********************************/
270 if ( (INTERN_ALN_2_HMM == iRnPtype) && (iCnt <= 0) ){
272 /* NOTE: here we are writing into a HMM structure/class;
273 memory has been allocated for this in hhalign.cpp;
274 however, as iCnt<=0, there may not be memory for
275 prHMM->n_display sequences/names.
276 But then, there doesn't have to be.
277 At this stage we are just copying one HMM into another HMM,
278 sequences are irrelevant. The only sequence of (marginal)
279 interest is the consensus sequence */
280 /* FIXME: check that prHMM is valid -- how? */
282 const int ciCons = 0;
283 const int ciNoof = ciCons+1;
284 const int ciInvalid = -1;
285 q.n_display = ciNoof; /* only consensus */
288 q.nfirst = ciCons;//prHMM->nfirst;
289 q.nss_dssp = ciInvalid;//prHMM->nss_dssp;
290 q.nsa_dssp = ciInvalid;//prHMM->nsa_dssp;
291 q.nss_pred = ciInvalid;//prHMM->nss_pred;
292 q.nss_conf = ciInvalid;//prHMM->nss_conf;
294 q.N_in = prHMM->N_in;
295 q.N_filtered = prHMM->N_filtered;
296 /* NOTE: I (FS) think that only ever 1 sequence will be transferred here,
297 that is, the consensus sequence. However, we might want to allow
298 (in the future) to transfer more sequences,
299 hence the awkward for() loop */
301 for (int i = prHMM->ncons+0; i < prHMM->ncons+q.n_display; i++){
302 /* NOTE: In the original hhalign code the first position
303 is kept open ('\0'). This makes it difficult to use
304 string functions like strlen/strdup etc.
305 Insert a temporary gap (.) to facilitate string operations */
306 char cHead = prHMM->seq[i][0];
307 prHMM->seq[i][0] = '.';
308 q.seq[i] = strdup(prHMM->seq[i]);
309 prHMM->seq[i][0] = q.seq[i][0] = cHead;
313 char cHead = prHMM->seq[prHMM->ncons][0];
314 prHMM->seq[prHMM->ncons][0] = '.';
315 q.seq[q.ncons] = strdup(prHMM->seq[prHMM->ncons]);
316 prHMM->seq[prHMM->ncons][0] = q.seq[q.ncons][0] = cHead;
319 for (int i = 0; i < prHMM->L+1; i++){
320 q.Neff_M[i] = prHMM->Neff_M[i];
321 q.Neff_I[i] = prHMM->Neff_I[i];
322 q.Neff_D[i] = prHMM->Neff_D[i];
324 q.Neff_HMM = prHMM->Neff_HMM;
325 /* skip longname,name,file,fam,sfam,fold,cl */
326 q.lamda = prHMM->lamda;
329 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
330 rShadow.copyShadowToHMM(q, *prHMM);
332 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
333 /* pav already done in copyShadowToHMM */
339 } /* INTERN_ALN_2_HMM && iCnt<=0 */
341 /******************************/
342 /*** (i) Read HMM from file ***/
343 /******************************/
344 char line[LINELEN] = {0}; // input line
346 //if ( (0 != strcmp(infile,"")) /*&& (iCnt > 0)*/ )
347 if ( (READ_HMM_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype)) {
349 if (0 == strcmp(infile,"")){
351 "\texpected to re %s from file but no file specified\n"
353 , __FUNCTION__, __FILE__, __LINE__
354 , (READ_HMM_2_HMM==iRnPtype?"HMM":"alignment"));
357 inf = fopen(infile, "r");
358 if (!inf) OpenFileError(infile);
359 Pathname(path,infile);
364 //if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name);
368 fgetline(line,LINELEN-1,inf);
371 //if ( (0 != strcmp(infile,"")) && (iCnt > 0) )
372 if ( (READ_HMM_2_HMM == iRnPtype) ){
374 if (0 == strcmp(infile,"")){
375 printf("%s:%s:%d: expected to read HMM from file but no file-name\n",
376 __FUNCTION__, __FILE__, __LINE__);
379 // Is infile a HMMER3 file? /* MR1 */
380 if (!strncmp(line,"HMMER3",6))
383 cout<<"Query file is in HMMER3 format\n";
384 cout<<"WARNING! Use of HMMER3 format as input results in dramatically loss of sensitivity!\n";
387 // Read 'query HMMER file
389 q.ReadHMMer3(inf,path);
391 // Don't add transition pseudocounts to query!!
393 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
394 q.PreparePseudocounts();
396 // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a]
397 q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
399 /* further down there is a q.Log2LinTransitionProbs
400 but only if (iCnt>0), however, we still need it it here (i think),
401 there is no danger of doing this twice, as trans_lin is checked
402 FIXME (FS, 2011-01-12) */
403 /* further down there is a q.Log2LinTransitionProbs
404 but only if (iCnt>0), however, we still need it it here (i think),
405 there is no danger of doing this twice, as trans_lin is checked
406 FIXME (FS, 2011-01-12) */
407 //if (par.forward >= 1)
409 q.Log2LinTransitionProbs(1.0);
413 // ... or Is infile an old HMMER file?
414 else if (!strncmp(line,"HMMER",5)) {
416 cout<<"Query file is in HMMER format\n";
417 cout<<"WARNING! Use of HMMER format as input results in dramatically loss of sensitivity!\n";
420 // Read 'query HMMER file
421 q.ReadHMMer(inf,path);
422 if (v>=2 && q.Neff_HMM>11.0)
423 fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);
425 // Don't add transition pseudocounts to query!!
427 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
428 q.PreparePseudocounts();
430 // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a]
431 q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
433 /* further down there is a q.Log2LinTransitionProbs
434 but only if (iCnt>0), however, we still need it it here (i think),
435 there is no danger of doing this twice, as trans_lin is checked
436 FIXME (FS, 2011-01-12) */
437 //if (par.forward >= 1)
439 q.Log2LinTransitionProbs(1.0);
442 } /* it was a HMMer file */
444 // ... or is it an hhm file?
445 else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) {
447 if (v>=2) cout<<"Query file is in HHM format\n";
449 // Rewind to beginning of line and read query hhm file
452 if (v>=2 && q.Neff_HMM>11.0)
453 fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM);
455 // Add transition pseudocounts to query -> q.p[i][a]
456 q.AddTransitionPseudocounts();
458 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
459 q.PreparePseudocounts();
461 // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
462 q.AddAminoAcidPseudocounts();
464 } /* it was a HHM file */
466 fprintf(stderr, "%s:%s:%d: Unknown HMM format in infile\n"
467 "infile=%s, #seq=%d\n"
468 , __FUNCTION__, __FILE__, __LINE__
475 /*** transfer class info to struct */
476 prHMM->n_display = q.n_display;
478 prHMM->seq = (char **)calloc((q.n_display+1), sizeof(char *));
479 /* FIXME valgrind says bytes get lost in the above calloc during
482 for (int i = 0; i < q.n_display; i++){
483 /* NOTE: In the original hhalign code the first position
484 is kept open ('\0'). This makes it difficult to use
485 string functions like strlen/strdup etc.
486 Insert a temporary gap (.) to facilitate string operations */
487 char cHead = q.seq[i][0];
489 prHMM->seq[i] = strdup(q.seq[i]);
490 q.seq[i][0] = prHMM->seq[i][0] = cHead;
492 prHMM->ncons = q.ncons;
493 prHMM->nfirst = q.nfirst;
494 prHMM->nss_dssp = q.nss_dssp;
495 prHMM->nsa_dssp = q.nsa_dssp;
496 prHMM->nss_pred = q.nss_pred;
497 prHMM->nss_conf = q.nss_conf;
499 prHMM->N_in = q.N_in;
500 prHMM->N_filtered = q.N_filtered;
501 prHMM->Neff_M = (float *)calloc(prHMM->L+1, sizeof(float));
502 prHMM->Neff_I = (float *)calloc(prHMM->L+1, sizeof(float));
503 prHMM->Neff_D = (float *)calloc(prHMM->L+1, sizeof(float));
504 prHMM->Neff_HMM = q.Neff_HMM;
505 /* skip longname,name,file,fam,sfam,fold,cl */
506 prHMM->lamda = q.lamda;
509 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
510 rShadow.copyHMMtoShadow(q);
512 prHMM->f = (float **)calloc(prHMM->L+1, sizeof(float *));
513 prHMM->g = (float **)calloc(prHMM->L+1, sizeof(float *));
514 prHMM->p = (float **)calloc(prHMM->L+1, sizeof(float *));
515 prHMM->tr = (float **)calloc(prHMM->L+1, sizeof(float *));
516 prHMM->linTr = (float **)calloc(prHMM->L+1, sizeof(float *));
517 for (int i = 0; i < prHMM->L+1; i++){
518 prHMM->f[i] = (float *)calloc(AMINOACIDS, sizeof(float));
519 prHMM->g[i] = (float *)calloc(AMINOACIDS, sizeof(float));
520 prHMM->p[i] = (float *)calloc(AMINOACIDS, sizeof(float));
521 for (int j = 0; j < AMINOACIDS; j++){
522 prHMM->f[i][j] = (float)rShadow.f[i][j];
523 prHMM->g[i][j] = (float)rShadow.g[i][j];
524 prHMM->p[i][j] = (float)rShadow.p[i][j];
526 prHMM->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
527 prHMM->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
528 for (int j = 0; j< STATE_TRANSITIONS; j++){
529 prHMM->tr[i][j] = (float)rShadow.tr[i][j];
530 prHMM->linTr[i][j] = fpow2(rShadow.tr[i][j]);
532 } /*0 <= i < prHMM->L+1 */
533 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
534 for (int j = 0; j < AMINOACIDS; j++){
535 prHMM->pav[j] = (float)rShadow.pav[j];
539 } /* have read HHM from file */
540 /*else if ( ((NULL != ppcProf) && (iCnt > 0) && ('\0' != ppcProf[0][0])) ||
541 ( (0 != strcmp(infile,"") && (0 == iCnt) )) )*/
542 else if ( (INTERN_ALN_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype) ) {
544 if ( (INTERN_ALN_2_HMM == iRnPtype) &&
545 ( (NULL == ppcProf) || (iCnt <= 0) || ('\0' == ppcProf[0][0]) ) ){
546 printf("%s:%s:%d: was expecting internal alignment but\n"
547 "\tppcProf=%p, #seq=%d, ppcProf[0][0]=%c\n"
548 , __FUNCTION__, __FILE__, __LINE__
549 , ppcProf, iCnt, ppcProf[0][0]);
552 else if ( (READ_ALN_2_HMM == iRnPtype) &&
553 (0 == strcmp(infile,"")) ){
554 printf("%s:%s:%d: was expecting to read alignment from file but no filename\n"
555 , __FUNCTION__, __FILE__, __LINE__);
559 /*******************************/
560 /*** (ii) it is an alignment ***/
561 /*******************************/
562 /* transfer alignment information from clustal character array
563 into pali/q/t classes */
565 * NOTE that emissions in HMMer file format contain pseudo-counts.
566 * HHM file format does not contain emission pseudo-counts.
567 * the structure that stores background HMM information does contain pseudo-counts
571 pali=new Alignment(iCnt); /* FIXME: pass in iCnt to get rid of MAXSEQ */
578 printf("\nError in %s: only HHM files can be calibrated.\n",program_name);
579 printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile);
583 if (v>=2 && strcmp(infile,"stdin")) cout<<infile<<" is in A2M, A3M or FASTA format\n";
585 /* Read alignment from infile into matrix X[k][l] as ASCII
586 (and supply first line as extra argument) */
588 if (INTERN_ALN_2_HMM == iRnPtype){
589 pali->Transfer(ppcProf, iCnt);
591 //else if (0 == iCnt)
592 else if (READ_ALN_2_HMM == iRnPtype){
593 pali->Read(inf, infile, line);
596 printf("%s:%s:%d: FATAL problem\n"
597 "infile = (%s), #seq = %d\n"
598 , __FUNCTION__, __FILE__, __LINE__
603 /* Convert ASCII to int (0-20), throw out all insert states,
604 record their number in I[k][i]
605 and store marked sequences in name[k] and seq[k] */
606 pali->Compress(infile);
608 /* Sort out the nseqdis most dissimilar sequences for display
609 in the output alignments */
610 pali->FilterForDisplay(par.max_seqid, par.coverage, par.qid,
611 par.qsc,par.nseqdis);
613 // Filter alignment for min score per column with core query profile, defined by coverage_core and qsc_core
614 //if (par.coresc>-10) pali->HomologyFilter(par.coverage_core, par.qsc_core, par.coresc);
616 /* Remove sequences with seq. identity larger than seqid percent
617 (remove the shorter of two) */
618 pali->N_filtered = pali->Filter(par.max_seqid, par.coverage,
619 par.qid, par.qsc, par.Ndiff);
621 /* Calculate pos-specific weights,
622 AA frequencies and transitions -> f[i][a], tr[i][a] */
623 pali->FrequenciesAndTransitions(q);
624 if (v>=2 && q.Neff_HMM>11.0)
625 fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM);
627 /*printf("%d %d %f %d (N,Nf,Neff,L) %s:%s:%d\n"
628 , q.N_in, q.N_filtered, q.Neff_HMM, q.L, __FUNCTION__, __FILE__, __LINE__);*/
630 // Add transition pseudocounts to query -> p[i][a]
631 q.AddTransitionPseudocounts();
633 /* Generate an amino acid frequency matrix from f[i][a]
634 with full pseudocount admixture (tau=1) -> g[i][a] */
635 q.PreparePseudocounts();
637 /* Add amino acid pseudocounts to query:
638 p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] */
639 q.AddAminoAcidPseudocounts();
641 /* ****** add aligned background pseudocounts ***** */
642 HMMshadow rShadowQ = {0};
643 rShadowQ.copyHMMtoShadow(q);
645 AddBackgroundFrequencies(rShadowQ.f, rShadowQ.p, rShadowQ.tr,
647 q.seq, pcPrealigned, iCnt, pcRepresent);
651 delete(pali); pali = NULL;
654 } /* else if (NULL != ppcProf) // not hmmer/hhalign but alignment */
656 //else if ((prHMM->L > 0) && ('\0' == ppcProf[0][0]))
657 else if (INTERN_HMM_2_HMM == iRnPtype){
659 /******************************************/
660 /*** (iii) re-cycle old HMM information ***/
661 /******************************************/
664 printf("%s:%s:%d: was expecting to copy HMM structure but L=%d\n"
665 , __FUNCTION__, __FILE__, __LINE__, prHMM->L);
668 printf("%s:%s:%d: RE-CYCLING HMM\n", __FUNCTION__, __FILE__, __LINE__);
671 q.n_display = prHMM->n_display;
673 for (int i = 0; i < q.n_display; i++){
674 /* NOTE: In the original hhalign code the first position
675 is kept open ('\0'). This makes it difficult to use
676 string functions like strlen/strdup etc.
677 Insert a temporary gap (.) to facilitate string operations */
678 char cHead = prHMM->seq[i][0];
679 prHMM->seq[i][0] = '.';
680 q.seq[i] = strdup(prHMM->seq[i]);
681 q.seq[i][0] = prHMM->seq[i][0] = cHead;
683 q.nfirst = prHMM->nfirst;
687 char cHead = prHMM->seq[prHMM->nfirst][0];
688 prHMM->seq[prHMM->nfirst][0] = '.';
689 q.seq[0] = strdup(prHMM->seq[prHMM->nfirst]);
690 q.seq[q.n_display-1][0] = prHMM->seq[prHMM->nfirst][0] = cHead;
692 q.ncons = prHMM->ncons;
693 q.nss_dssp = prHMM->nss_dssp;
694 q.nsa_dssp = prHMM->nsa_dssp;
695 q.nss_pred = prHMM->nss_pred;
696 q.nss_conf = prHMM->nss_conf;
698 q.N_in = prHMM->N_in;
699 q.N_filtered = prHMM->N_filtered;
700 #define NEFF_SCORE 10 /* FIXME: Magic Number */
701 /*for (int i; i < prHMM->L+1; i++){
702 q.Neff_M[i] = q.Neff_I[i] = q.Neff_D[i] = NEFF_SCORE;
704 q.Neff_HMM = prHMM->Neff_HMM;
705 /* skip longname,name,file,fam,sfam,fold,cl */
706 q.lamda = prHMM->lamda;
709 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
710 rShadow.copyShadowToHMM(q, *prHMM);
715 if (par.forward>=1) q.Log2LinTransitionProbs(1.0);
724 } /*** end: ReadAndPrepare() ***/
734 FreeHMMstruct(hmm_light *prHMM)
738 if (NULL != prHMM->f){
739 for (i = 0; i < prHMM->L+1; i++){
740 if (NULL != prHMM->f[i]){
741 free(prHMM->f[i]); prHMM->f[i] = NULL;
743 } /* 0 <= i < prHMM->L+1 */
744 free(prHMM->f); prHMM->f = NULL;
746 if (NULL != prHMM->g){
747 for (i = 0; i < prHMM->L+1; i++){
748 if (NULL != prHMM->g[i]){
749 free(prHMM->g[i]); prHMM->g[i] = NULL;
751 } /* 0 <= i < prHMM->L+1 */
752 free(prHMM->g); prHMM->g = NULL;
754 if (NULL != prHMM->p){
755 for (i = 0; i < prHMM->L+1; i++){
756 if (NULL != prHMM->p[i]){
757 free(prHMM->p[i]); prHMM->p[i] = NULL;
759 } /* 0 <= i < prHMM->L+1 */
760 free(prHMM->p); prHMM->p = NULL;
762 if (NULL != prHMM->tr){
763 for (i = 0; i < prHMM->L+1; i++){
764 if (NULL != prHMM->tr[i]){
765 free(prHMM->tr[i]); prHMM->tr[i] = NULL;
767 } /* 0 <= i < prHMM->L+1 */
768 free(prHMM->tr); prHMM->tr = NULL;
770 if (NULL != prHMM->linTr){
771 for (i = 0; i < prHMM->L+1; i++){
772 if (NULL != prHMM->linTr[i]){
773 free(prHMM->linTr[i]); prHMM->linTr[i] = NULL;
775 } /* 0 <= i < prHMM->L+1 */
776 free(prHMM->linTr); prHMM->linTr = NULL;
779 if (NULL != prHMM->Neff_M){
780 free(prHMM->Neff_M); prHMM->Neff_M = NULL;
782 if (NULL != prHMM->Neff_I){
783 free(prHMM->Neff_I); prHMM->Neff_I = NULL;
785 if (NULL != prHMM->Neff_D){
786 free(prHMM->Neff_D); prHMM->Neff_D = NULL;
789 if (NULL != prHMM->seq){
790 for (i = 0; i < prHMM->n_display; i++){
791 if (NULL != prHMM->seq[i]){
792 free(prHMM->seq[i]); prHMM->seq[i] = NULL;
795 free(prHMM->seq); prHMM->seq = NULL;
798 memset(prHMM, 0, sizeof(hmm_light));
800 } /*** end: FreeHMMstruct() ***/
804 * @brief comparisin function for ascending qsort() of doubles
807 CompDblAsc(const void *pv1, const void *pv2){
809 double d1 = *(double *)pv1;
810 double d2 = *(double *)pv2;
812 if (d1 > d2) { return +1; }
813 else if (d1 < d2) { return -1; }
816 } /*** end: CompDblAsc() ***/
822 * @brief convert alignment into HMM (hhmake)
825 * structure with pseudocounts etc
826 * @param[in] pcHMM_input
827 * name of file with HMM
831 AlnToHMM2(hmm_light *rHMM_p, hhalign_para rHhalignPara,
832 char **ppcSeq, int iN){
834 SetDefaults(rHhalignPara);
835 SetSubstitutionMatrix();
840 const int ciGoodMeasureSeq = 10;
841 int iAuxLen = strlen(ppcSeq[0]);
842 par.maxResLen = iAuxLen;
843 par.maxResLen += ciGoodMeasureSeq;
844 par.maxColCnt = iAuxLen + ciGoodMeasureSeq;
845 par.max_seqid=DEFAULT_FILTER;
846 par.loc=0; par.mact=0;
847 /* some minor parameters, affecting alignment (i think) */
848 par.p = 0.0; /* minimum threshold for inclusion in hit list */
849 par.Z = 100; /* minimum threshold for inclusion in hit list and alignment listing */
850 par.z = 1; /* min number of lines in hit list */
851 par.B = 100; /* max number of alignments */
852 par.b = 1; /* min number of alignments */
853 par.E = 1e6; /* maximum threshold for inclusion in hit list and alignment listing */
854 par.altali=1;par.hitrank=0;par.showcons=1; par.showdssp=1;par.showpred=1;par.nseqdis=iN;par.cons=1;
857 const int ciGoodMeasure = 10;
858 int iLen = strlen(ppcSeq[0]) + ciGoodMeasure;
859 if (iLen > par.maxResLen){
860 par.maxResLen = par.maxColCnt = iLen;
862 HMM rTemp(iN+2, iLen); /* temporary template */
863 Alignment rTempAli(iN+2, iLen); /* temporary alignment */
864 int iParCons = par.cons;
866 /*printf(">>>>>>>>>>> %s:%s:%d: there are %d sequences\n", __FUNCTION__, __FILE__, __LINE__, iN);*/
869 if (OK != ReadAndPrepare(INTERN_ALN_2_HMM,
871 NULL/*prealigned*/, NULL/*representative*/, NULL/*weights*/, //YES/*transfer*/,
872 (char*)("")/*in-file*/, rTemp, &rTempAli)) {
878 /*** transfer class info to struct */
879 rHMM_p->n_display = rTemp.n_display;
880 rHMM_p->sname = NULL;
881 rHMM_p->seq = (char **)calloc((rTemp.n_display+1), sizeof(char *));
883 for (int i = 0; i < rTemp.n_display; i++){
884 /* NOTE: In the original hhalign code the first position
885 is kept open ('\0'). This makes it difficult to use
886 string functions like strlen/strdup etc.
887 Insert a temporary gap (.) to facilitate string operations */
888 char cHead = rTemp.seq[i][0];
889 rTemp.seq[i][0] = '.';
890 rHMM_p->seq[i] = strdup(rTemp.seq[i]);
891 rTemp.seq[i][0] = rHMM_p->seq[i][0] = cHead;
893 rHMM_p->ncons = rTemp.ncons;
894 rHMM_p->nfirst = rTemp.nfirst;
895 if (-1 == rHMM_p->ncons){
896 /* ncons needed later for alignment of
897 representative sequence and copy of profile.
898 ncons not always set (-1 default),
899 this will cause segmentation fault.
900 nfirst (probably) right index -
902 rHMM_p->ncons = rTemp.nfirst;
904 rHMM_p->nss_dssp = rTemp.nss_dssp;
905 rHMM_p->nsa_dssp = rTemp.nsa_dssp;
906 rHMM_p->nss_pred = rTemp.nss_pred;
907 rHMM_p->nss_conf = rTemp.nss_conf;
909 rHMM_p->N_in = rTemp.N_in;
910 rHMM_p->N_filtered = rTemp.N_filtered;
911 #define NEFF_SCORE 10 /* FIXME: Magic Number */ //// get rid of that, FS, 2010-12-22
912 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float));
913 rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float));
914 rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float));
915 for (int i = 0; i < rHMM_p->L+1; i++){
916 rHMM_p->Neff_M[i] = rHMM_p->Neff_I[i] = rHMM_p->Neff_D[i] = NEFF_SCORE; //// get rid of that, FS, 2010-12-22
918 rHMM_p->Neff_HMM = rTemp.Neff_HMM;
919 /* skip longname,name,file,fam,sfam,fold,cl */
920 rHMM_p->lamda = rTemp.lamda;
921 rHMM_p->mu = rTemp.mu;
923 HMMshadow rShadow = {0}; /* friend of HMM to access private members */
924 rShadow.copyHMMtoShadow(rTemp);
926 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float));
927 rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float));
928 rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float));
929 rHMM_p->f = (float **)calloc(rHMM_p->L+1, sizeof(float *));
930 rHMM_p->g = (float **)calloc(rHMM_p->L+1, sizeof(float *));
931 rHMM_p->p = (float **)calloc(rHMM_p->L+1, sizeof(float *));
932 rHMM_p->tr = (float **)calloc(rHMM_p->L+1, sizeof(float *));
933 rHMM_p->linTr = (float **)calloc(rHMM_p->L+1, sizeof(float *));
935 for (int i = 0; i < rHMM_p->L+1; i++){
936 rHMM_p->Neff_M[i] = (float)rShadow.Neff_M[i];
937 rHMM_p->Neff_I[i] = (float)rShadow.Neff_I[i];
938 rHMM_p->Neff_D[i] = (float)rShadow.Neff_D[i];
939 rHMM_p->f[i] = (float *)calloc(AMINOACIDS, sizeof(float));
940 rHMM_p->g[i] = (float *)calloc(AMINOACIDS, sizeof(float));
941 rHMM_p->p[i] = (float *)calloc(AMINOACIDS, sizeof(float));
942 for (int j = 0; j < AMINOACIDS; j++){
943 rHMM_p->f[i][j] = (float)rShadow.f[i][j];
944 rHMM_p->g[i][j] = (float)rShadow.g[i][j];
945 rHMM_p->p[i][j] = (float)rShadow.p[i][j];
947 rHMM_p->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
948 rHMM_p->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float));
949 for (int j = 0; j< STATE_TRANSITIONS; j++){
950 rHMM_p->tr[i][j] = (float)rShadow.tr[i][j];
951 rHMM_p->linTr[i][j] = fpow2(rShadow.tr[i][j]);
953 } /*0 <= i < prHMM->L+1 */
954 /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */
955 for (int j = 0; j < AMINOACIDS; j++){
956 rHMM_p->pav[j] = (float)rShadow.pav[j];
964 rTemp.ClobberGlobal();
965 rTempAli.ClobberGlobal();
969 } /*** end of AlnToHMM2() ***/
975 * @brief turn alignment (from file) into HHM (HMM) on file
977 * @param[out] hmm_out
978 * name of file with HMM info corresponding to tmp_aln
980 * name of file with alignment, to be turned into HMM (HHM)
984 HHMake_Wrapper(char *tmp_aln, char *hmm_out)
987 HMM rTemp; /* temporary template */
988 Alignment rTempAli; /* temporary alignment */
989 hmm_light *rHMM_p = NULL;
992 this is a wrapper for a stand-alone program hhmake.
993 hhmake uses a different set of parameters than hhalign.
994 However, all parameters are GLOBAL.
995 So at this point we register the hhalign settings,
996 reset them to hhmake settings and set them back
997 at the end of the function
1000 /* save settings from hhalign */
1001 int iParShowcons=par.showcons;
1002 int iParAppend=par.append;
1003 int iParNseqdis=par.nseqdis;
1004 int iParMark=par.mark;
1005 int iParMaxSeqid=par.max_seqid;
1006 int iParQid=par.qid;
1007 double dParQsc=par.qsc;
1008 int iParCoverage=par.coverage;
1009 int iParNdiff=par.Ndiff;
1010 int iParCoverageCore=par.coverage_core;
1011 double dParQscCore=par.qsc_core;
1012 double dParCoresc=par.coresc;
1014 int iParMgaps=par.Mgaps;
1015 int iParMatrix=par.matrix;
1016 int iParPcm=par.pcm;
1017 double dParPcw=par.pcw;
1018 double dParGapb=par.gapb;
1021 /* these are settings suitable for hhmake */
1022 par.showcons=1; // write consensus sequence into hhm file
1023 par.append=0; // overwrite output file
1024 par.nseqdis=10; // maximum number of query or template sequences to be recoreded in HMM and diplayed in output alignments
1025 par.mark=0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed
1026 par.max_seqid=90; // default for maximum sequence identity threshold
1027 par.qid=0; // default for maximum sequence identity threshold
1028 par.qsc=-20.0f; // default for minimum score per column with query
1029 par.coverage=0; // default for minimum coverage threshold
1030 par.Ndiff=100; // pick Ndiff most different sequences from alignment
1031 par.coverage_core=30; // Minimum coverage for sequences in core alignment
1032 par.qsc_core=0.3f; // Minimum score per column with core alignment (HMM)
1033 par.coresc=-20.0f; // Minimum score per column with core alignment (HMM)
1034 par.M=1; // match state assignment is by A2M/A3M
1035 par.Mgaps=50; // above this percentage of gaps, columns are assigned to insert states
1036 par.matrix=0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62
1037 par.pcm=0; // no amino acid and transition pseudocounts added
1038 par.pcw=0; // wc>0 weighs columns according to their intra-clomun similarity
1039 par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts!
1040 par.wg=0; // 0: use local sequence weights 1: use local ones
1043 if (OK != ReadAndPrepare(READ_ALN_2_HMM,
1044 NULL, 0, rHMM_p, NULL, NULL, NULL,
1045 tmp_aln, rTemp, &rTempAli)) {
1049 rTemp.WriteToFile(hmm_out);
1053 /* restore settings from hhalign */
1054 par.showcons=iParShowcons;
1055 par.append=iParAppend;
1056 par.nseqdis=iParNseqdis;
1058 par.max_seqid=iParMaxSeqid;
1061 par.coverage=iParCoverage;
1062 par.Ndiff=iParNdiff;
1063 par.coverage_core=iParCoverageCore;
1064 par.qsc_core=dParQscCore;
1065 par.coresc=dParCoresc;
1067 par.Mgaps=iParMgaps;
1068 par.matrix=iParMatrix;
1076 rTemp.ClobberGlobal();
1077 rTempAli.ClobberGlobal();
1085 * @brief read HMM from file, copy (relevant) info into struct
1088 * structure with pseudocounts etc, probably uninitialised on entry
1089 * @param[in] pcHMM_input
1090 * name of file with HMM
1094 readHMMWrapper(hmm_light *rHMM_p,
1097 par.maxResLen = 15002;
1098 HMM rTemp(1000,par.maxResLen); /* temporary template */
1099 Alignment rTempAli; /* temporary alignment */
1100 /* AW changed init from {0} to 0 because it failed to compile with
1101 * gcc 4.3.3 with the following error:
1102 * error: braces around initializer for non-aggregate type
1104 /* FS taken out initialiser alltogether */
1106 /* 0th arg of RnP is the type of RnP, ie,
1107 enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};*/
1108 /* 1st arg of ReadAndPrepare() is ppcSeqs, 2nd is #seq */
1109 /* FIXME: at this stage the 3rd arg, rHMM_p, only acts as a dummy.
1110 this is rather silly, as it is the actual struct that will
1111 carry the HMM info at the end.
1112 If we write it already in ReadAndPrepare() then we could
1113 dispense with all this friend-class nonsense */
1114 if (OK != ReadAndPrepare(READ_HMM_2_HMM,
1115 NULL, 0, rHMM_p, NULL, NULL, NULL,
1116 pcHMM_input, rTemp, &rTempAli)) {
1120 /* an important piece of information I want to get out of here
1121 is the consenssus sequence. there are however certain
1122 Pfam HMMs that don't trigger consensus calculation.
1123 at the moment I (FS) don't understand why this is
1124 (or rather why this is not). The proper place to do this
1125 should be inside ReadAndPrepare():ReadHMMer(), but
1126 I have not quite penetrated the logic there.
1127 Therefore I try to catch this condition at this point (here)
1130 if (-1 == rHMM_p->ncons){
1132 rHMM_p->ncons = rHMM_p->nfirst;
1134 for (i = 0; i < rHMM_p->L; i++){
1135 double dPmax = 0.00;
1137 for (iA = 0; iA < AMINOACIDS; iA++){
1138 if (rHMM_p->f[i][iA] > dPmax){
1140 dPmax = rHMM_p->f[i][iA];
1142 } /* (0 <= iA < AMINOACIDS) */
1143 rHMM_p->seq[rHMM_p->ncons][i] = i2aa(iAmax);
1144 } /* (0 <= i < rHMM_p->L) */
1145 rHMM_p->seq[rHMM_p->ncons][i] = '\0'; /* FS, r291 -> */
1146 } /* ncons not set */
1149 rTemp.ClobberGlobal();
1150 rTempAli.ClobberGlobal();
1154 } /*** end: readHMMWrapper() ***/
1161 /////////////////////////////////////////////////////////////////////////////
1163 * @brief Do precalculations for q and t to prepare comparison
1166 PrepareTemplate(HMM& q, HMM& t, int format)
1168 if (format==0) // HHM format
1170 // Add transition pseudocounts to template
1171 t.AddTransitionPseudocounts();
1173 // Generate an amino acid frequency matrix from t.f[i][a] with max pseudocounts (tau=1) -> t.g[i][a]
1174 t.PreparePseudocounts();
1176 // Add amino acid pseudocounts to template: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1177 t.AddAminoAcidPseudocounts();
1179 else // HHMER format
1181 // Don't add transition pseudocounts to template
1182 // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0);
1184 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1185 t.PreparePseudocounts();
1187 // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a]
1188 t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc);
1191 // Modify transition probabilities to include SS-dependent penalties
1192 if (par.ssgap) t.UseSecStrucDependentGapPenalties();
1194 if (par.forward>=1) t.Log2LinTransitionProbs(1.0);
1196 // Factor Null model into HMM t
1197 // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p
1198 t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template
1204 /*** end of hhfunc-C.h ***/