/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /********************************************************************* * Clustal Omega - Multiple sequence alignment * * Copyright (C) 2010 University College Dublin * * Clustal-Omega is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This file is part of Clustal-Omega. * ********************************************************************/ /* * RCS $Id: hhfunc-C.h 256 2011-06-23 13:57:28Z fabian $ */ /* * Changelog: Michael Remmert made changes to hhalign stand-alone code * FS implemented some of the changes on 2010-11-11 -> MR1 * * did not incorporate SS prediction PSIpred (yet); functions are: * CalculateSS(3), */ // hhfunc.C /** * AddBackgroundFrequencies() * * @brief add background frequencies (derived from HMM) to * sequence/profile * * @param[in,out] ppfFreq, * [in] residue frequencies of sequence/profile, * [out] overlayed with HMM background frequencies * @param[in,out] ppfPseudoF, * [in] residue frequencies+pseudocounts of sequence/profile, * [out] overlayed with HMM background frequencies+pseudocounts * @param[in] iSeqLen, * length of sequence/profile (not aligned to HMM) * @pram[in] prHMM, * background HMM * @param[in] ppcSeq, * sequences/profile to be 'softened' * @param[in] pcPrealigned, * sequence aligned to HMM, this is not quite a consensus, * it is identical to 1st sequence but over-writes gap information, * if other sequences in profile have (non-gap) residues * @param[in] iPreCnt * number of sequences pre-aligned (pcPrealigned is 'consensus' of these sequences) * @param[in] pcRepresent * sequence representative of HMM, aligned to pcSeq0 */ void AddBackgroundFrequencies(float **ppfFreq, float **ppfPseudoF, float **ppfPseudoTr, int iSeqLen, hmm_light *prHMM, char **ppcSeq, char *pcPrealigned, int iPreCnt, char *pcRepresent) { char *pcS = pcPrealigned; /* current residue in pre-aligned sequence */ char *pcH = pcRepresent; /* current residue in pre-aligned HMM */ int iS = 0; /* position in un-aligned sequence (corresponds to pcS) */ int iH = 0; /* position in un-aligned HMM (corresponds to pcH) */ int iA; /* residue iterator */ int iT; /* transition state iterator */ float fFWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud frequencies */ /* FIXME: tune value, 0.50 default */ //float fFWeight = 0.75; float fFThgiew = UNITY - fFWeight; /* weight of 'true' frequencies */ float fGWeight = 0.50 / sqrt((float)(iPreCnt)); /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */ //float fGWeight = 0.50 /*/ (float)(iPreCnt)*/; /* weight of backgroud transitions */ /* FIXME: tune value, 0.50 default */ float fGThgiew = UNITY - fGWeight; /* weight of 'true' transitions */ float fAux; /* zf1SeqTrans[] are the (phenomenological) transition probabilities (+ pseudo-counts) for a single sequence. It is almost certain (0.99) to stay in a match state, and very unlikely (0.05) to go from match to insertion/deletion */ float zf1SeqTrans[] = {0.98, 0.01, 0.01, 0.25, 0.75, 0.25, 0.75}; float zf1SeqInit[] = {0.98, 0.01, 0.01, 0.99, 0.01, 0.99, 0.01}; float zf1SeqDel[] = {0.24, 0.01, 0.75, 0.01, 0.01, 0.01, 0.01}; float zf1SeqRevrt[] = {0.98, 0.01, 0.01, 0.01, 0.01, 0.99, 0.01}; if ( (NULL == pcPrealigned) || (NULL == pcRepresent) ){ /*printf("%s/%s:%d: WARNING HMM=NULL -- didn't think I would get here (carry on, no danger)\n", __FUNCTION__, __FILE__, __LINE__);*/ return; } if (NULL == prHMM->p){ printf("%s:%s:%d: WARNING -- Background Pseudocounts point to NULL\n" "\tthis is not intended - don't add background but CONTINUE\n", __FUNCTION__, __FILE__, __LINE__); return; } /* FIXME: should be 0 (FS thinks) but -1 gives better results */ iH = iS = 0/*-1*//*+1*/; while ( ('\0' != *pcS) && ('\0' != *pcH) ){ if ( ('-' != *pcH) && ('-' != *pcS) ){ iH++; iS++; #if 0 /* match state * - HMM had a gap in previous position (now closed) * FIXME: this does not really work */ if ((iH > 0) && ('-' == *(pcH-1))){ for (iT = 0; iT < STATE_TRANSITIONS; iT++){ ppfPseudoTr[iS-1][iT] = log2f(zf1SeqRevrt[iT]); } } #endif #if 1 /* do frequencies -- this is not really useful; frequencies are derived from HMM and contain already pseudocounts (PCs), adding frequencies and then PCs will add PCs _twice_ results are better than not to add them, but not as good as PCs */ for (iA = 0; iA < AMINOACIDS; iA++){ fAux = fFThgiew * ppfFreq[iS][iA] + fFWeight * prHMM->f[iH][iA]; ppfFreq[iS][iA] = fAux; } #endif /* do pseudo-counts */ for (iA = 0; iA < AMINOACIDS; iA++){ fAux = fFThgiew * ppfPseudoF[iS][iA] + fFWeight * prHMM->p[iH][iA]; ppfPseudoF[iS][iA] = fAux; } #if 0 /* do state transitions */ for (iT = 0; iT < STATE_TRANSITIONS; iT++){ #if 1 /* this is a very crude method, which averages the logarithms of the transitions, effectively performing a geometric mean - this presumably violates normalisation. however, results are surprisingly good */ fAux = fGThgiew * ppfPseudoTr[iS][iT] + fGWeight * prHMM->tr[iH][iT]; ppfPseudoTr[iS][iT] = fAux; #else /* crude averaging */ /* There are 2 things to consider: (1) one should really blend the probabilities of the transitions, however, by default we have the logarithms thereof. So must exponentiate, blend, and then take log again. This is expensive, and does not seem to lead to better results than blending the logarithms (and violating normalisation) (2) transition probabilities for a single sequence are really easy, there are no insert/delete transitions. However, there is a begin state that is different from the main body. But again, this does not seem to make a blind bit of difference */ if (iS > 0){ fAux = fGThgiew * zf1SeqTrans[iT] + fGWeight * prHMM->linTr[iH][iT]; ppfPseudoTr[iS][iT] = log2f(fAux); } else { fAux = fGThgiew * zf1SeqInit[iT] + fGWeight * prHMM->linTr[iH][iT]; ppfPseudoTr[iS][iT] = log2f(fAux); } #endif /* mixing of linear/log */ } /* 0 <= iT < STATE_TRANSITIONS */ #endif /* did state transitions */ } /* was Match -- neither HMM nor sequence have gap */ else if ('-' == *pcH){ /* sequence opened up gap in HMM */ #if 0 if ((iH > 0) && ('-' != *(pcH-1)) && (iS > 0)){ /* this is the first gap in HMM * FIXME: this does not really work */ for (iT = 0; iT < STATE_TRANSITIONS; iT++){ ppfPseudoTr[iS-1][iT] = log2f(zf1SeqDel[iT]); } } else { /* do nothing, keep single sequence values exactly as they are*/ } #endif iS++; } else if ('-' == *pcS){ /* here the single sequence has a gap, and the HMM (as a whole) does not. There may be individual gaps in the HMM at this stage. By ignoring this we say that the HMM dominates the overall behaviour - as in the M2M state as well */ iH++; } pcH++; pcS++; } /* !EO[seq/hmm] */ return; } /* this is the end of AddBackgroundFrequencies() */ /** * ReadAndPrepare() * * @brief Read HMM input file or transfer alignment * and add pseudocounts etc. * * @param[in] iRnPtype * type of read/prepare * enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM}; * @param[in] ppcProf * alignment * @param[in] iCnt * number of seqs in alignment * @param[in,out] prHMM, * [in] if sequences read/prepared, [out] if HMM from file * @param[in] pcPrealigned, * (single) sequence aligned to background HMM * @param[in] pcRepresent, * sequence representing HMM aligned to individual sequence * param[in] pdExWeights * (external) sequence weights, derived from tree * @param[in] infile * name of file with HMM info (formerly also alignment) * @param[out] q * HMM structure with transition probabilities, residue frequencies etc * @param[???] qali * FIXME: what is qali? * * @return FAILURE on error, OK otherwise */ int ReadAndPrepare(int iRnPtype, char **ppcProf, int iCnt, hmm_light *prHMM, char *pcPrealigned, char *pcRepresent, double *pdExWeights, char* infile, HMM& q, Alignment* qali=NULL) { //#ifndef CLUSTALO_NOFILE char path[NAMELEN]; /* NOTE: there are different scenarios: (i) ("" != infile) - read HMM from file, transfer frequency/transition/pseudo-count (f/t/p) info into prHMM (ii) ('\0' != ppcProf[0]) - transfer sequence/alignment into q/qali, don't save f/t/p into prHMM, on the contrary, if prior f/t/p available then add it to q/qali, this is only done if (1==iCnt) (iii) ('\0' == ppcProf[0]) - re-cycle old HMM information recreate a HMM from previous data */ /********************************/ /*** (o) Recycle internal HMM ***/ /********************************/ if ( (INTERN_ALN_2_HMM == iRnPtype) && (iCnt <= 0) ){ /* NOTE: here we are writing into a HMM structure/class; memory has been allocated for this in hhalign.cpp; however, as iCnt<=0, there may not be memory for prHMM->n_display sequences/names. But then, there doesn't have to be. At this stage we are just copying one HMM into another HMM, sequences are irrelevant. The only sequence of (marginal) interest is the consensus sequence */ /* FIXME: check that prHMM is valid -- how? */ const int ciCons = 0; const int ciNoof = ciCons+1; const int ciInvalid = -1; q.n_display = ciNoof; /* only consensus */ q.sname = NULL; q.ncons = ciCons; q.nfirst = ciCons;//prHMM->nfirst; q.nss_dssp = ciInvalid;//prHMM->nss_dssp; q.nsa_dssp = ciInvalid;//prHMM->nsa_dssp; q.nss_pred = ciInvalid;//prHMM->nss_pred; q.nss_conf = ciInvalid;//prHMM->nss_conf; q.L = prHMM->L; q.N_in = prHMM->N_in; q.N_filtered = prHMM->N_filtered; /* NOTE: I (FS) think that only ever 1 sequence will be transferred here, that is, the consensus sequence. However, we might want to allow (in the future) to transfer more sequences, hence the awkward for() loop */ #if 0 for (int i = prHMM->ncons+0; i < prHMM->ncons+q.n_display; i++){ /* NOTE: In the original hhalign code the first position is kept open ('\0'). This makes it difficult to use string functions like strlen/strdup etc. Insert a temporary gap (.) to facilitate string operations */ char cHead = prHMM->seq[i][0]; prHMM->seq[i][0] = '.'; q.seq[i] = strdup(prHMM->seq[i]); prHMM->seq[i][0] = q.seq[i][0] = cHead; } #else { char cHead = prHMM->seq[prHMM->ncons][0]; prHMM->seq[prHMM->ncons][0] = '.'; q.seq[q.ncons] = strdup(prHMM->seq[prHMM->ncons]); prHMM->seq[prHMM->ncons][0] = q.seq[q.ncons][0] = cHead; } #endif const int NEFF_SCORE = 10; /* FIXME: Magic Number */ for (int i = 0; i < prHMM->L+1; i++){ q.Neff_M[i] = prHMM->Neff_M[i]; q.Neff_I[i] = prHMM->Neff_I[i]; q.Neff_D[i] = prHMM->Neff_D[i]; } q.Neff_HMM = prHMM->Neff_HMM; /* skip longname,name,file,fam,sfam,fold,cl */ q.lamda = prHMM->lamda; q.mu = prHMM->mu; HMMshadow rShadow = {0}; /* friend of HMM to access private members */ rShadow.copyShadowToHMM(q, *prHMM); /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */ /* pav already done in copyShadowToHMM */ /* skip pnul */ return OK; } /* INTERN_ALN_2_HMM && iCnt<=0 */ /******************************/ /*** (i) Read HMM from file ***/ /******************************/ char line[LINELEN] = {0}; // input line FILE *inf = NULL; //if ( (0 != strcmp(infile,"")) /*&& (iCnt > 0)*/ ) if ( (READ_HMM_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype)) { if (0 == strcmp(infile,"")){ printf("%s:%s:%d:\n" "\texpected to re %s from file but no file specified\n" "" , __FUNCTION__, __FILE__, __LINE__ , (READ_HMM_2_HMM==iRnPtype?"HMM":"alignment")); return FAILURE; } inf = fopen(infile, "r"); if (!inf) OpenFileError(infile); Pathname(path,infile); //} //else { //inf = stdin; //if (v>=2) printf("Reading HMM / multiple alignment from standard input ...\n(To get a help list instead, quit and type %s -h.)\n",program_name); //*path='\0'; //} fgetline(line,LINELEN-1,inf); } //if ( (0 != strcmp(infile,"")) && (iCnt > 0) ) if ( (READ_HMM_2_HMM == iRnPtype) ){ if (0 == strcmp(infile,"")){ printf("%s:%s:%d: expected to read HMM from file but no file-name\n", __FUNCTION__, __FILE__, __LINE__); } // Is infile a HMMER3 file? /* MR1 */ if (!strncmp(line,"HMMER3",6)) { if (v>=2) { cout<<"Query file is in HMMER3 format\n"; cout<<"WARNING! Use of HMMER3 format as input results in dramatically loss of sensitivity!\n"; } // Read 'query HMMER file rewind(inf); q.ReadHMMer3(inf,path); // Don't add transition pseudocounts to query!! // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); /* further down there is a q.Log2LinTransitionProbs but only if (iCnt>0), however, we still need it it here (i think), there is no danger of doing this twice, as trans_lin is checked FIXME (FS, 2011-01-12) */ /* further down there is a q.Log2LinTransitionProbs but only if (iCnt>0), however, we still need it it here (i think), there is no danger of doing this twice, as trans_lin is checked FIXME (FS, 2011-01-12) */ //if (par.forward >= 1) { q.Log2LinTransitionProbs(1.0); } } // ... or Is infile an old HMMER file? else if (!strncmp(line,"HMMER",5)) { if (v>=2) { cout<<"Query file is in HMMER format\n"; cout<<"WARNING! Use of HMMER format as input results in dramatically loss of sensitivity!\n"; } // Read 'query HMMER file q.ReadHMMer(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); // Don't add transition pseudocounts to query!! // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // DON'T ADD amino acid pseudocounts to query: pcm=0! q.p[i][a] = f[i][a] q.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); /* further down there is a q.Log2LinTransitionProbs but only if (iCnt>0), however, we still need it it here (i think), there is no danger of doing this twice, as trans_lin is checked FIXME (FS, 2011-01-12) */ //if (par.forward >= 1) { q.Log2LinTransitionProbs(1.0); } } /* it was a HMMer file */ // ... or is it an hhm file? else if (!strncmp(line,"NAME",4) || !strncmp(line,"HH",2)) { if (v>=2) cout<<"Query file is in HHM format\n"; // Rewind to beginning of line and read query hhm file rewind(inf); q.Read(inf,path); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: HMM %s looks too diverse (Neff=%.1f>11). Better check the underlying alignment... \n",q.name,q.Neff_HMM); // Add transition pseudocounts to query -> q.p[i][a] q.AddTransitionPseudocounts(); // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q.PreparePseudocounts(); // Add amino acid pseudocounts to query: q.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q.AddAminoAcidPseudocounts(); } /* it was a HHM file */ else { fprintf(stderr, "%s:%s:%d: Unknown HMM format in infile\n" "infile=%s, #seq=%d\n" , __FUNCTION__, __FILE__, __LINE__ , infile, iCnt); return FAILURE; } /*fclose(inf);*/ /*** transfer class info to struct */ prHMM->n_display = q.n_display; /* ignore sname*/ prHMM->seq = (char **)calloc((q.n_display+1), sizeof(char *)); /* FIXME valgrind says bytes get lost in the above calloc during * hmm-iteration */ for (int i = 0; i < q.n_display; i++){ /* NOTE: In the original hhalign code the first position is kept open ('\0'). This makes it difficult to use string functions like strlen/strdup etc. Insert a temporary gap (.) to facilitate string operations */ char cHead = q.seq[i][0]; q.seq[i][0] = '.'; prHMM->seq[i] = strdup(q.seq[i]); q.seq[i][0] = prHMM->seq[i][0] = cHead; } prHMM->ncons = q.ncons; prHMM->nfirst = q.nfirst; prHMM->nss_dssp = q.nss_dssp; prHMM->nsa_dssp = q.nsa_dssp; prHMM->nss_pred = q.nss_pred; prHMM->nss_conf = q.nss_conf; prHMM->L = q.L; prHMM->N_in = q.N_in; prHMM->N_filtered = q.N_filtered; const int NEFF_SCORE = 10; /* FIXME: Magic Number */ prHMM->Neff_M = (float *)calloc(prHMM->L+1, sizeof(float)); prHMM->Neff_I = (float *)calloc(prHMM->L+1, sizeof(float)); prHMM->Neff_D = (float *)calloc(prHMM->L+1, sizeof(float)); /*for (int i = 0; i < prHMM->L+1; i++){ prHMM->Neff_M[i] = prHMM->Neff_I[i] = prHMM->Neff_D[i] = NEFF_SCORE; }*/ prHMM->Neff_HMM = q.Neff_HMM; /* skip longname,name,file,fam,sfam,fold,cl */ prHMM->lamda = q.lamda; prHMM->mu = q.mu; HMMshadow rShadow = {0}; /* friend of HMM to access private members */ rShadow.copyHMMtoShadow(q); prHMM->f = (float **)calloc(prHMM->L+1, sizeof(float *)); prHMM->g = (float **)calloc(prHMM->L+1, sizeof(float *)); prHMM->p = (float **)calloc(prHMM->L+1, sizeof(float *)); prHMM->tr = (float **)calloc(prHMM->L+1, sizeof(float *)); prHMM->linTr = (float **)calloc(prHMM->L+1, sizeof(float *)); for (int i = 0; i < prHMM->L+1; i++){ prHMM->f[i] = (float *)calloc(AMINOACIDS, sizeof(float)); prHMM->g[i] = (float *)calloc(AMINOACIDS, sizeof(float)); prHMM->p[i] = (float *)calloc(AMINOACIDS, sizeof(float)); for (int j = 0; j < AMINOACIDS; j++){ prHMM->f[i][j] = (float)rShadow.f[i][j]; prHMM->g[i][j] = (float)rShadow.g[i][j]; prHMM->p[i][j] = (float)rShadow.p[i][j]; } prHMM->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); prHMM->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); for (int j = 0; j< STATE_TRANSITIONS; j++){ prHMM->tr[i][j] = (float)rShadow.tr[i][j]; prHMM->linTr[i][j] = fpow2(rShadow.tr[i][j]); } } /*0 <= i < prHMM->L+1 */ /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */ for (int j = 0; j < AMINOACIDS; j++){ prHMM->pav[j] = (float)rShadow.pav[j]; } /* skip pnul */ } /* have read HHM from file */ /*else if ( ((NULL != ppcProf) && (iCnt > 0) && ('\0' != ppcProf[0][0])) || ( (0 != strcmp(infile,"") && (0 == iCnt) )) )*/ else if ( (INTERN_ALN_2_HMM == iRnPtype) || (READ_ALN_2_HMM == iRnPtype) ) { if ( (INTERN_ALN_2_HMM == iRnPtype) && ( (NULL == ppcProf) || (iCnt <= 0) || ('\0' == ppcProf[0][0]) ) ){ printf("%s:%s:%d: was expecting internal alignment but\n" "\tppcProf=%p, #seq=%d, ppcProf[0][0]=%c\n" , __FUNCTION__, __FILE__, __LINE__ , ppcProf, iCnt, ppcProf[0][0]); return FAILURE; } else if ( (READ_ALN_2_HMM == iRnPtype) && (0 == strcmp(infile,"")) ){ printf("%s:%s:%d: was expecting to read alignment from file but no filename\n" , __FUNCTION__, __FILE__, __LINE__); return FAILURE; } /*******************************/ /*** (ii) it is an alignment ***/ /*******************************/ /* transfer alignment information from clustal character array into pali/q/t classes */ /* * NOTE that emissions in HMMer file format contain pseudo-counts. * HHM file format does not contain emission pseudo-counts. * the structure that stores background HMM information does contain pseudo-counts */ Alignment* pali; if (qali==NULL){ pali=new Alignment(iCnt); /* FIXME: pass in iCnt to get rid of MAXSEQ */ } else{ pali=qali; } if (par.calibrate) { printf("\nError in %s: only HHM files can be calibrated.\n",program_name); printf("Build an HHM file from your alignment with 'hhmake -i %s' and rerun hhsearch with the hhm file\n\n",infile); exit(1); } if (v>=2 && strcmp(infile,"stdin")) cout< 0) if (INTERN_ALN_2_HMM == iRnPtype){ pali->Transfer(ppcProf, iCnt); } //else if (0 == iCnt) else if (READ_ALN_2_HMM == iRnPtype){ pali->Read(inf, infile, line); } else { printf("%s:%s:%d: FATAL problem\n" "infile = (%s), #seq = %d\n" , __FUNCTION__, __FILE__, __LINE__ , infile, iCnt); return FAILURE; } /* Convert ASCII to int (0-20), throw out all insert states, record their number in I[k][i] and store marked sequences in name[k] and seq[k] */ pali->Compress(infile); /* Sort out the nseqdis most dissimilar sequences for display in the output alignments */ pali->FilterForDisplay(par.max_seqid, par.coverage, par.qid, par.qsc,par.nseqdis); // Filter alignment for min score per column with core query profile, defined by coverage_core and qsc_core //if (par.coresc>-10) pali->HomologyFilter(par.coverage_core, par.qsc_core, par.coresc); /* Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) */ pali->N_filtered = pali->Filter(par.max_seqid, par.coverage, par.qid, par.qsc, par.Ndiff); /* Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] */ pali->FrequenciesAndTransitions(q); if (v>=2 && q.Neff_HMM>11.0) fprintf(stderr,"WARNING: alignment %s looks too diverse (Neff=%.1f>11). Better check it with an alignment viewer... \n",q.name,q.Neff_HMM); /*printf("%d %d %f %d (N,Nf,Neff,L) %s:%s:%d\n" , q.N_in, q.N_filtered, q.Neff_HMM, q.L, __FUNCTION__, __FILE__, __LINE__);*/ // Add transition pseudocounts to query -> p[i][a] q.AddTransitionPseudocounts(); /* Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] */ q.PreparePseudocounts(); /* Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] */ q.AddAminoAcidPseudocounts(); /* ****** add aligned background pseudocounts ***** */ HMMshadow rShadowQ = {0}; rShadowQ.copyHMMtoShadow(q); AddBackgroundFrequencies(rShadowQ.f, rShadowQ.p, rShadowQ.tr, q.L, prHMM, q.seq, pcPrealigned, iCnt, pcRepresent); if (qali==NULL){ delete(pali); pali = NULL; } } /* else if (NULL != ppcProf) // not hmmer/hhalign but alignment */ //else if ((prHMM->L > 0) && ('\0' == ppcProf[0][0])) else if (INTERN_HMM_2_HMM == iRnPtype){ /******************************************/ /*** (iii) re-cycle old HMM information ***/ /******************************************/ if (prHMM->L <= 0){ printf("%s:%s:%d: was expecting to copy HMM structure but L=%d\n" , __FUNCTION__, __FILE__, __LINE__, prHMM->L); } printf("%s:%s:%d: RE-CYCLING HMM\n", __FUNCTION__, __FILE__, __LINE__); #if 0 q.n_display = prHMM->n_display; /* ignore sname*/ for (int i = 0; i < q.n_display; i++){ /* NOTE: In the original hhalign code the first position is kept open ('\0'). This makes it difficult to use string functions like strlen/strdup etc. Insert a temporary gap (.) to facilitate string operations */ char cHead = prHMM->seq[i][0]; prHMM->seq[i][0] = '.'; q.seq[i] = strdup(prHMM->seq[i]); q.seq[i][0] = prHMM->seq[i][0] = cHead; } q.nfirst = prHMM->nfirst; #else q.n_display = 1; q.nfirst = 0; char cHead = prHMM->seq[prHMM->nfirst][0]; prHMM->seq[prHMM->nfirst][0] = '.'; q.seq[0] = strdup(prHMM->seq[prHMM->nfirst]); q.seq[q.n_display-1][0] = prHMM->seq[prHMM->nfirst][0] = cHead; #endif q.ncons = prHMM->ncons; q.nss_dssp = prHMM->nss_dssp; q.nsa_dssp = prHMM->nsa_dssp; q.nss_pred = prHMM->nss_pred; q.nss_conf = prHMM->nss_conf; q.L = prHMM->L; q.N_in = prHMM->N_in; q.N_filtered = prHMM->N_filtered; #define NEFF_SCORE 10 /* FIXME: Magic Number */ /*for (int i; i < prHMM->L+1; i++){ q.Neff_M[i] = q.Neff_I[i] = q.Neff_D[i] = NEFF_SCORE; }*/ q.Neff_HMM = prHMM->Neff_HMM; /* skip longname,name,file,fam,sfam,fold,cl */ q.lamda = prHMM->lamda; q.mu = prHMM->mu; HMMshadow rShadow = {0}; /* friend of HMM to access private members */ rShadow.copyShadowToHMM(q, *prHMM); } if (iCnt > 0){ if (par.forward>=1) q.Log2LinTransitionProbs(1.0); } if (NULL != inf){ fclose(inf); } return OK; } /*** end: ReadAndPrepare() ***/ /** * FreeHMMstruct() * * @brief FIXME * * @param[in,out] */ extern "C" void FreeHMMstruct(hmm_light *prHMM) { int i; if (NULL != prHMM->f){ for (i = 0; i < prHMM->L+1; i++){ if (NULL != prHMM->f[i]){ free(prHMM->f[i]); prHMM->f[i] = NULL; } } /* 0 <= i < prHMM->L+1 */ free(prHMM->f); prHMM->f = NULL; } if (NULL != prHMM->g){ for (i = 0; i < prHMM->L+1; i++){ if (NULL != prHMM->g[i]){ free(prHMM->g[i]); prHMM->g[i] = NULL; } } /* 0 <= i < prHMM->L+1 */ free(prHMM->g); prHMM->g = NULL; } if (NULL != prHMM->p){ for (i = 0; i < prHMM->L+1; i++){ if (NULL != prHMM->p[i]){ free(prHMM->p[i]); prHMM->p[i] = NULL; } } /* 0 <= i < prHMM->L+1 */ free(prHMM->p); prHMM->p = NULL; } if (NULL != prHMM->tr){ for (i = 0; i < prHMM->L+1; i++){ if (NULL != prHMM->tr[i]){ free(prHMM->tr[i]); prHMM->tr[i] = NULL; } } /* 0 <= i < prHMM->L+1 */ free(prHMM->tr); prHMM->tr = NULL; } if (NULL != prHMM->linTr){ for (i = 0; i < prHMM->L+1; i++){ if (NULL != prHMM->linTr[i]){ free(prHMM->linTr[i]); prHMM->linTr[i] = NULL; } } /* 0 <= i < prHMM->L+1 */ free(prHMM->linTr); prHMM->linTr = NULL; } if (NULL != prHMM->Neff_M){ free(prHMM->Neff_M); prHMM->Neff_M = NULL; } if (NULL != prHMM->Neff_I){ free(prHMM->Neff_I); prHMM->Neff_I = NULL; } if (NULL != prHMM->Neff_D){ free(prHMM->Neff_D); prHMM->Neff_D = NULL; } if (NULL != prHMM->seq){ for (i = 0; i < prHMM->n_display; i++){ if (NULL != prHMM->seq[i]){ free(prHMM->seq[i]); prHMM->seq[i] = NULL; } } free(prHMM->seq); prHMM->seq = NULL; } memset(prHMM, 0, sizeof(hmm_light)); } /*** end: FreeHMMstruct() ***/ /** * @brief comparisin function for ascending qsort() of doubles */ int CompDblAsc(const void *pv1, const void *pv2){ double d1 = *(double *)pv1; double d2 = *(double *)pv2; if (d1 > d2) { return +1; } else if (d1 < d2) { return -1; } else { return 0; } } /*** end: CompDblAsc() ***/ /** * AlnToHMM2() * * @brief convert alignment into HMM (hhmake) * * @param[out] prHMM * structure with pseudocounts etc * @param[in] pcHMM_input * name of file with HMM * */ extern "C" int AlnToHMM2(hmm_light *rHMM_p, char **ppcSeq, int iN){ if (0 == par.M){ SetDefaults(); SetSubstitutionMatrix(); par.cons = 1; par.M = 2; par.forward=2; par.Mgaps=100; const int ciGoodMeasureSeq = 10; int iAuxLen = strlen(ppcSeq[0]); par.maxResLen = iAuxLen; par.maxResLen += ciGoodMeasureSeq; par.maxColCnt = iAuxLen + ciGoodMeasureSeq; par.max_seqid=DEFAULT_FILTER; par.loc=0; par.mact=0; /* some minor parameters, affecting alignment (i think) */ par.p = 0.0; /* minimum threshold for inclusion in hit list */ par.Z = 100; /* minimum threshold for inclusion in hit list and alignment listing */ par.z = 1; /* min number of lines in hit list */ par.B = 100; /* max number of alignments */ par.b = 1; /* min number of alignments */ par.E = 1e6; /* maximum threshold for inclusion in hit list and alignment listing */ par.altali=1;par.hitrank=0;par.showcons=1; par.showdssp=1;par.showpred=1;par.nseqdis=iN;par.cons=1; } const int ciGoodMeasure = 10; int iLen = strlen(ppcSeq[0]) + ciGoodMeasure; if (iLen > par.maxResLen){ par.maxResLen = par.maxColCnt = iLen; } HMM rTemp(iN+2, iLen); /* temporary template */ Alignment rTempAli(iN+2, iLen); /* temporary alignment */ int iParCons = par.cons; /*printf(">>>>>>>>>>> %s:%s:%d: there are %d sequences\n", __FUNCTION__, __FILE__, __LINE__, iN);*/ par.cons = 1; if (OK != ReadAndPrepare(INTERN_ALN_2_HMM, ppcSeq, iN, rHMM_p, NULL/*prealigned*/, NULL/*representative*/, NULL/*weights*/, //YES/*transfer*/, (char*)("")/*in-file*/, rTemp, &rTempAli)) { return FAILURE; } par.cons = iParCons; /******/ /*** transfer class info to struct */ rHMM_p->n_display = rTemp.n_display; rHMM_p->sname = NULL; rHMM_p->seq = (char **)calloc((rTemp.n_display+1), sizeof(char *)); for (int i = 0; i < rTemp.n_display; i++){ /* NOTE: In the original hhalign code the first position is kept open ('\0'). This makes it difficult to use string functions like strlen/strdup etc. Insert a temporary gap (.) to facilitate string operations */ char cHead = rTemp.seq[i][0]; rTemp.seq[i][0] = '.'; rHMM_p->seq[i] = strdup(rTemp.seq[i]); rTemp.seq[i][0] = rHMM_p->seq[i][0] = cHead; } rHMM_p->ncons = rTemp.ncons; rHMM_p->nfirst = rTemp.nfirst; if (-1 == rHMM_p->ncons){ /* ncons needed later for alignment of representative sequence and copy of profile. ncons not always set (-1 default), this will cause segmentation fault. nfirst (probably) right index - no problem if not */ rHMM_p->ncons = rTemp.nfirst; } rHMM_p->nss_dssp = rTemp.nss_dssp; rHMM_p->nsa_dssp = rTemp.nsa_dssp; rHMM_p->nss_pred = rTemp.nss_pred; rHMM_p->nss_conf = rTemp.nss_conf; rHMM_p->L = rTemp.L; rHMM_p->N_in = rTemp.N_in; rHMM_p->N_filtered = rTemp.N_filtered; #define NEFF_SCORE 10 /* FIXME: Magic Number */ //// get rid of that, FS, 2010-12-22 rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float)); rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float)); rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float)); for (int i = 0; i < rHMM_p->L+1; i++){ rHMM_p->Neff_M[i] = rHMM_p->Neff_I[i] = rHMM_p->Neff_D[i] = NEFF_SCORE; //// get rid of that, FS, 2010-12-22 } rHMM_p->Neff_HMM = rTemp.Neff_HMM; /* skip longname,name,file,fam,sfam,fold,cl */ rHMM_p->lamda = rTemp.lamda; rHMM_p->mu = rTemp.mu; HMMshadow rShadow = {0}; /* friend of HMM to access private members */ rShadow.copyHMMtoShadow(rTemp); rHMM_p->Neff_M = (float *)calloc(rHMM_p->L+1, sizeof(float)); rHMM_p->Neff_I = (float *)calloc(rHMM_p->L+1, sizeof(float)); rHMM_p->Neff_D = (float *)calloc(rHMM_p->L+1, sizeof(float)); rHMM_p->f = (float **)calloc(rHMM_p->L+1, sizeof(float *)); rHMM_p->g = (float **)calloc(rHMM_p->L+1, sizeof(float *)); rHMM_p->p = (float **)calloc(rHMM_p->L+1, sizeof(float *)); rHMM_p->tr = (float **)calloc(rHMM_p->L+1, sizeof(float *)); rHMM_p->linTr = (float **)calloc(rHMM_p->L+1, sizeof(float *)); for (int i = 0; i < rHMM_p->L+1; i++){ rHMM_p->Neff_M[i] = (float)rShadow.Neff_M[i]; rHMM_p->Neff_I[i] = (float)rShadow.Neff_I[i]; rHMM_p->Neff_D[i] = (float)rShadow.Neff_D[i]; rHMM_p->f[i] = (float *)calloc(AMINOACIDS, sizeof(float)); rHMM_p->g[i] = (float *)calloc(AMINOACIDS, sizeof(float)); rHMM_p->p[i] = (float *)calloc(AMINOACIDS, sizeof(float)); for (int j = 0; j < AMINOACIDS; j++){ rHMM_p->f[i][j] = (float)rShadow.f[i][j]; rHMM_p->g[i][j] = (float)rShadow.g[i][j]; rHMM_p->p[i][j] = (float)rShadow.p[i][j]; } rHMM_p->tr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); rHMM_p->linTr[i] = (float *)calloc(STATE_TRANSITIONS, sizeof(float)); for (int j = 0; j< STATE_TRANSITIONS; j++){ rHMM_p->tr[i][j] = (float)rShadow.tr[i][j]; rHMM_p->linTr[i][j] = fpow2(rShadow.tr[i][j]); } } /*0 <= i < prHMM->L+1 */ /* skip trans_lin,ss_dssp,sa_dssp,ss_pred,ss_conf,Xcons */ for (int j = 0; j < AMINOACIDS; j++){ rHMM_p->pav[j] = (float)rShadow.pav[j]; } /* skip pnul */ /******/ rTemp.ClobberGlobal(); rTempAli.ClobberGlobal(); return OK; } /*** end of AlnToHMM2() ***/ /** * HHMake_Wrapper() * * @brief turn alignment (from file) into HHM (HMM) on file * * @param[out] hmm_out * name of file with HMM info corresponding to tmp_aln * @param[in] tmp_aln * name of file with alignment, to be turned into HMM (HHM) * */ extern "C" int HHMake_Wrapper(char *tmp_aln, char *hmm_out) { HMM rTemp; /* temporary template */ Alignment rTempAli; /* temporary alignment */ hmm_light *rHMM_p = NULL; /** Note: this is a wrapper for a stand-alone program hhmake. hhmake uses a different set of parameters than hhalign. However, all parameters are GLOBAL. So at this point we register the hhalign settings, reset them to hhmake settings and set them back at the end of the function */ /* save settings from hhalign */ int iParShowcons=par.showcons; int iParAppend=par.append; int iParNseqdis=par.nseqdis; int iParMark=par.mark; int iParMaxSeqid=par.max_seqid; int iParQid=par.qid; double dParQsc=par.qsc; int iParCoverage=par.coverage; int iParNdiff=par.Ndiff; int iParCoverageCore=par.coverage_core; double dParQscCore=par.qsc_core; double dParCoresc=par.coresc; int iParM=par.M; int iParMgaps=par.Mgaps; int iParMatrix=par.matrix; int iParPcm=par.pcm; double dParPcw=par.pcw; double dParGapb=par.gapb; int iParWg=par.wg; /* these are settings suitable for hhmake */ par.showcons=1; // write consensus sequence into hhm file par.append=0; // overwrite output file par.nseqdis=10; // maximum number of query or template sequences to be recoreded in HMM and diplayed in output alignments par.mark=0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed par.max_seqid=90; // default for maximum sequence identity threshold par.qid=0; // default for maximum sequence identity threshold par.qsc=-20.0f; // default for minimum score per column with query par.coverage=0; // default for minimum coverage threshold par.Ndiff=100; // pick Ndiff most different sequences from alignment par.coverage_core=30; // Minimum coverage for sequences in core alignment par.qsc_core=0.3f; // Minimum score per column with core alignment (HMM) par.coresc=-20.0f; // Minimum score per column with core alignment (HMM) par.M=1; // match state assignment is by A2M/A3M par.Mgaps=50; // above this percentage of gaps, columns are assigned to insert states par.matrix=0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62 par.pcm=0; // no amino acid and transition pseudocounts added par.pcw=0; // wc>0 weighs columns according to their intra-clomun similarity par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts! par.wg=0; // 0: use local sequence weights 1: use local ones if (OK != ReadAndPrepare(READ_ALN_2_HMM, NULL, 0, rHMM_p, NULL, NULL, NULL, tmp_aln, rTemp, &rTempAli)) { return FAILURE; } rTemp.WriteToFile(hmm_out); /* restore settings from hhalign */ par.showcons=iParShowcons; par.append=iParAppend; par.nseqdis=iParNseqdis; par.mark=iParMark; par.max_seqid=iParMaxSeqid; par.qid=iParQid; par.qsc=dParQsc; par.coverage=iParCoverage; par.Ndiff=iParNdiff; par.coverage_core=iParCoverageCore; par.qsc_core=dParQscCore; par.coresc=dParCoresc; par.M=iParM; par.Mgaps=iParMgaps; par.matrix=iParMatrix; par.pcm=iParPcm; par.pcw=dParPcw; par.gapb=dParGapb; par.wg=iParWg; /* prepare exit */ rTemp.ClobberGlobal(); rTempAli.ClobberGlobal(); return OK; } /** * readHMMWrapper() * * @brief read HMM from file, copy (relevant) info into struct * * @param[out] prHMM * structure with pseudocounts etc, probably uninitialised on entry * @param[in] pcHMM_input * name of file with HMM * */ extern "C" int readHMMWrapper(hmm_light *rHMM_p, char *pcHMM_input) { par.maxResLen = 15002; HMM rTemp(1000,par.maxResLen); /* temporary template */ Alignment rTempAli; /* temporary alignment */ /* AW changed init from {0} to 0 because it failed to compile with * gcc 4.3.3 with the following error: * error: braces around initializer for non-aggregate type */ /* FS taken out initialiser alltogether */ /* 0th arg of RnP is the type of RnP, ie, enum {INTERN_ALN_2_HMM = 0, READ_ALN_2_HMM, READ_HMM_2_HMM};*/ /* 1st arg of ReadAndPrepare() is ppcSeqs, 2nd is #seq */ /* FIXME: at this stage the 3rd arg, rHMM_p, only acts as a dummy. this is rather silly, as it is the actual struct that will carry the HMM info at the end. If we write it already in ReadAndPrepare() then we could dispense with all this friend-class nonsense */ if (OK != ReadAndPrepare(READ_HMM_2_HMM, NULL, 0, rHMM_p, NULL, NULL, NULL, pcHMM_input, rTemp, &rTempAli)) { return FAILURE; } /* an important piece of information I want to get out of here is the consenssus sequence. there are however certain Pfam HMMs that don't trigger consensus calculation. at the moment I (FS) don't understand why this is (or rather why this is not). The proper place to do this should be inside ReadAndPrepare():ReadHMMer(), but I have not quite penetrated the logic there. Therefore I try to catch this condition at this point (here) and rectify it. */ if (-1 == rHMM_p->ncons){ int i, iA; rHMM_p->ncons = rHMM_p->nfirst; for (i = 0; i < rHMM_p->L; i++){ double dPmax = 0.00; int iAmax = -1; for (iA = 0; iA < AMINOACIDS; iA++){ if (rHMM_p->f[i][iA] > dPmax){ iAmax = iA; dPmax = rHMM_p->f[i][iA]; } } /* (0 <= iA < AMINOACIDS) */ rHMM_p->seq[rHMM_p->ncons][i] = i2aa(iAmax); } /* (0 <= i < rHMM_p->L) */ } /* ncons not set */ rTemp.ClobberGlobal(); rTempAli.ClobberGlobal(); return OK; } /*** end: readHMMWrapper() ***/ ///////////////////////////////////////////////////////////////////////////// /** * @brief Do precalculations for q and t to prepare comparison */ void PrepareTemplate(HMM& q, HMM& t, int format) { if (format==0) // HHM format { // Add transition pseudocounts to template t.AddTransitionPseudocounts(); // Generate an amino acid frequency matrix from t.f[i][a] with max pseudocounts (tau=1) -> t.g[i][a] t.PreparePseudocounts(); // Add amino acid pseudocounts to template: t.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] t.AddAminoAcidPseudocounts(); } else // HHMER format { // Don't add transition pseudocounts to template // t.AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, 0.0); // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] t.PreparePseudocounts(); // DON'T ADD amino acid pseudocounts to temlate: pcm=0! t.p[i][a] = t.f[i][a] t.AddAminoAcidPseudocounts(0, par.pca, par.pcb, par.pcc); } // Modify transition probabilities to include SS-dependent penalties if (par.ssgap) t.UseSecStrucDependentGapPenalties(); if (par.forward>=1) t.Log2LinTransitionProbs(1.0); // Factor Null model into HMM t // ATTENTION! t.p[i][a] is divided by pnul[a] (for reasons of efficiency) => do not reuse t.p t.IncludeNullModelInHMM(q,t); // Can go BEFORE the loop if not dependent on template return; } /*** end of hhfunc-C.h ***/