********************************************************************/
/*
- * RCS $Id: hhalign_wrapper.c 256 2011-06-23 13:57:28Z fabian $
+ * RCS $Id: hhalign_wrapper.c 306 2016-06-13 13:49:04Z fabian $
*/
#ifdef HAVE_CONFIG_H
#include <string.h>
#include <assert.h>
#include <ctype.h>
+#include <stdbool.h>
#include "seq.h"
#include "tree.h"
#define TRACE 0
/**
+ * @brief FIXME
+ *
+ * @note prHalignPara has to point to an already allocated instance
+ *
+ */
+void SetDefaultHhalignPara(hhalign_para *prHhalignPara)
+{
+ prHhalignPara->iMacRamMB = 8000; /* default|give just under 8GB to MAC algorithm, FS, 2016-04-18, went from 2GB to 8GB */
+ prHhalignPara->bIsDna = false; /* protein mode unless we say otherwise */
+ prHhalignPara->bIsRna = false;
+ prHhalignPara->pca = -UNITY;
+ prHhalignPara->pcb = -UNITY;
+ prHhalignPara->pcc = -UNITY;
+ prHhalignPara->pcw = -UNITY;
+ prHhalignPara->gapb = -UNITY;
+ prHhalignPara->gapd = -UNITY;
+ prHhalignPara->gape = -UNITY;
+ prHhalignPara->gapf = -UNITY;
+ prHhalignPara->gapg = -UNITY;
+ prHhalignPara->gaph = -UNITY;
+ prHhalignPara->gapi = -UNITY;
+ prHhalignPara->pcaV = -UNITY;
+ prHhalignPara->pcbV = -UNITY;
+ prHhalignPara->pccV = -UNITY;
+ prHhalignPara->pcwV = -UNITY;
+ prHhalignPara->gapbV = -UNITY;
+ prHhalignPara->gapdV = -UNITY;
+ prHhalignPara->gapeV = -UNITY;
+ prHhalignPara->gapfV = -UNITY;
+ prHhalignPara->gapgV = -UNITY;
+ prHhalignPara->gaphV = -UNITY;
+ prHhalignPara->gapiV = -UNITY;
+
+}
+/*** end: SetDefaultHhalignPara() ***/
+
+
+
+/**
* @brief get rid of unknown residues
*
* @note HHalignWrapper can be entered in 2 different ways: (i) all
{
int iS; /* iterator for sequence */
- int iR; /* iterator for residue */
- int iLen; /* length of sequence */
+ /*int iR;*/ /* iterator for residue */
+ /*int iLen;*/ /* length of sequence */
char *pcRes = NULL;
for (pcRes = mseq->seq[iS]; '\0' != *pcRes; pcRes++){
if (isgap(*pcRes)){
+ /* don't like MSF gap characters ('~'),
+ sanitise them (and '.' and ' '); FS, r258 -> r259 */
+ *pcRes = '-';
continue;
}
/**
* @brief re-attach leading and trailing gaps to alignment
*
- * @param[in,out] prMSeq,
+ * @param[in,out] prMSeq
* alignment structure (at this stage there should be no un-aligned sequences)
* @param[in] iProfProfSeparator
* gives sizes of input profiles, -1 if no input-profiles but un-aligned sequences
/**
+ * @brief PosteriorProbabilities() calculates posterior probabilities
+ * of aligning a single sequences on-to an alignment containing this sequence
+ *
+ * @param[in] prMSeq
+ * holds the aligned sequences [in]
+ * @param[in] rHMMalignment
+ * HMM of the alignment in prMSeq
+ * @param[in] rHhalignPara
+ * various parameters read from commandline, needed by hhalign()
+ * @param[in] pcPosteriorfile
+ * name of file into which posterior probability information is written
+ *
+ * @return score of the alignment FIXME what is this?
+ *
+ * @note the PP-loop can be parallelised easily FIXME
+ */
+
+int
+PosteriorProbabilities(mseq_t *prMSeq, hmm_light rHMMalignment, hhalign_para rHhalignPara, char *pcPosteriorfile)
+{
+
+ double dScore = 0.0;
+ int i;
+ int iS; /* iterator for sequences */
+ int iNseq = prMSeq->nseqs; /* number of sequences */
+ int iLenHMM = rHMMalignment.L; /* length of alignment */
+ int iViterbiCount = 0; /* counts how often Viterbi is triggered */
+ char **ppcCopy = NULL;
+ char **ppcRepresent = NULL;
+ char zcAux[10000] = {0};
+ char zcError[10000] = {0};
+ hhalign_scores *prHHscores = NULL;
+ FILE *pfPP = NULL;
+
+ pfPP = fopen(pcPosteriorfile, "w");
+ fprintf(pfPP, "#1.i\t2.name\t\t3.L1\t4.L2\t5.sum\t\t6.sum/L1\t7.HH\n");
+
+
+ prHHscores = CKCALLOC(iNseq, sizeof(hhalign_scores));
+ for (iS = 0; iS < iNseq; iS++){
+ memset(&(prHHscores[iS]), 0, sizeof(hhalign_scores));
+ }
+
+ ppcCopy = CKCALLOC(1, sizeof(char *));
+ ppcCopy[0] = CKCALLOC(2*iLenHMM, sizeof(char));
+ ppcRepresent = CKCALLOC(1, sizeof(char *));
+ ppcRepresent[0] = CKCALLOC(2*iLenHMM, sizeof(char));
+
+ for (i = 0; i < iLenHMM; i++){
+ ppcRepresent[0][i] = rHMMalignment.seq[rHMMalignment.ncons][i+1];
+ }
+
+ /* FIXME: this loop can be parallelised, FS r288 */
+ iViterbiCount = 0;
+ for (iS = 0; iS < iNseq; iS++){
+
+ /* single sequences may very well contain leading/trailing gaps,
+ * this will trigger warning in Transfer:hhalignment-C.h */
+ char *pcIter = NULL;
+ for (pcIter = prMSeq->orig_seq[iS]; ('-' == *pcIter) || ('.' == *pcIter); pcIter++);
+ strcpy(ppcCopy[0], pcIter/*prMSeq->orig_seq[iS]*/);
+ for (pcIter = &ppcCopy[0][strlen(ppcCopy[0])-1]; ('-' == *pcIter) || ('.' == *pcIter); pcIter--);
+ pcIter++;
+ *pcIter = '\0';
+
+ zcError[0] = '\0';
+ hhalign(ppcCopy, 1, NULL,
+ ppcRepresent, 0, NULL,
+ &dScore, &rHMMalignment, &rHMMalignment,
+ NULL, NULL, NULL, NULL,
+ rHhalignPara, &prHHscores[iS], 0/* debug */, FALSE /* Log-Level*/,
+ zcAux, zcError);
+ if (NULL != strstr(zcError, "Viterbi")){
+ iViterbiCount++;
+ }
+
+ } /* 0 <= iS < iNseq */
+ Log(&rLog, LOG_INFO, "Viterbi algorithm triggered %d times (out of %d)", iViterbiCount, iNseq-1);
+
+ for (iS = 0; iS < iNseq; iS++){
+
+ fprintf(pfPP, "%d\t%10s\t%3d"
+ , iS, prMSeq->sqinfo[iS].name, (int)(strlen(prMSeq->orig_seq[iS])));
+ fprintf(pfPP, "\t%3d\t%f\t%f\t%f",
+ prHHscores[iS].L, prHHscores[iS].sumPP, prHHscores[iS].sumPP/strlen(prMSeq->orig_seq[iS]), prHHscores[iS].hhScore);
+ fprintf(pfPP, "\n");
+ } /* 0 <= iS < iNseq */
+
+
+
+ fclose(pfPP); pfPP = NULL;
+ free(ppcRepresent[0]); ppcRepresent[0] = NULL;
+ free(ppcCopy[0]); ppcCopy[0] = NULL;
+ free(ppcRepresent); ppcRepresent = NULL;
+ free(ppcCopy); ppcCopy = NULL;
+ free(prHHscores); prHHscores = NULL;
+
+ return 1;
+
+} /* this is the end of PosteriorProbabilities() */
+
+
+/**
+ * @brief sequentially align (chain) sequences
+ *
+ * @param[in,out] prMSeq
+ * holds the un-aligned sequences (in) and the final alignment (out)
+ * @param[in] rHhalignPara
+ * various parameters needed by hhalign()
+ * @param[in] iClustersize
+ * parameter that controls how often HMM is updated
+ *
+ * @note chained alignment takes much longer than balanced alignment
+ * because at every step ClustalO has to scan all previously aligned residues.
+ * for a balanced tree this takes N*log(N) time but for a chained tree
+ * it takes N^2 time.
+ * This function has a short-cut, that the HMM need not be updated
+ * for every single alignment step, but the HMM from the previous
+ * step(s) can be re-cycled. The HMM is updated (i) at the very first step,
+ * (ii) if a gap has been inserted into the HMM during alignment or
+ * (iii) if the HMM has been used for too many steps without having
+ * been updated. This update-frequency is controlled by the input
+ * parameter iClustersize. iClustersize is the number of sequences used
+ * to build a HMM to allow for one non-updating step. For example,
+ * if iClustersize=100 and a HMM has been build from 100 sequences,
+ * then this HMM can be used once without updating. If the HMM has
+ * been built from 700 sequences (and iClustersize=100) then this
+ * HMM can be used 7-times without having to be updated, etc.
+ * For this reason the initial iClustersize sequences are always
+ * aligned with fully updated HMMs.
+ */
+double
+PileUp(mseq_t *prMSeq, hhalign_para rHhalignPara, int iClustersize)
+{
+ /* @<variables local to PileUp> */
+ int iI; /* iterator for sequences */
+ int iCountPro = 0; /* count how many sequences are already in profile */
+ int *piLeafListSeq = NULL; /* lists required by PrepareAlignment() for attaching sequences pointers to ppcProfileSeq */
+ int *piLeafListPro = NULL; /* -"- ppcProfilePro */
+ double *pdWeightL = NULL; /* argument used by hhalign, not used here */
+ double *pdWeightR = NULL; /* -"- */
+ double *pdWeightP = NULL; /* -"- */
+ char **ppcProfileSeq = NULL; /* pointer to sequences in profile */
+ char **ppcProfilePro = NULL; /* pointer to sequences in profile */
+ double dScore = -1.0; /* alignment score, calculated by hhalign() */
+ hhalign_scores rHHscores = {0}; /* more explicit scores than dScore */
+ hmm_light *prHMM = NULL; /* HMM against which to align single sequence */
+ hmm_light rHMMprofile = {0}; /* HMM calculated externally from profile */
+ int iHHret = -1; /* return value of hhalign() */
+ char zcAux[10000] = {0}; /* auxilliary print string used by hhalign() */
+ char zcError[10000] = {0}; /* error message printed by hhalign() */
+ char *pcConsensPro = NULL, /* auxilliary sequence strings required by hhalign() */
+ *pcReprsntPro = NULL, /* not used here */
+ *pcConsensSeq = NULL, *pcReprsntSeq = NULL;
+ bool bHMM_stale = YES; /* flag if HMM has to be updated */
+ int iSinceLastUpdate = -1; /* counts how often HMM has been used since last update */
+ progress_t *prProgress; /* structure required for progress report */
+ bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE; /* progress report */
+ char **ppcReprsnt = NULL; /* string used to represent HMM */
+ ppcReprsnt = CKCALLOC(1, sizeof(char *));
+
+ /* weights are not really used, but hhalign() expects them */
+ pdWeightL = (double *)CKMALLOC(prMSeq->nseqs * sizeof(double));
+ pdWeightR = (double *)CKMALLOC(prMSeq->nseqs * sizeof(double));
+ pdWeightP = (double *)CKMALLOC(prMSeq->nseqs * sizeof(double));
+
+ /* lists used for attaching ppcProfileSeq/ppcProfilePro to prMSeq */
+ piLeafListSeq = (int *)CKMALLOC(1 * sizeof(int));
+ piLeafListPro = (int *)CKMALLOC(prMSeq->nseqs * sizeof(int));
+
+ ppcProfileSeq = (char **)CKMALLOC(1 * sizeof(char *));
+ ppcProfileSeq[0] = NULL;
+ ppcProfilePro = (char **)CKMALLOC(prMSeq->nseqs * sizeof(char *));
+ for (iI = 0; iI < prMSeq->nseqs; iI++){
+ ppcProfilePro[iI] = NULL;
+ }
+
+ piLeafListPro[0] = 0; /* first sequences in profile is simply un-aligned sequence */
+ iCountPro = 1; /* 'profile' now contains 1 sequence */
+
+ NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
+ "Progressive alignment progress", bPrintCR);
+
+ /* build up the initial alignment:
+ * sequences are chained but hhalign() is used normally,
+ * that is, one sequence is aligned to a profile */
+ for (iI = 1; iI < MIN(prMSeq->nseqs, iClustersize); iI++){
+
+ piLeafListSeq[0] = iI;
+
+ /* PrepareAlignment() connects ppcProfileSeq/ppcProfilePro and prMSeq */
+ PrepareAlignment(prMSeq, ppcProfilePro, ppcProfileSeq,
+ pdWeightL, pdWeightR, pdWeightP,
+ iI, piLeafListPro,
+ 1, piLeafListSeq);
+
+ /* align 1 (5th argument) sequence to the growing profile
+ * (2nd argument number of sequences is iI),
+ * external HMM not used at this stage (prHMM=NULL) */
+ iHHret = hhalign(ppcProfilePro, iI, NULL/*pdWeightL*/,
+ ppcProfileSeq, 1, NULL/*pdWeightR*/,
+ &dScore, prHMM, prHMM,
+ pcConsensPro, pcReprsntPro,
+ pcConsensSeq, pcReprsntSeq,
+ rHhalignPara, &rHHscores,
+ CALL_FROM_REGULAR/* DEBUG ARGUMENT */, rLog.iLogLevelEnabled,
+ zcAux, zcError);
+
+ piLeafListPro[iI] = iI;
+ iCountPro = iI+1;
+ ProgressLog(prProgress, iI, prMSeq->nseqs-1, FALSE); /* FIXME: test! FS, r288 */
+
+ } /* 1 <= iI < MIN(prMSeq->nseqs, iClustersize) */
+
+ /* now align single sequences not to a profile but to a HMM of a potentially old profile */
+ while (iI < prMSeq->nseqs){
+
+ /* if HMM not up-to-date then create new HMM for profile:
+ * HMM is stale (i) at the beginning,
+ * (ii) if a gap has been inserted into the HMM during previous step,
+ * (iii) too many steps after the last up-date */
+ if ( (YES == bHMM_stale) ){
+ FreeHMMstruct(&rHMMprofile);
+ if (OK !=
+ AlnToHMM2(&rHMMprofile, rHhalignPara, prMSeq->seq, iI)
+ ) {
+ Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will not proceed with chained alignment, step %d", iI);
+ }
+ bHMM_stale = NO;
+ iSinceLastUpdate = 0;
+ }
+
+ /* align single sequence to HMM */
+ piLeafListSeq[0] = iI;
+
+ PrepareAlignment(prMSeq, ppcProfilePro, ppcProfileSeq,
+ pdWeightL, pdWeightR, pdWeightP,
+ iI, piLeafListPro,
+ 1, piLeafListSeq);
+
+ /* use representative sequence to track where gaps are inserted into HMM */
+ ppcReprsnt[0] = CKREALLOC(ppcReprsnt[0], strlen(ppcProfilePro[0])+strlen(ppcProfileSeq[0])+1);
+ memcpy(ppcReprsnt[0], rHMMprofile.seq[rHMMprofile.ncons]+1, rHMMprofile.L);
+ ppcReprsnt[0][rHMMprofile.L] = '\0';
+
+ /* align 1 (5th argument) sequence to a HMM (2nd argument number of sequences is '0',
+ * not iI as for a profile or '1' for one representative sequence),
+ * external HMM (rHMMprofile calculated by AlnToHMM2()) is used at this stage */
+ prHMM = &rHMMprofile;
+ hhalign(ppcReprsnt, 0, NULL,
+ ppcProfileSeq, 1, NULL,
+ &dScore, prHMM, prHMM,
+ pcConsensPro, pcReprsntPro,
+ pcConsensSeq, pcReprsntSeq,
+ rHhalignPara, &rHHscores,
+ CALL_FROM_ALN_HMM/* DEBUG ARGUMENT */, rLog.iLogLevelEnabled,
+ zcAux, zcError);
+ iSinceLastUpdate++; /* HMM has been used one more time */
+ ProgressLog(prProgress, iI, prMSeq->nseqs-1, FALSE); /* FIXME: test! FS, r288 */
+
+ /* check if gaps have been inserted into HMM.
+ * if this is the case then HMM is stale
+ * and the profile used to create the HMM must get gaps at the same positions.
+ * gaps are shown in the representative sequence */
+ if (rHMMprofile.L != strlen(ppcReprsnt[0])){
+
+ int iSeq; /* iterate through sequences of profile */
+
+ /* manually insert gaps into existing profile,
+ * there should already be enough memory */
+#ifdef HAVE_OPENMP
+ /* all sequences in profile obtain gaps at the same position,
+ * can do this in parallel */
+ #pragma omp parallel for
+#endif
+ for (iSeq = 0; iSeq < iI; iSeq++){
+
+ int iPtrRep, /* points to 'residue' in representative sequence */
+ iPtrPrf; /* points to the corresponding position in the profile */
+
+ for (iPtrRep = strlen(ppcReprsnt[0])-1, iPtrPrf = strlen(ppcProfilePro[iSeq])-1;
+ iPtrRep >= 0; iPtrRep--){
+
+ if ('-' == ppcReprsnt[0][iPtrRep]){ /* gaps are newly introduced into profile */
+ ppcProfilePro[iSeq][iPtrRep] = '-';
+ }
+ else { /* non-gap characters (residues) are shifted towards the back */
+ ppcProfilePro[iSeq][iPtrRep] = ppcProfilePro[iSeq][iPtrPrf];
+ iPtrPrf--;
+ }
+ if (iPtrRep == iPtrPrf){
+ /* if profile and representative seq point to same position then no more gaps */
+ break;
+ }
+ } /* strlen(ppcReprsnt[0])-1 >= iPtrRep >= 0 */
+ ppcProfilePro[iSeq][strlen(ppcReprsnt[0])] = '\0';
+
+ } /* 0 <= iSeq < iI */
+
+ /* make HMM stale */
+ bHMM_stale = YES;
+
+ } /* strlen(represent) != HMM.L (gaps inserted into HMM) */
+
+ /* check if maximum number of sequences exceeded */
+ if ( (iClustersize <= 1) || (iSinceLastUpdate > (double)(rHMMprofile.N_in)/(double)(iClustersize)) ){
+ bHMM_stale = YES;
+ }
+
+
+ piLeafListPro[iI] = iI;
+ iCountPro = iI+1;
+
+ iI++;
+
+ } /* iI < prMSeq->nseqs */
+ ProgressDone(prProgress);
+ FreeProgress(&prProgress);
+
+ FreeHMMstruct(&rHMMprofile);
+ CKFREE(pdWeightL); CKFREE(pdWeightR); CKFREE(pdWeightP);
+ if (NULL != ppcReprsnt){
+ if (NULL != ppcReprsnt[0]){
+ CKFREE(ppcReprsnt[0]);
+ }
+ CKFREE(ppcReprsnt);
+ }
+ CKFREE(piLeafListSeq); CKFREE(piLeafListPro);
+
+ return 0.0;
+
+} /* this is the end of PileUp() */
+
+
+
+
+
+/**
* @brief wrapper for hhalign. This is a frontend function to
* the ported hhalign code.
*
* @note: if hhalign() fails then try with Viterbi by setting MAC-RAM=0; FS, r241 -> r243
*/
+
double
HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
double *pdSeqWeights, int iNodeCount,
double *pdWeightsL = NULL; /* sequence weights of left profile */
double *pdWeightsR = NULL; /* sequence weights of right profile */
int iMergeNodeCounter = 0;
- hmm_light *prHMM = NULL;
+ hmm_light *prHMM = NULL;
+ hmm_light *prHMMleft = NULL;
+ hmm_light *prHMMrght = NULL;
+ hmm_light *prHMMnull = NULL;
bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
#if TIMING
char zcStopwatchMsg[1024];
StopwatchZero(stopwatch);
StopwatchStart(stopwatch);
#endif
-
-
- if (NULL != prHMMList) {
+ hhalign_scores rHHscores = {0};
+
+#if 0 /* DEVEL 291 */
+ if (NULL != prHMMList) { /* FIXME DEVEL 291: replace this outer test with iHMMCount>0*/
if (iHMMCount>1) {
Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
}
- prHMM = &(prHMMList[0]);
+ prHMM = &(prHMMList[0]); /* FIXME DEVEL 291: check for NULL */
} else {
/* FIXME: prHMM not allowed to be NULL and therefore pseudo allocated here */
prHMM = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
}
+#else
+ prHMMnull = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
+ if ( (iHMMCount > 0) && (NULL != prHMMList) ){
+ prHMM = &(prHMMList[0]);
+ if (iHMMCount > 1) {
+ Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
+ }
+ }
+ else {
+ prHMM = prHMMnull; /* prHMM not allowed to be NULL and therefore assigned to pseudo allocated */
+ }
+#endif
assert(NULL!=prMSeq);
* protein
*/
if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
- Log(&rLog, LOG_WARN, "Sequence type is %s. HHalign has only been tested on protein.",
- SeqTypeToStr(prMSeq->seqtype));
+ /*Log(&rLog, LOG_WARN, "%s alignment is still experimental.",
+ SeqTypeToStr(prMSeq->seqtype));*/
+ if(prMSeq->seqtype == SEQTYPE_DNA)
+ rHhalignPara.bIsDna = true;
+ if(prMSeq->seqtype == SEQTYPE_RNA)
+ rHhalignPara.bIsRna = true;
}
/* hhalign produces funny results if sequences contain gaps, so
NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
"Progressive alignment progress", bPrintCR);
+
/* Profile-profile alignment? Then setup piLeafCount and
* piLeafList here. FIXME this is just an awful haaaack
}
}
+#if 1 /* DEVEL 291 */
+ /*
+ * Note: if there is a HMM-batch file, then prMSeq->ppiHMMBindex is not NULL;
+ * ppiLeafList[iL/iR][0] is the 'lead' sequence in a profile;
+ * prMSeq->ppiHMMBindex[ppiLeafList[iL][0]] are the HMMs associated with the lead sequence;
+ * this could be NULL if there are no HMMs associated with this particular sequence
+ * at the moment only 1st HMM can be used, prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0];
+ * the index of this HMM can be '-1' if the specified HMM file does not exist
+ * Note: we only use prHMMleft/prHMMrght, even if global HMM (--hmm-in) is specified
+ **/
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) &&
+ (prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] > -1) ){
+ prHMMleft = &(prHMMList[prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0]]);
+ }
+ else if (iHMMCount > 0){
+ prHMMleft = prHMM;
+ }
+ else {
+ prHMMleft = prHMMnull;
+ }
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) &&
+ (prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] > -1) ){
+ prHMMrght = &(prHMMList[prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0]]);
+ }
+ else if (iHMMCount > 0){
+ prHMMrght = prHMM;
+ }
+ else {
+ prHMMrght = prHMMnull;
+ }
+#endif
/* align individual sequences to HMM;
* - use representative sequence to get gapping
* full HMM but that does not seem to work at all
* -- try harder! Fail better!
*/
- if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
+ /*if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){*/
+ if (0 != prHMMleft->L){
int i, j;
- pcReprsnt1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- pcReprsnt1[i] = prHMM->seq[prHMM->ncons][i+1];
+ pcReprsnt1 = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
+ for (i = 0; i < prHMMleft->L; i++){
+ pcReprsnt1[i] = prHMMleft->seq[prHMMleft->ncons][i+1];
}
ppcCopy1 = CKCALLOC(piLeafCount[iL], sizeof(char *));
for (j = 0; j < piLeafCount[iL]; j++){
- ppcCopy1[j] = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
+ ppcCopy1[j] = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
for (i = 0; i < (int) strlen(ppcProfile1[0]); i++){
ppcCopy1[j][i] = ppcProfile1[j][i];
}
FS, r236 -> r237
*/
int iLenA = strlen(ppcCopy1[0]);
- int iLenH = prHMM->L;
+ int iLenH = prHMMleft->L;
int iHHret = 0;
if (iLenH < iLenA){
iHHret = hhalign(ppcReprsnt1, 0/* only one representative seq*/, NULL,
ppcCopy1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMleft,
NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
}
else {
iHHret = hhalign(ppcCopy1, piLeafCount[iL], pdWeightsL,
ppcReprsnt1, 0/* only one representative seq*/, NULL,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMleft,
NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
}
- if (0 != iHHret){ /* FS, r255 -> */
+ if ( (0 != iHHret) && (rLog.iLogLevelEnabled <= LOG_VERBOSE) ){ /* FS, r255 -> */
fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
- "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\tCARRY ON REGARDLESS\n",
+ "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\n%s\nCARRY ON REGARDLESS\n",
__FUNCTION__, __FILE__, __LINE__, iHHret,
- piLeafCount[iL], (int)strlen(ppcCopy1[0]), prMSeq->sqinfo[ppiLeafList[iL][0]].name, (int)strlen(ppcReprsnt1[0]));
+ piLeafCount[iL], (int)strlen(ppcCopy1[0]), prMSeq->sqinfo[ppiLeafList[iL][0]].name,
+ (int)strlen(ppcReprsnt1[0]), zcError);
}
}
pdScores[ppiLeafList[iL][0]] = dScore;
* it only contains a gap if all sequences of the profile
* have a gap at this position
*/
- pcConsens1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
+ pcConsens1 = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
+ for (i = 0; i < prHMMleft->L; i++){
for (j = 0, pcConsens1[i] = '-'; (j < piLeafCount[iL]) && ('-' == pcConsens1[i]); j++){
pcConsens1[i] = ppcCopy1[j][i];
}
#endif
} /* ( (1 == piLeafCount[iL]) && (0 != prHMM->L) ) */
- if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
+ /*if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){*/
+ if (0 != prHMMrght->L){
int i, j;
- pcReprsnt2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- pcReprsnt2[i] = prHMM->seq[prHMM->ncons][i+1];
+ pcReprsnt2 = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
+ for (i = 0; i < prHMMrght->L; i++){
+ pcReprsnt2[i] = prHMMrght->seq[prHMMrght->ncons][i+1];
}
ppcCopy2 = CKCALLOC(piLeafCount[iR], sizeof(char *));
for (j = 0; j < piLeafCount[iR]; j++){
- ppcCopy2[j] = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
+ ppcCopy2[j] = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
for (i = 0; i < (int) strlen(ppcProfile2[0]); i++){
ppcCopy2[j][i] = ppcProfile2[j][i];
}
FS, r236 -> r237
*/
int iLenA = strlen(ppcCopy2[0]);
- int iLenH = prHMM->L;
+ int iLenH = prHMMrght->L;
int iHHret = 0;
if (iLenH < iLenA){
iHHret = hhalign(ppcReprsnt2, 0/* only one representative seq */, NULL,
ppcCopy2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMrght,
NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
}
else {
iHHret = hhalign(ppcCopy2, piLeafCount[iR], pdWeightsR,
ppcReprsnt2, 0/* only one representative seq */, NULL,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMrght,
NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
}
- if (0 != iHHret){ /* FS, r255 -> */
+ if ( (0 != iHHret) && (rLog.iLogLevelEnabled <= LOG_VERBOSE) ){ /* FS, r255 -> */
fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
- "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\tCARRY ON REGARDLESS\n",
+ "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\n%s\nCARRY ON REGARDLESS\n",
__FUNCTION__, __FILE__, __LINE__, iHHret,
- piLeafCount[iR], (int)strlen(ppcCopy2[0]), prMSeq->sqinfo[ppiLeafList[iR][0]].name, (int)strlen(ppcReprsnt2[0]));
+ piLeafCount[iR], (int)strlen(ppcCopy2[0]), prMSeq->sqinfo[ppiLeafList[iR][0]].name,
+ (int)strlen(ppcReprsnt2[0]), zcError);
}
}
pdScores[ppiLeafList[iR][0]] = dScore;
* it only contains a gap if all sequences of the profile
* have a gap at this position
*/
- pcConsens2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
+ pcConsens2 = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
+ for (i = 0; i < prHMMrght->L; i++){
for (j = 0, pcConsens2[i] = '-'; (j < piLeafCount[iR]) && ('-' == pcConsens2[i]); j++){
pcConsens2[i] = ppcCopy2[j][i];
}
Log(&rLog, LOG_FATAL, "strlen(prof1)=%d, strlen(prof2)=%d -- nothing to align\n",
iLen1, iLen2);
}
-
if (iLen1 < iLen2){
int iHHret = 0;
int iOldMacRam = rHhalignPara.iMacRamMB;
iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
ppcProfile2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMrght,
pcConsens1, pcReprsnt1,
pcConsens2, pcReprsnt2,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
if (RETURN_OK != iHHret){ /* FS, r241 -> */
+ /*fprintf(stderr, "%s:%d: emergency EXIT\n", __FILE__, __LINE__); exit(-1);*/
fprintf(stderr,
"%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
+ "\thh-error-code=%d (mac-ram=%d)\n%s",
__FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
- iHHret, rHhalignPara.iMacRamMB);
+ iHHret, rHhalignPara.iMacRamMB, zcError);
/* at this stage hhalign() has failed,
the only thing we can do (easily) is to re-run it in Viterbi mode,
for this set MAC-RAM=0, set it back to its original value after 2nd try.
FS, r241 -> r243 */
+
if (RETURN_FROM_MAC == iHHret){
/* Note: the default way to run hhalign() is to initially select MAC
by giving it all the memory it needs. MAC may fail due to overflow (repeats?).
iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
ppcProfile2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
+ &dScore, prHMMleft, prHMMrght,
pcConsens1, pcReprsnt1,
pcConsens2, pcReprsnt2,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
- fprintf(stderr, "%s:%s:%d: problem in alignment, even Viterbi did not work\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
- __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB);
+ fprintf(stderr, "%s:%s:%d: problem in alignment, Viterbi did not work\n"
+ "\thh-error-code=%d (mac-ram=%d)\n%s",
+ __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB, zcError);
Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
}
else {
int iOldMacRam = rHhalignPara.iMacRamMB;
iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
ppcProfile1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMleft,
pcConsens2, pcReprsnt2,
pcConsens1, pcReprsnt1,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
if (RETURN_OK != iHHret){ /* FS, r241 -> r243 */
+ /*fprintf(stderr, "%s:%d: emergency EXIT\n", __FILE__, __LINE__); exit(-1);*/
fprintf(stderr,
"%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
+ "\thh-error-code=%d (mac-ram=%d)\n%s",
__FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
- iHHret, rHhalignPara.iMacRamMB);
+ iHHret, rHhalignPara.iMacRamMB, zcError);
/* at this stage hhalign() has failed,
the only thing we can do (easily) is to re-run it in Viterbi mode,
for this set MAC-RAM=0, set it back to its original value after 2nd try.
FS, r241 -> r243 */
+
if (RETURN_FROM_MAC == iHHret){
/* see above */
rHhalignPara.iMacRamMB = 0;
iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
ppcProfile1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
+ &dScore, prHMMrght, prHMMleft,
pcConsens2, pcReprsnt2,
pcConsens1, pcReprsnt1,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
+ rHhalignPara, &rHHscores,
+ iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
zcAux, zcError);
if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
- fprintf(stderr, "%s:%s:%d: problem in alignment, even Viterbi did not work\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
- __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB);
+ fprintf(stderr, "%s:%s:%d: problem in alignment, Viterbi did not work\n"
+ "\thh-error-code=%d (mac-ram=%d)\n%s",
+ __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB, zcError);
Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
}
else {
} /* 1st invocation failed */
} /* 2nd profile was shorter than 1st */
+
+ /*
+ * at this stage have performed alignment of 2 profiles/sequences.
+ * if HMM batch information had been used then have choices:
+ * (i) if HMM info only intended for initial alignment (of sequences) then make both HMMs stale;
+ * (iia) if alignment of 2 profiles/sequences where same HMM used, then retain;
+ * (iib) if alignment of 2 profiles/sequences where different HMMs used, then make both stale;
+ * (iii) some majority voting
+ */
+#if 0 /* always make HMM batch stale (after 1st invocation) */
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) ){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1;
+ }
+ if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) ){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1;
+ }
+#else /* retain HMMs if they were the same for both profiles */
+ if (NULL != prMSeq->ppiHMMBindex) {
+ int i;
+
+ if ( (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) ){
+ if ( prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] == -1){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1; /* this is conservative, could do H[iL] = H[iR] */
+ for (i = 0; i < piLeafCount[iR]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][i]][0] = -1;
+ }
+ }
+ else if ( prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] == -1){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1; /* this is conservative, could do H[iR] = H[iL] */
+ for (i = 0; i < piLeafCount[iL]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][i]][0] = -1;
+ }
+ }
+ else if (prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0]){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1; /* this is NOT conservative, mandatory */
+ for (i = 0; i < piLeafCount[iL]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iL][i]][0] = -1;
+ }
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1; /* this is NOT conservative, mandatory */
+ for (i = 0; i < piLeafCount[iR]; i++){
+ prMSeq->ppiHMMBindex[ppiLeafList[iR][i]][0] = -1;
+ }
+ }
+ else {
+ /* void, HMMs should be same */
+ }
+ }
+ } /* there was a HMM batch */
+#endif
+ if (rLog.iLogLevelEnabled <= LOG_DEBUG){
+ int i;
+
+ printf("@@iL=%d, #(iL)=%d, iR=%d, #(iR)=%d\n", iL, piLeafCount[iL], iR, piLeafCount[iR]);
+ for (i = 0; i < piLeafCount[iL]; i++){
+ char *pc = ppcProfile1[i];
+ printf("@@>%s\n", prMSeq->sqinfo[ppiLeafList[iL][i]].name);
+ printf("@@");
+ while('\0' != *pc){
+ printf("%c", toupper(*pc));
+ pc++;
+ }
+ printf("\n");
+ }
+ for (i = 0; i < piLeafCount[iR]; i++){
+ char *pc = ppcProfile2[i];
+ printf("@@>%s\n", prMSeq->sqinfo[ppiLeafList[iR][i]].name);
+ printf("@@");
+ while('\0' != *pc){
+ printf("%c", toupper(*pc));
+ pc++;
+ }
+ printf("\n");
+ }
+ printf("\n");
+ } /* LOG_DEBUG */
+
}
/* free left/right node lists,
* after alignment left/right profiles no longer needed
ppiLeafList[iL] = CKFREE(ppiLeafList[iL]);
ppiLeafList[iR] = CKFREE(ppiLeafList[iR]);
piLeafCount[iL] = piLeafCount[iR] = 0;
-
+
} /* was a merge node */
if (rLog.iLogLevelEnabled <= LOG_DEBUG){
TranslateUnknown2Ambiguity(prMSeq);
ReAttachLeadingGaps(prMSeq, iProfProfSeparator);
+#if 0 /* DEVEL 291 */
if (NULL == prHMMList){
CKFREE(prHMM);
}
+#else
+ CKFREE(prHMMnull);
+#endif
CKFREE(ppcProfile2);
CKFREE(ppcProfile1);
CKFREE(ppiLeafList[piOrderLR[DIFF_NODE*(iNodeCount-1)+PRNT_NODE]]);