+++ /dev/null
-/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
-
-/*********************************************************************
- * Clustal Omega - Multiple sequence alignment
- *
- * Copyright (C) 2010 University College Dublin
- *
- * Clustal-Omega is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This file is part of Clustal-Omega.
- *
- ********************************************************************/
-
-/*
- * RCS $Id: hhalign_wrapper.c 256 2011-06-23 13:57:28Z fabian $
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <ctype.h>
-
-#include "seq.h"
-#include "tree.h"
-#include "progress.h"
-#include "hhalign/general.h"
-#include "hhalign/hhfunc.h"
-#include "hhalign/hhalign.h"
-
-/* up to this level (from leaf) will background HMM info be applied */
-#define APPLY_BG_HMM_UP_TO_TREE_DEPTH 10
-
-#define TIMING 0
-
-#define TRACE 0
-
-/**
- * @brief get rid of unknown residues
- *
- * @note HHalignWrapper can be entered in 2 different ways: (i) all
- * sequences are un-aligned (ii) there are 2 (aligned) profiles. in
- * the un-aligned case (i) the sequences come straight from Squid,
- * that is, they have been sanitised, all non-alphabetic residues
- * have been rendered as X's. In profile mode (ii) one profile may
- * have been produced internally. In that case residues may have
- * been translated back into their 'native' form, that is, they may
- * contain un-sanitised residues. These will cause trouble during
- * alignment
- * FS, r213->214
- */
-void
-SanitiseUnknown(mseq_t *mseq)
-{
-
- int iS; /* iterator for sequence */
- int iR; /* iterator for residue */
- int iLen; /* length of sequence */
- char *pcRes = NULL;
-
-
- for (iS = 0; iS < mseq->nseqs; iS++){
-
- for (pcRes = mseq->seq[iS]; '\0' != *pcRes; pcRes++){
-
- if (isgap(*pcRes)){
- continue;
- }
-
- if (mseq->seqtype==SEQTYPE_PROTEIN) {
- if (NULL == strchr(AMINO_ALPHABET, toupper(*pcRes))) {
- *pcRes = AMINOACID_ANY;
- }
- } else if (mseq->seqtype==SEQTYPE_DNA) {
- if (NULL == strchr(DNA_ALPHABET, toupper(*pcRes))) {
- *pcRes = NUCLEOTIDE_ANY;
- }
- } else if (mseq->seqtype==SEQTYPE_RNA) {
- if (NULL == strchr(RNA_ALPHABET, toupper(*pcRes))) {
- *pcRes = NUCLEOTIDE_ANY;
- }
- }
-
- } /* !EO String */
-
- } /* 0 <= iS < mseq->nseqs */
-
- return;
-
-} /*** end: SanitiseUnknown() ***/
-
-/**
- * @brief translate unknown residues back to ambiguity codes;
- * hhalign translates ambiguity codes (B,Z) into unknown residue (X).
- * we still have the original (un-aligned) residue information,
- * by iterating along the original and aligned sequences we can
- * reconstruct where codes have been changed and restore them
- * to their original value
- *
- * @param[in,out] mseq
- * sequence/profile data, mseq->seq [in,out] is changed to conform
- * with mseq->orig_seq [in]
- *
- */
-void
-TranslateUnknown2Ambiguity(mseq_t *mseq)
-{
-
- int iS; /* iterator for sequence */
- int iR, iRo; /* iterator for residue (original) */
- int iChange, iCase, iAmbi; /* counts how many replacements */
- static int siOffset = 'a' - 'A';
-
- for (iS = 0; iS < mseq->nseqs; iS++){
-
- iR = iRo = 0;
- iChange = iCase = iAmbi = 0;
-
- while(('\0' != mseq->seq[iS][iR]) &&
- ('\0' != mseq->orig_seq[iS][iRo])) {
-
- /* skip gaps in aligned sequences */
- while(isgap(mseq->seq[iS][iR])) {
- iR++;
- } /* was gap in unaligned seq
- * this should probably not happen */
- while(isgap(mseq->orig_seq[iS][iRo])) {
- iRo++;
- } /* was gap in aligned seq */
-
- /* check if we reached the end of the sequence after
- * skipping the gaps */
- if ( ('\0' == mseq->seq[iS][iR]) ||
- ('\0' == mseq->orig_seq[iS][iRo]) ){
- break;
- }
-
-
- if (mseq->seq[iS][iR] != mseq->orig_seq[iS][iRo]){
- /* FIXME: count replacements, discard case changes */
- iChange++;
- if ( (mseq->seq[iS][iR] == mseq->orig_seq[iS][iRo]+siOffset) ||
- (mseq->seq[iS][iR] == mseq->orig_seq[iS][iRo]-siOffset) ){
- iCase++;
- }
- else {
- iAmbi++;
- }
-#if TRACE
- Log(&rLog, LOG_FORCED_DEBUG, "seq=%d, pos=(%d:%d), (%c:%c)",
- iS, iR, iRo,
- mseq->seq[iS][iR], mseq->orig_seq[iS][iRo]);
-#endif
- mseq->seq[iS][iR] = mseq->orig_seq[iS][iRo];
- }
-
- iR++;
- iRo++;
-
- } /* !EO seq */
-
- Log(&rLog, LOG_DEBUG,
- "in seq %d re-translated %d residue codes (%d true, %d case)",
- iS, iChange, iAmbi, iCase);
-
- /* assert that both sequences (un/aligned) have terminated */
- /* skip gaps in aligned sequences */
- while(isgap(mseq->seq[iS][iR])) {
- iR++;
- } /* was gap in unaligned seq
- * this should probably not happen */
- while(isgap(mseq->orig_seq[iS][iRo])) {
- iRo++;
- } /* was gap in aligned seq */
-
-
- if ( ('\0' != mseq->seq[iS][iR]) ||
- ('\0' != mseq->orig_seq[iS][iRo]) ){
-
- Log(&rLog, LOG_FATAL, "inconsistency in un/aligned sequence %d\n>%s\n>%s\n",
- iS, mseq->seq[iS], mseq->orig_seq[iS]);
- }
-
- } /* 0 <= iS < mseq->nseqs */
-
-} /*** end: TranslateUnknown2Ambiguity() ***/
-
-
-/**
- * @brief re-attach leading and trailing gaps to alignment
- *
- * @param[in,out] prMSeq,
- * alignment structure (at this stage there should be no un-aligned sequences)
- * @param[in] iProfProfSeparator
- * gives sizes of input profiles, -1 if no input-profiles but un-aligned sequences
- *
- * @note leading and tailing profile columns
- * that only contain gaps have no effect on the alignment
- * and are removed during the alignment. if they are
- * encountered a warning message is printed to screen.
- * some users like to preserve these gap columns
- * FS, r213->214
- */
-void
-ReAttachLeadingGaps(mseq_t *prMSeq, int iProfProfSeparator)
-{
-
- int i, j;
- int iSize1 = 0; /* #seqs in 1st profile */
- int iSize2 = 0; /* #seqs in 2nd profile */
- int iPPS = iProfProfSeparator;
- int iLeadO1 = 0; /* leading gaps in 1st seq of 1st profile */
- int iLeadO2 = 0; /* leading gaps in 1st seq of 2nd profile */
- int iLeadA1 = 0; /* leading gaps in 1st seq of final alignment */
- int iLeadA2 = 0; /* leading gaps in PPS seq of final alignment */
- int iTrailO1 = 0; /* trailing gaps in 1st seq of 1st profile */
- int iTrailO2 = 0; /* trailing gaps in 1st seq of 2nd profile */
- int iTrailA1 = 0; /* trailing gaps in 1st seq of final alignment */
- int iTrailA2 = 0; /* trailing gaps in PPS seq of final alignment */
- int iLen = 0; /* length of final alignment */
- int iLen1 = 0; /* length of 1st profile */
- int iLen2 = 0; /* length of 2nd profile */
- int iCutHead = 0; /* make up truncation at head */
- int iCutTail = 0; /* make up truncation at tail */
- char *pcIter = NULL;
-
- if (-1 == iProfProfSeparator){
- return;
- }
- else {
- assert(prMSeq->aligned);
- assert(prMSeq->nseqs > iProfProfSeparator);
- }
- iSize1 = iProfProfSeparator;
- iSize2 = prMSeq->nseqs - iProfProfSeparator;
- iLen = strlen(prMSeq->seq[0]);
- iLen1 = strlen(prMSeq->orig_seq[0]);
- iLen2 = strlen(prMSeq->orig_seq[iPPS]);
-
- /* count leading/trailing gaps in 1st sequence of 1st/2nd profile and final alignmant */
- for (iLeadO1 = 0, pcIter = prMSeq->orig_seq[0]; isgap(*pcIter); pcIter++, iLeadO1++);
- for (iLeadO2 = 0, pcIter = prMSeq->orig_seq[iPPS]; isgap(*pcIter); pcIter++, iLeadO2++);
- for (iLeadA1 = 0, pcIter = prMSeq->seq[0]; isgap(*pcIter); pcIter++, iLeadA1++);
- for (iLeadA2 = 0, pcIter = prMSeq->seq[iPPS]; isgap(*pcIter); pcIter++, iLeadA2++);
- for (iTrailO1 = 0, pcIter = &prMSeq->orig_seq[0][iLen1-1]; isgap(*pcIter); pcIter--, iTrailO1++);
- for (iTrailO2 = 0, pcIter = &prMSeq->orig_seq[iPPS][iLen2-1]; isgap(*pcIter); pcIter--, iTrailO2++);
- for (iTrailA1 = 0, pcIter = &prMSeq->seq[0][iLen-1]; isgap(*pcIter); pcIter--, iTrailA1++);
- for (iTrailA2 = 0, pcIter = &prMSeq->seq[iPPS][iLen-1]; isgap(*pcIter); pcIter--, iTrailA2++);
-
-
- /* turn leading/trailing gaps into truncation */
- iLeadO1 = iLeadO1 > iLeadA1 ? iLeadO1-iLeadA1 : 0;
- iLeadO2 = iLeadO2 > iLeadA2 ? iLeadO2-iLeadA2 : 0;
- iTrailO1 = iTrailO1 > iTrailA1 ? iTrailO1-iTrailA1 : 0;
- iTrailO2 = iTrailO2 > iTrailA2 ? iTrailO2-iTrailA2 : 0;
- iCutHead = iLeadO1 > iLeadO2 ? iLeadO1 : iLeadO2;
- iCutTail = iTrailO1 > iTrailO2 ? iTrailO1 : iTrailO2;
-
- /* re-allocate and shift memory */
- if ( (iCutHead > 0) || (iCutTail > 0) ){ /* skip if no re-attachment, FS, r244 -> r245 */
- for (i = 0; i < prMSeq->nseqs; i++){
-
- CKREALLOC(prMSeq->seq[i], iLen+iCutHead+iCutTail+2);
- if (iCutHead > 0){ /* skip if no re-attachment, FS, r244 -> r245 */
- memmove(prMSeq->seq[i]+iCutHead, prMSeq->seq[i], iLen);
- }
- for (j = 0; j < iCutHead; j++){
- prMSeq->seq[i][j] = '-';
- }
- for (j = iLen+iCutHead; j < iLen+iCutHead+iCutTail; j++){
- prMSeq->seq[i][j] = '-';
- }
- prMSeq->seq[i][j] = '\0';
- }
- } /* (iCutHead > 0) || (iCutTail > 0) */
-
-} /*** end: ReAttachLeadingGaps() ***/
-
-/**
- * @brief reallocate enough memory for alignment and
- * attach sequence pointers to profiles
- *
- * @param[in,out] mseq
- * sequence/profile data, increase memory for sequences in profiles
- * @param[out] ppcProfile1
- * pointers to sequencese in 1st profile
- * @param[out] ppcProfile2
- * pointers to sequencese in 2nd profile
- * @param[out] pdWeightsL
- * weights (normalised to 1.0) for sequences in left profile
- * @param[out] pdWeightsR
- * weights (normalised to 1.0) for sequences in right profile
- * @param[in] pdSeqWeights
- * weights for _all_ sequences in alignment
- * @param[in] iLeafCountL
- * number of sequences in 1st profile
- * @param[in] piLeafListL
- * array of integer IDs of sequences in 1st profile
- * @param[in] iLeafCountR
- * number of sequences in 2nd profile
- * @param[in] piLeafListR
- * array of integer IDs of sequences in 2nd profile
- *
- */
-void
-PrepareAlignment(mseq_t *mseq, char **ppcProfile1, char **ppcProfile2,
- double *pdWeightsL, double *pdWeightsR, double *pdSeqWeights,
- int iLeafCountL, int *piLeafListL,
- int iLeafCountR, int *piLeafListR)
-{
-
- int iLenL = 0; /* length of 1st profile */
- int iLenR = 0; /* length of 2nd profile */
- int iMaxLen = 0; /* maximum possible length of alignment */
- int i; /* aux */
- double dWeight = 0.00;
- double dWeightInv = 0.00;
-
- assert(NULL!=mseq);
- assert(NULL!=ppcProfile1);
- assert(NULL!=ppcProfile2);
- assert(NULL!=piLeafListL);
- assert(NULL!=piLeafListR);
-
- /* get length of profiles,
- * in a profile all sequences should have same length so only look at 1st
- */
- iLenL = strlen(mseq->seq[piLeafListL[0]]);
- iLenR = strlen(mseq->seq[piLeafListR[0]]);
- iMaxLen = iLenL + iLenR + 1;
-
- /* reallocate enough memory for sequences in alignment (for adding
- * gaps)
- */
- for (i = 0; i < iLeafCountL; i++){
- mseq->seq[piLeafListL[i]] =
- CKREALLOC(mseq->seq[piLeafListL[i]], iMaxLen);
- }
- for (i = 0; i < iLeafCountR; i++){
- mseq->seq[piLeafListR[i]] =
- CKREALLOC(mseq->seq[piLeafListR[i]], iMaxLen);
- }
-
- /* attach sequences to profiles
- */
- for (i = 0; i < iLeafCountL; i++){
- ppcProfile1[i] = mseq->seq[piLeafListL[i]];
- }
- ppcProfile1[i] = NULL;
-
- for (i = 0; i < iLeafCountR; i++){
- ppcProfile2[i] = mseq->seq[piLeafListR[i]];
- }
- ppcProfile2[i] = NULL;
-
-
- /* remove terminal 'X' for single sequences:
- * it is a quirk of hhalign() to delete 2 individual sequences
- * if 2 terminal X's meet during alignment,
- * just replace (one of) them.
- * this can be undone at the end.
- * profiles -consisting of more than 1 sequence-
- * appear to be all-right.
- * there seems to be no problem with B's and Z's
- */
- if ( (1 == iLeafCountL) && (1 == iLeafCountR) ){
-
- if ( ('X' == ppcProfile1[0][0]) && ('X' == ppcProfile2[0][0]) ){
-#define NOTX 'N'
- ppcProfile1[0][0] = NOTX; /* FIXME: arbitrary assignment */
- ppcProfile2[0][0] = NOTX; /* FIXME: arbitrary assignment */
- }
- if ( ('X' == ppcProfile1[0][iLenL-1]) && ('X' == ppcProfile2[0][iLenR-1]) ){
- ppcProfile1[0][iLenL-1] = NOTX; /* FIXME: arbitrary assignment */
- ppcProfile2[0][iLenR-1] = NOTX; /* FIXME: arbitrary assignment */
- }
- }
-
- /* obtain sequence weights
- */
- if (NULL != pdSeqWeights){
-
- dWeight = 0.00;
- for (i = 0; i < iLeafCountL; i++){
- register double dAux = pdSeqWeights[piLeafListL[i]];
-#ifndef NDEBUG
- if (dAux <= 0.00){
- Log(&rLog, LOG_DEBUG, "seq-weight %d = %f", piLeafListL[i], dAux);
- }
-#endif
- pdWeightsL[i] = dAux;
- dWeight += dAux;
- } /* 0 <= i < iLeafCountL */
- dWeightInv = 1.00 / dWeight;
- for (i = 0; i < iLeafCountL; i++){
- pdWeightsL[i] *= dWeightInv;
- }
-
- dWeight = 0.00;
- for (i = 0; i < iLeafCountR; i++){
- register double dAux = pdSeqWeights[piLeafListR[i]];
-#ifndef NDEBUG
- if (dAux <= 0.00){
- Log(&rLog, LOG_DEBUG, "seq-weight %d = %f", piLeafListR[i], dAux);
- }
-#endif
- pdWeightsR[i] = dAux;
- dWeight += dAux;
- } /* 0 <= i < iLeafCountL */
- dWeightInv = 1.00 / dWeight;
- for (i = 0; i < iLeafCountR; i++){
- pdWeightsR[i] *= dWeightInv;
- }
- } /* (NULL != pdSeqWeights) */
- else {
- pdWeightsL[0] = pdWeightsR[0] = -1.00;
- }
-
-#if TRACE
- for (i = 0; i < iLeafCountL; i++){
- Log(&rLog, LOG_FORCED_DEBUG, "ppcProfile1[%d/%d] pointing to mseq %d = %s",
- i, iLeafCountR, piLeafListL[i], ppcProfile1[i]);
- }
- for (i = 0; i < iLeafCountR; i++){
- Log(&rLog, LOG_FORCED_DEBUG, "ppcProfile2[%d/%d] pointing to mseq %d = %s",
- i, iLeafCountR, piLeafListR[i], ppcProfile2[i]);
- }
-#endif
-
- return;
-
-} /*** end: PrepareAlignment() ***/
-
-
-/**
- * @brief wrapper for hhalign. This is a frontend function to
- * the ported hhalign code.
- *
- * @param[in,out] prMSeq
- * holds the unaligned sequences [in] and the final alignment [out]
- * @param[in] piOrderLR
- * holds order in which sequences/profiles are to be aligned,
- * even elements specify left nodes, odd elements right nodes,
- * if even and odd are same then it is a leaf
- * @param[in] pdSeqWeights
- * Weight per sequence. No weights used if NULL
- * @param[in] iNodeCount
- * number of nodes in tree, piOrderLR has 2*iNodeCount elements
- * @param[in] prHMMList
- * List of background HMMs (transition/emission probabilities)
- * @param[in] iHMMCount
- * Number of input background HMMs
- * @param[in] iProfProfSeparator
- * Gives the number of sequences in the first profile, if in
- * profile/profile alignment mode (iNodeCount==3). That assumes mseqs
- * holds the sequences of profile 1 and profile 2.
- * @param[in] rHhalignPara
- * various parameters read from commandline
- *
- * @return score of the alignment FIXME what is this?
- *
- * @note complex function. could use some simplification, more and
- * documentation and a struct'uring of piOrderLR
- *
- * @note HHalignWrapper can be entered in 2 different ways:
- * (i) all sequences are un-aligned (ii) there are 2 (aligned) profiles.
- * in the un-aligned case (i) the sequences come straight from Squid,
- * that is, they have been sanitised, all non-alphabetic residues
- * have been rendered as X's. In profile mode (ii) one profile may
- * have been produced internally. In that case residues may have
- * been translated back into their 'native' form, that is, they
- * may contain un-sanitised residues. These will cause trouble
- * during alignment
- *
- * @note: introduced argument hhalign_para rHhalignPara; FS, r240 -> r241
- * @note: if hhalign() fails then try with Viterbi by setting MAC-RAM=0; FS, r241 -> r243
- */
-
-double
-HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
- double *pdSeqWeights, int iNodeCount,
- hmm_light *prHMMList, int iHMMCount,
- int iProfProfSeparator, hhalign_para rHhalignPara)
-{
- int iN; /* node iterator */
- int *piLeafCount = NULL; /* number of leaves beneath a certain node */
- int **ppiLeafList = NULL; /* list of leaves beneath a certain node */
- char **ppcProfile1 = NULL; /* pointer to sequences in profile */
- char **ppcProfile2 = NULL; /* pointer to sequences in profile */
- char *pcReprsnt1 = NULL; /* representative of HMM aligned to left */
- char *pcReprsnt2 = NULL; /* representative of HMM aligned to right */
- char **ppcReprsnt1 = &pcReprsnt1; /* representative of HMM aligned to L */
- char **ppcReprsnt2 = &pcReprsnt2; /* representative of HMM aligned to R */
- char *pcConsens1 = NULL; /* copy of left sequence */
- char *pcConsens2 = NULL; /* copy of right sequence */
- char **ppcCopy1 = /*&pcCopy1*/NULL; /* copy of left sequences */
- char **ppcCopy2 = /*&pcCopy2*/NULL; /* copy of right sequences */
- double *pdScores = NULL; /* alignment scores (seq/HMM) */
- double dScore = 0.0; /* alignment score (seq/HMM) */
- int iAux_FS = 0;
- char zcAux[10000] = {0};
- char zcError[10000] = {0};
- int i; /* aux */
- progress_t *prProgress;
- int iAlnLen; /* alignment length */
- double *pdWeightsL = NULL; /* sequence weights of left profile */
- double *pdWeightsR = NULL; /* sequence weights of right profile */
- int iMergeNodeCounter = 0;
- hmm_light *prHMM = NULL;
- bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
-#if TIMING
- char zcStopwatchMsg[1024];
- Stopwatch_t *stopwatch = StopwatchCreate();
- StopwatchZero(stopwatch);
- StopwatchStart(stopwatch);
-#endif
-
-
- if (NULL != prHMMList) {
- if (iHMMCount>1) {
- Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
- }
- prHMM = &(prHMMList[0]);
- } else {
- /* FIXME: prHMM not allowed to be NULL and therefore pseudo allocated here */
- prHMM = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
- }
-
- assert(NULL!=prMSeq);
-
- if (NULL==piOrderLR) {
- assert(3==iNodeCount);
- }
- SanitiseUnknown(prMSeq);
-
- /* hhalign was not made for DNA/RNA. So warn if sequences are not
- * protein
- */
- if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
- Log(&rLog, LOG_WARN, "Sequence type is %s. HHalign has only been tested on protein.",
- SeqTypeToStr(prMSeq->seqtype));
- }
-
- /* hhalign produces funny results if sequences contain gaps, so
- * dealign. Only way to use alignment info is to use it as a
- * background HMM
- */
- if (TRUE == prMSeq->aligned) {
- Log(&rLog, LOG_DEBUG, "Dealigning aligned sequences (inside %s)", __FUNCTION__);
- DealignMSeq(prMSeq);
- }
-
-#if TRACE
- Log(&rLog, LOG_FORCED_DEBUG, "iNodeCount = %d", iNodeCount);
-#endif
-
-
- /* allocate top-level memory for leaf tracking arrays and profiles,
- * and sequence weights*/
- piLeafCount = CKCALLOC(iNodeCount, sizeof(int));
- ppiLeafList = CKCALLOC(iNodeCount, sizeof(int *));
- ppcProfile1 = CKCALLOC(prMSeq->nseqs*2-1, sizeof(char *));
- ppcProfile2 = CKCALLOC(prMSeq->nseqs*2-1, sizeof(char *));
- pdScores = CKCALLOC(iNodeCount, sizeof(double));
- pdWeightsL = CKCALLOC(iNodeCount, sizeof(double));
- pdWeightsR = CKCALLOC(iNodeCount, sizeof(double));
-
- NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
- "Progressive alignment progress", bPrintCR);
-
-
- /* Profile-profile alignment? Then setup piLeafCount and
- * piLeafList here. FIXME this is just an awful haaaack
- */
- if (iNodeCount==3 && NULL==piOrderLR) {
- int iSizeProf1 = iProfProfSeparator;
- int iSizeProf2 = prMSeq->nseqs - iProfProfSeparator;
-
- piLeafCount[0] = iSizeProf1;
- ppiLeafList[0] = (int *)CKMALLOC(iSizeProf1 * sizeof(int));
- for (i=0;i<iSizeProf1;i++) {
- ppiLeafList[0][i] = i;
- }
- piLeafCount[1] = iSizeProf2;
- ppiLeafList[1] = (int *)CKMALLOC(iSizeProf2 * sizeof(int));
- for (i=0;i<iSizeProf2;i++) {
- ppiLeafList[1][i] = i+iSizeProf1;
- }
-
- /* awful hack inside an awful hack: we had to setup piLeafCount
- * and piLeafList outside the node iteration. this which is
- * normally done at leaf level inside the node iteration. to
- * avoid overwriting the already setup vars set...
- */
- iNodeCount=1;
-
- piOrderLR = (int *)CKCALLOC(DIFF_NODE, sizeof(int));
- piOrderLR[LEFT_NODE] = 0;
- piOrderLR[RGHT_NODE] = 1;
- piOrderLR[PRNT_NODE] = 2;
- }
-
-
-
- iMergeNodeCounter = 0;
- for (iN = 0; iN < iNodeCount; iN++){
-
- register int riAux = DIFF_NODE * iN;
-
- /*LOG_DEBUG("node %d ", iN);*/
-
- if (piOrderLR[riAux+LEFT_NODE] == piOrderLR[riAux+RGHT_NODE]){
-
- register int riLeaf = piOrderLR[riAux+LEFT_NODE];
-#if TRACE
- if (NULL == pdSeqWeights) {
- Log(&rLog, LOG_FORCED_DEBUG, "node %d is a leaf with entry %d (seq %s)",
- iN, riLeaf, prMSeq->sqinfo[riLeaf].name);
- } else {
- Log(&rLog, LOG_FORCED_DEBUG, "node %d is a leaf with entry %d (seq %s) and weight %f",
- iN, riLeaf, prMSeq->sqinfo[riLeaf].name, pdSeqWeights[riLeaf]);
- }
-#endif
- /* left/right entry same, this is a leaf */
- piLeafCount[piOrderLR[riAux+PRNT_NODE]] = 1; /* number of leaves is '1' */
- ppiLeafList[piOrderLR[riAux+PRNT_NODE]] = (int *)CKMALLOC(1 * sizeof(int));
- ppiLeafList[piOrderLR[riAux+PRNT_NODE]][0] = riLeaf;
-
- } /* was a leaf */
- else {
- int iL, iR, iP; /* ID of left/right nodes, parent */
- int i, j; /* aux */
-
- Log(&rLog, LOG_DEBUG,
- "merge profiles at node %d", iN, piOrderLR[riAux]);
-
- /* iNodeCount - prMSeq->nseqs = total # of merge-nodes
- * unless in profile/profile alignment mode
- */
- if (1 == iNodeCount) {
- ProgressLog(prProgress, ++iMergeNodeCounter,
- 1, FALSE);
- } else {
- ProgressLog(prProgress, ++iMergeNodeCounter,
- iNodeCount - prMSeq->nseqs, FALSE);
- }
-
- /* left/right entry are not same, this is a merge node */
- iL = piOrderLR[riAux+LEFT_NODE];
- iR = piOrderLR[riAux+RGHT_NODE];
- iP = piOrderLR[riAux+PRNT_NODE];
- piLeafCount[iP] = piLeafCount[iL] + piLeafCount[iR];
- ppiLeafList[iP] = (int *)CKMALLOC(piLeafCount[iP] * sizeof(int));
-
- for (i = j = 0; i < piLeafCount[iL]; i++, j++){
- ppiLeafList[iP][j] = ppiLeafList[iL][i];
- }
- for (i = 0; i < piLeafCount[iR]; i++, j++){
- ppiLeafList[iP][j] = ppiLeafList[iR][i];
- }
-
- /* prepare simulation arena:
- * - make sure enough memory in sequences
- * - attach sequence pointers to profiles
- */
- /* idea: switch template and query according to nseq? */
-
- PrepareAlignment(prMSeq, ppcProfile1, ppcProfile2,
- pdWeightsL, pdWeightsR, pdSeqWeights,
- piLeafCount[iL], ppiLeafList[iL],
- piLeafCount[iR], ppiLeafList[iR]);
- if (rLog.iLogLevelEnabled <= LOG_DEBUG){
- int i;
- FILE *fp = LogGetFP(&rLog, LOG_DEBUG);
- Log(&rLog, LOG_DEBUG, "merging profiles %d & %d", iL, iR);
- for (i = 0; i < piLeafCount[iL]; i++){
- fprintf(fp, "L/#=%3d (ID=%3d, w=%f): %s\n",
- i, ppiLeafList[iL][i], pdWeightsL[i], ppcProfile1[i]);
- }
- for (i = 0; i < piLeafCount[iR]; i++){
- fprintf(fp, "R/#=%3d (ID=%3d, w=%f): %s\n",
- i, ppiLeafList[iR][i], pdWeightsR[i], ppcProfile2[i]);
- }
- }
-
-
-
- /* align individual sequences to HMM;
- * - use representative sequence to get gapping
- * - create copies of both, individual/representative sequences
- * as we don't want to introduce gaps into original
- *
- * FIXME: representative sequence is crutch, should use
- * full HMM but that does not seem to work at all
- * -- try harder! Fail better!
- */
- if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
- int i, j;
- pcReprsnt1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- pcReprsnt1[i] = prHMM->seq[prHMM->ncons][i+1];
- }
- ppcCopy1 = CKCALLOC(piLeafCount[iL], sizeof(char *));
- for (j = 0; j < piLeafCount[iL]; j++){
- ppcCopy1[j] = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < (int) strlen(ppcProfile1[0]); i++){
- ppcCopy1[j][i] = ppcProfile1[j][i];
- }
- }
-
- {
- /* the size of the elements in the forward/backward matrices
- depends very much on the lengths of the profiles _and_
- in which position (1st/2nd) the longer/shorter profile/HMM is.
- the matrix elements can easily exceed the size of a (long?) double
- if the longer profile/HMM is associated with the query (q) and the
- shorter with the target (t).
- FIXME: however, pseudo-count adding may also depend on position,
- this is only really tested for the HMM being in the 1st position (q)
- MUST TEST THIS MORE THOROUGHLY
-
- this switch appears to be most easily (although unelegantly)
- effected here. Don't want to do it (upstairs) in PrepareAlignment()
- because it might jumble up the nodes. Don't want to do it in hhalign()
- either because ppcProfile1/2 and q/t may be used independently.
- FS, r236 -> r237
- */
- int iLenA = strlen(ppcCopy1[0]);
- int iLenH = prHMM->L;
- int iHHret = 0;
-
- if (iLenH < iLenA){
- iHHret = hhalign(ppcReprsnt1, 0/* only one representative seq*/, NULL,
- ppcCopy1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
- NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
- }
- else {
- iHHret = hhalign(ppcCopy1, piLeafCount[iL], pdWeightsL,
- ppcReprsnt1, 0/* only one representative seq*/, NULL,
- &dScore, prHMM,
- NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
- }
- if (0 != iHHret){ /* FS, r255 -> */
- fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
- "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\tCARRY ON REGARDLESS\n",
- __FUNCTION__, __FILE__, __LINE__, iHHret,
- piLeafCount[iL], (int)strlen(ppcCopy1[0]), prMSeq->sqinfo[ppiLeafList[iL][0]].name, (int)strlen(ppcReprsnt1[0]));
- }
- }
- pdScores[ppiLeafList[iL][0]] = dScore;
-#if 0
- printf("score: %f\nL: %s\nH: %s\n",
- dScore, ppcCopy1[0], ppcReprsnt1[0]);
-#endif
- /* assemble 'consensus';
- * this is not a real consensus, it is more a gap indicator,
- * for each position it consists of residues/gaps in the 1st sequences,
- * or a residue (if any) of the other sequences.
- * it only contains a gap if all sequences of the profile
- * have a gap at this position
- */
- pcConsens1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- for (j = 0, pcConsens1[i] = '-'; (j < piLeafCount[iL]) && ('-' == pcConsens1[i]); j++){
- pcConsens1[i] = ppcCopy1[j][i];
- }
- }
-#if 0
- for (j = 0; (j < piLeafCount[iL]); j++){
- printf("L%d:%s\n", j, ppcCopy1[j]);
- }
- printf("LC:%s\n", pcConsens1);
-#endif
- } /* ( (1 == piLeafCount[iL]) && (0 != prHMM->L) ) */
-
- if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
- int i, j;
-
- pcReprsnt2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- pcReprsnt2[i] = prHMM->seq[prHMM->ncons][i+1];
- }
- ppcCopy2 = CKCALLOC(piLeafCount[iR], sizeof(char *));
- for (j = 0; j < piLeafCount[iR]; j++){
- ppcCopy2[j] = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < (int) strlen(ppcProfile2[0]); i++){
- ppcCopy2[j][i] = ppcProfile2[j][i];
- }
- }
-
- {
- /* the size of the elements in the forward/backward matrices
- depends very much on the lengths of the profiles _and_
- in which position (1st/2nd) the longer/shorter profile/HMM is.
- the matrix elements can easily exceed the size of a (long?) double
- if the longer profile/HMM is associated with the query (q) and the
- shorter with the target (t).
- FIXME: however, pseudo-count adding may also depend on position,
- this is only really tested for the HMM being in the 1st position (q)
- MUST TEST THIS MORE THOROUGHLY
-
- this switch appears to be most easily (although unelegantly)
- effected here. Don't want to do it (upstairs) in PrepareAlignment()
- because it might jumble up the nodes. Don't want to do it in hhalign()
- either because ppcProfile1/2 and q/t may be used independently.
- FS, r236 -> r237
- */
- int iLenA = strlen(ppcCopy2[0]);
- int iLenH = prHMM->L;
- int iHHret = 0;
-
- if (iLenH < iLenA){
- iHHret = hhalign(ppcReprsnt2, 0/* only one representative seq */, NULL,
- ppcCopy2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
- NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
- }
- else {
- iHHret = hhalign(ppcCopy2, piLeafCount[iR], pdWeightsR,
- ppcReprsnt2, 0/* only one representative seq */, NULL,
- &dScore, prHMM,
- NULL, NULL, NULL, NULL,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
- }
- if (0 != iHHret){ /* FS, r255 -> */
- fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
- "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\tCARRY ON REGARDLESS\n",
- __FUNCTION__, __FILE__, __LINE__, iHHret,
- piLeafCount[iR], (int)strlen(ppcCopy2[0]), prMSeq->sqinfo[ppiLeafList[iR][0]].name, (int)strlen(ppcReprsnt2[0]));
- }
- }
- pdScores[ppiLeafList[iR][0]] = dScore;
-#if 0
- printf("H: %s\nR: %s\nscore: %f\n",
- ppcReprsnt2[0], ppcCopy2[0], dScore);
-#endif
- /* assemble 'consensus';
- * this is not a real consensus, it is more a gap indicator,
- * for each position it consists of residues/gaps in the 1st sequences,
- * or a residue (if any) of the other sequences.
- * it only contains a gap if all sequences of the profile
- * have a gap at this position
- */
- pcConsens2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
- for (i = 0; i < prHMM->L; i++){
- for (j = 0, pcConsens2[i] = '-'; (j < piLeafCount[iR]) && ('-' == pcConsens2[i]); j++){
- pcConsens2[i] = ppcCopy2[j][i];
- }
- }
-#if 0
- for (j = 0; (j < piLeafCount[iR]); j++){
- printf("R%d:%s\n", j, ppcCopy2[j]);
- }
- printf("RC:%s\n", pcConsens2);
-#endif
- } /* ( (1 == piLeafCount[iR]) && (0 != prHMM->L) ) */
-
-
-
- /* do alignment here (before free)
- */
- {
- /* the size of the elements in the forward/backward matrices
- depends very much on the lengths of the profiles _and_
- in which position (1st/2nd) the longer/shorter profile is.
- the matrix elements can easily exceed the size of a (long?) double
- if the longer profile is associated with the query (q) and the
- shorter with the target (t).
- this switch appears to be most easily (although unelegantly)
- effected here. Don't want to do it (upstairs) in PrepareAlignment()
- because it might jumble up the nodes. Don't want to do it in hhalign()
- either because ppcProfile1/2 and q/t may be used independently.
- FS, r228 -> 229
- */
- int iLen1 = strlen(ppcProfile1[0]);
- int iLen2 = strlen(ppcProfile2[0]);
- /* potential problem with empty profiles, FS, r249 -> r250 */
- if ( (0 == iLen1) || (0 == iLen2) ){
- Log(&rLog, LOG_FATAL, "strlen(prof1)=%d, strlen(prof2)=%d -- nothing to align\n",
- iLen1, iLen2);
- }
-
-
- if (iLen1 < iLen2){
- int iHHret = 0;
- int iOldMacRam = rHhalignPara.iMacRamMB;
- iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
- ppcProfile2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
- pcConsens1, pcReprsnt1,
- pcConsens2, pcReprsnt2,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
-
- if (RETURN_OK != iHHret){ /* FS, r241 -> */
- fprintf(stderr,
- "%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
- __FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
- prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
- iHHret, rHhalignPara.iMacRamMB);
- /* at this stage hhalign() has failed,
- the only thing we can do (easily) is to re-run it in Viterbi mode,
- for this set MAC-RAM=0, set it back to its original value after 2nd try.
- FS, r241 -> r243 */
- if (RETURN_FROM_MAC == iHHret){
- /* Note: the default way to run hhalign() is to initially select MAC
- by giving it all the memory it needs. MAC may fail due to overflow (repeats?).
- alternatively, the problem may be (genuinely) too big for MAC.
- in thses cases it is legitimate to switch to Viterbi.
- However, selecting Viterbi from the outset is an abuse (abomination!),
- should this 1st invocation of Viterbi fail, then we (FS) will overrule
- the user and hammer the system with a massive memory request.
- (Jos 2:19) If anyone goes outside your house into the street,
- his blood will be on his own head; we will not be responsible. FS, r246 -> r247 */
- rHhalignPara.iMacRamMB = 0;
- }
- else {
- rHhalignPara.iMacRamMB = REALLY_BIG_MEMORY_MB;
- }
-
- iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
- ppcProfile2, piLeafCount[iR], pdWeightsR,
- &dScore, prHMM,
- pcConsens1, pcReprsnt1,
- pcConsens2, pcReprsnt2,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
- if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
- 1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
- fprintf(stderr, "%s:%s:%d: problem in alignment, even Viterbi did not work\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
- __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB);
- Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
- }
- else {
- fprintf(stderr, "%s:%s:%d: 2nd attempt worked", __FUNCTION__, __FILE__, __LINE__);
- }
- rHhalignPara.iMacRamMB = iOldMacRam;
- } /* 1st invocation failed */
-
- } /* 1st profile was shorter than 2nd */
- else {
- int iHHret = 0;
- int iOldMacRam = rHhalignPara.iMacRamMB;
- iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
- ppcProfile1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
- pcConsens2, pcReprsnt2,
- pcConsens1, pcReprsnt1,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
-
- if (RETURN_OK != iHHret){ /* FS, r241 -> r243 */
- fprintf(stderr,
- "%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
- __FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
- prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
- iHHret, rHhalignPara.iMacRamMB);
- /* at this stage hhalign() has failed,
- the only thing we can do (easily) is to re-run it in Viterbi mode,
- for this set MAC-RAM=0, set it back to its original value after 2nd try.
- FS, r241 -> r243 */
- if (RETURN_FROM_MAC == iHHret){
- /* see above */
- rHhalignPara.iMacRamMB = 0;
- }
- else {
- rHhalignPara.iMacRamMB = REALLY_BIG_MEMORY_MB;
- }
-
- iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
- ppcProfile1, piLeafCount[iL], pdWeightsL,
- &dScore, prHMM,
- pcConsens2, pcReprsnt2,
- pcConsens1, pcReprsnt1,
- rHhalignPara,
- iAux_FS++, /* DEBUG ARGUMENT */
- zcAux, zcError);
- if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
- 1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
- fprintf(stderr, "%s:%s:%d: problem in alignment, even Viterbi did not work\n"
- "\thh-error-code=%d (mac-ram=%d)\n",
- __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB);
- Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
- }
- else {
- fprintf(stderr, "%s:%s:%d: 2nd attempt worked", __FUNCTION__, __FILE__, __LINE__);
- }
- rHhalignPara.iMacRamMB = iOldMacRam;
- } /* 1st invocation failed */
-
- } /* 2nd profile was shorter than 1st */
-
- }
- /* free left/right node lists,
- * after alignment left/right profiles no longer needed
- */
- if (NULL != ppcCopy1){
- int i;
- for (i = 0; i < piLeafCount[iL]; i++){
- CKFREE(ppcCopy1[i]);
- }
- CKFREE(ppcCopy1);
- CKFREE(pcReprsnt1);
- CKFREE(pcConsens1);
- }
- if (NULL != ppcCopy2){
- int i;
- for (i = 0; i < piLeafCount[iR]; i++){
- CKFREE(ppcCopy2[i]);
- }
- CKFREE(ppcCopy2);
- CKFREE(pcReprsnt2);
- CKFREE(pcConsens2);
- }
- ppiLeafList[iL] = CKFREE(ppiLeafList[iL]);
- ppiLeafList[iR] = CKFREE(ppiLeafList[iR]);
- piLeafCount[iL] = piLeafCount[iR] = 0;
-
- } /* was a merge node */
-
- if (rLog.iLogLevelEnabled <= LOG_DEBUG){
- int i, j;
- FILE *fp = LogGetFP(&rLog, LOG_DEBUG);
- for (i = 0; i < iNodeCount; i++){
- if (0 == piLeafCount[i]){
- continue;
- }
- fprintf(fp, "node %3d, #leaves=%d:\t", i, piLeafCount[i]);
- for (j = 0; ppiLeafList && (j < piLeafCount[i]); j++){
- fprintf(fp, "%d,", ppiLeafList[i][j]);
- }
- fprintf(fp, "\n");
- }
- }
-
-
- } /* 0 <= iN < iNodeCount */
- ProgressDone(prProgress);
-
-
- /* check length and set length info
- */
- iAlnLen = strlen(prMSeq->seq[0]);
- for (i=0; i<prMSeq->nseqs; i++) {
-#if 0
- Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: name %s; len %d; %s",
- i, prMSeq->sqinfo[i].name, strlen(prMSeq->seq[i]), prMSeq->seq[i]);
-#endif
-
-#ifndef NDEBUG
- assert(iAlnLen == strlen(prMSeq->seq[i]));
-#endif
- prMSeq->sqinfo[i].len = iAlnLen;
- }
- prMSeq->aligned = TRUE;
-
-
- if (rLog.iLogLevelEnabled <= LOG_DEBUG){
- if (0 != prHMM->L){
- int i;
- Log(&rLog, LOG_DEBUG, "Alignment scores with HMM:");
- for (i = 0; /*pdScores[i] > 0.0*/i < prMSeq->nseqs; i++){
- Log(&rLog, LOG_DEBUG, "%2d:\t%f\n", i, pdScores[i]);
- }
- }
- }
-
-
- /** translate back ambiguity residues
- * hhalign translates ambiguity codes (B,Z) into unknown residues (X).
- * as we still have the original input we can substitute them back
- */
- TranslateUnknown2Ambiguity(prMSeq);
- ReAttachLeadingGaps(prMSeq, iProfProfSeparator);
-
- if (NULL == prHMMList){
- CKFREE(prHMM);
- }
- CKFREE(ppcProfile2);
- CKFREE(ppcProfile1);
- CKFREE(ppiLeafList[piOrderLR[DIFF_NODE*(iNodeCount-1)+PRNT_NODE]]);
- CKFREE(ppiLeafList);
- CKFREE(piLeafCount);
- CKFREE(pdScores);
- FreeProgress(&prProgress);
- CKFREE(pdWeightsL);
- CKFREE(pdWeightsR);
-
-#if TIMING
- StopwatchStop(stopwatch);
- StopwatchDisplay(stdout, "Total time for HHalignWrapper():" , stopwatch);
- StopwatchFree(stopwatch);
-#endif
-
- return dScore; /* FIXME alternative: return averaged pdScores */
-
-}
-/*** end: HHalignWrapper() ***/