1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhalign_wrapper.c 256 2011-06-23 13:57:28Z fabian $
33 #include "hhalign/general.h"
34 #include "hhalign/hhfunc.h"
35 #include "hhalign/hhalign.h"
37 /* up to this level (from leaf) will background HMM info be applied */
38 #define APPLY_BG_HMM_UP_TO_TREE_DEPTH 10
45 * @brief get rid of unknown residues
47 * @note HHalignWrapper can be entered in 2 different ways: (i) all
48 * sequences are un-aligned (ii) there are 2 (aligned) profiles. in
49 * the un-aligned case (i) the sequences come straight from Squid,
50 * that is, they have been sanitised, all non-alphabetic residues
51 * have been rendered as X's. In profile mode (ii) one profile may
52 * have been produced internally. In that case residues may have
53 * been translated back into their 'native' form, that is, they may
54 * contain un-sanitised residues. These will cause trouble during
59 SanitiseUnknown(mseq_t *mseq)
62 int iS; /* iterator for sequence */
63 int iR; /* iterator for residue */
64 int iLen; /* length of sequence */
68 for (iS = 0; iS < mseq->nseqs; iS++){
70 for (pcRes = mseq->seq[iS]; '\0' != *pcRes; pcRes++){
76 if (mseq->seqtype==SEQTYPE_PROTEIN) {
77 if (NULL == strchr(AMINO_ALPHABET, toupper(*pcRes))) {
78 *pcRes = AMINOACID_ANY;
80 } else if (mseq->seqtype==SEQTYPE_DNA) {
81 if (NULL == strchr(DNA_ALPHABET, toupper(*pcRes))) {
82 *pcRes = NUCLEOTIDE_ANY;
84 } else if (mseq->seqtype==SEQTYPE_RNA) {
85 if (NULL == strchr(RNA_ALPHABET, toupper(*pcRes))) {
86 *pcRes = NUCLEOTIDE_ANY;
92 } /* 0 <= iS < mseq->nseqs */
96 } /*** end: SanitiseUnknown() ***/
99 * @brief translate unknown residues back to ambiguity codes;
100 * hhalign translates ambiguity codes (B,Z) into unknown residue (X).
101 * we still have the original (un-aligned) residue information,
102 * by iterating along the original and aligned sequences we can
103 * reconstruct where codes have been changed and restore them
104 * to their original value
106 * @param[in,out] mseq
107 * sequence/profile data, mseq->seq [in,out] is changed to conform
108 * with mseq->orig_seq [in]
112 TranslateUnknown2Ambiguity(mseq_t *mseq)
115 int iS; /* iterator for sequence */
116 int iR, iRo; /* iterator for residue (original) */
117 int iChange, iCase, iAmbi; /* counts how many replacements */
118 static int siOffset = 'a' - 'A';
120 for (iS = 0; iS < mseq->nseqs; iS++){
123 iChange = iCase = iAmbi = 0;
125 while(('\0' != mseq->seq[iS][iR]) &&
126 ('\0' != mseq->orig_seq[iS][iRo])) {
128 /* skip gaps in aligned sequences */
129 while(isgap(mseq->seq[iS][iR])) {
131 } /* was gap in unaligned seq
132 * this should probably not happen */
133 while(isgap(mseq->orig_seq[iS][iRo])) {
135 } /* was gap in aligned seq */
137 /* check if we reached the end of the sequence after
138 * skipping the gaps */
139 if ( ('\0' == mseq->seq[iS][iR]) ||
140 ('\0' == mseq->orig_seq[iS][iRo]) ){
145 if (mseq->seq[iS][iR] != mseq->orig_seq[iS][iRo]){
146 /* FIXME: count replacements, discard case changes */
148 if ( (mseq->seq[iS][iR] == mseq->orig_seq[iS][iRo]+siOffset) ||
149 (mseq->seq[iS][iR] == mseq->orig_seq[iS][iRo]-siOffset) ){
156 Log(&rLog, LOG_FORCED_DEBUG, "seq=%d, pos=(%d:%d), (%c:%c)",
158 mseq->seq[iS][iR], mseq->orig_seq[iS][iRo]);
160 mseq->seq[iS][iR] = mseq->orig_seq[iS][iRo];
168 Log(&rLog, LOG_DEBUG,
169 "in seq %d re-translated %d residue codes (%d true, %d case)",
170 iS, iChange, iAmbi, iCase);
172 /* assert that both sequences (un/aligned) have terminated */
173 /* skip gaps in aligned sequences */
174 while(isgap(mseq->seq[iS][iR])) {
176 } /* was gap in unaligned seq
177 * this should probably not happen */
178 while(isgap(mseq->orig_seq[iS][iRo])) {
180 } /* was gap in aligned seq */
183 if ( ('\0' != mseq->seq[iS][iR]) ||
184 ('\0' != mseq->orig_seq[iS][iRo]) ){
186 Log(&rLog, LOG_FATAL, "inconsistency in un/aligned sequence %d\n>%s\n>%s\n",
187 iS, mseq->seq[iS], mseq->orig_seq[iS]);
190 } /* 0 <= iS < mseq->nseqs */
192 } /*** end: TranslateUnknown2Ambiguity() ***/
196 * @brief re-attach leading and trailing gaps to alignment
198 * @param[in,out] prMSeq,
199 * alignment structure (at this stage there should be no un-aligned sequences)
200 * @param[in] iProfProfSeparator
201 * gives sizes of input profiles, -1 if no input-profiles but un-aligned sequences
203 * @note leading and tailing profile columns
204 * that only contain gaps have no effect on the alignment
205 * and are removed during the alignment. if they are
206 * encountered a warning message is printed to screen.
207 * some users like to preserve these gap columns
211 ReAttachLeadingGaps(mseq_t *prMSeq, int iProfProfSeparator)
215 int iSize1 = 0; /* #seqs in 1st profile */
216 int iSize2 = 0; /* #seqs in 2nd profile */
217 int iPPS = iProfProfSeparator;
218 int iLeadO1 = 0; /* leading gaps in 1st seq of 1st profile */
219 int iLeadO2 = 0; /* leading gaps in 1st seq of 2nd profile */
220 int iLeadA1 = 0; /* leading gaps in 1st seq of final alignment */
221 int iLeadA2 = 0; /* leading gaps in PPS seq of final alignment */
222 int iTrailO1 = 0; /* trailing gaps in 1st seq of 1st profile */
223 int iTrailO2 = 0; /* trailing gaps in 1st seq of 2nd profile */
224 int iTrailA1 = 0; /* trailing gaps in 1st seq of final alignment */
225 int iTrailA2 = 0; /* trailing gaps in PPS seq of final alignment */
226 int iLen = 0; /* length of final alignment */
227 int iLen1 = 0; /* length of 1st profile */
228 int iLen2 = 0; /* length of 2nd profile */
229 int iCutHead = 0; /* make up truncation at head */
230 int iCutTail = 0; /* make up truncation at tail */
233 if (-1 == iProfProfSeparator){
237 assert(prMSeq->aligned);
238 assert(prMSeq->nseqs > iProfProfSeparator);
240 iSize1 = iProfProfSeparator;
241 iSize2 = prMSeq->nseqs - iProfProfSeparator;
242 iLen = strlen(prMSeq->seq[0]);
243 iLen1 = strlen(prMSeq->orig_seq[0]);
244 iLen2 = strlen(prMSeq->orig_seq[iPPS]);
246 /* count leading/trailing gaps in 1st sequence of 1st/2nd profile and final alignmant */
247 for (iLeadO1 = 0, pcIter = prMSeq->orig_seq[0]; isgap(*pcIter); pcIter++, iLeadO1++);
248 for (iLeadO2 = 0, pcIter = prMSeq->orig_seq[iPPS]; isgap(*pcIter); pcIter++, iLeadO2++);
249 for (iLeadA1 = 0, pcIter = prMSeq->seq[0]; isgap(*pcIter); pcIter++, iLeadA1++);
250 for (iLeadA2 = 0, pcIter = prMSeq->seq[iPPS]; isgap(*pcIter); pcIter++, iLeadA2++);
251 for (iTrailO1 = 0, pcIter = &prMSeq->orig_seq[0][iLen1-1]; isgap(*pcIter); pcIter--, iTrailO1++);
252 for (iTrailO2 = 0, pcIter = &prMSeq->orig_seq[iPPS][iLen2-1]; isgap(*pcIter); pcIter--, iTrailO2++);
253 for (iTrailA1 = 0, pcIter = &prMSeq->seq[0][iLen-1]; isgap(*pcIter); pcIter--, iTrailA1++);
254 for (iTrailA2 = 0, pcIter = &prMSeq->seq[iPPS][iLen-1]; isgap(*pcIter); pcIter--, iTrailA2++);
257 /* turn leading/trailing gaps into truncation */
258 iLeadO1 = iLeadO1 > iLeadA1 ? iLeadO1-iLeadA1 : 0;
259 iLeadO2 = iLeadO2 > iLeadA2 ? iLeadO2-iLeadA2 : 0;
260 iTrailO1 = iTrailO1 > iTrailA1 ? iTrailO1-iTrailA1 : 0;
261 iTrailO2 = iTrailO2 > iTrailA2 ? iTrailO2-iTrailA2 : 0;
262 iCutHead = iLeadO1 > iLeadO2 ? iLeadO1 : iLeadO2;
263 iCutTail = iTrailO1 > iTrailO2 ? iTrailO1 : iTrailO2;
265 /* re-allocate and shift memory */
266 if ( (iCutHead > 0) || (iCutTail > 0) ){ /* skip if no re-attachment, FS, r244 -> r245 */
267 for (i = 0; i < prMSeq->nseqs; i++){
269 CKREALLOC(prMSeq->seq[i], iLen+iCutHead+iCutTail+2);
270 if (iCutHead > 0){ /* skip if no re-attachment, FS, r244 -> r245 */
271 memmove(prMSeq->seq[i]+iCutHead, prMSeq->seq[i], iLen);
273 for (j = 0; j < iCutHead; j++){
274 prMSeq->seq[i][j] = '-';
276 for (j = iLen+iCutHead; j < iLen+iCutHead+iCutTail; j++){
277 prMSeq->seq[i][j] = '-';
279 prMSeq->seq[i][j] = '\0';
281 } /* (iCutHead > 0) || (iCutTail > 0) */
283 } /*** end: ReAttachLeadingGaps() ***/
286 * @brief reallocate enough memory for alignment and
287 * attach sequence pointers to profiles
289 * @param[in,out] mseq
290 * sequence/profile data, increase memory for sequences in profiles
291 * @param[out] ppcProfile1
292 * pointers to sequencese in 1st profile
293 * @param[out] ppcProfile2
294 * pointers to sequencese in 2nd profile
295 * @param[out] pdWeightsL
296 * weights (normalised to 1.0) for sequences in left profile
297 * @param[out] pdWeightsR
298 * weights (normalised to 1.0) for sequences in right profile
299 * @param[in] pdSeqWeights
300 * weights for _all_ sequences in alignment
301 * @param[in] iLeafCountL
302 * number of sequences in 1st profile
303 * @param[in] piLeafListL
304 * array of integer IDs of sequences in 1st profile
305 * @param[in] iLeafCountR
306 * number of sequences in 2nd profile
307 * @param[in] piLeafListR
308 * array of integer IDs of sequences in 2nd profile
312 PrepareAlignment(mseq_t *mseq, char **ppcProfile1, char **ppcProfile2,
313 double *pdWeightsL, double *pdWeightsR, double *pdSeqWeights,
314 int iLeafCountL, int *piLeafListL,
315 int iLeafCountR, int *piLeafListR)
318 int iLenL = 0; /* length of 1st profile */
319 int iLenR = 0; /* length of 2nd profile */
320 int iMaxLen = 0; /* maximum possible length of alignment */
322 double dWeight = 0.00;
323 double dWeightInv = 0.00;
326 assert(NULL!=ppcProfile1);
327 assert(NULL!=ppcProfile2);
328 assert(NULL!=piLeafListL);
329 assert(NULL!=piLeafListR);
331 /* get length of profiles,
332 * in a profile all sequences should have same length so only look at 1st
334 iLenL = strlen(mseq->seq[piLeafListL[0]]);
335 iLenR = strlen(mseq->seq[piLeafListR[0]]);
336 iMaxLen = iLenL + iLenR + 1;
338 /* reallocate enough memory for sequences in alignment (for adding
341 for (i = 0; i < iLeafCountL; i++){
342 mseq->seq[piLeafListL[i]] =
343 CKREALLOC(mseq->seq[piLeafListL[i]], iMaxLen);
345 for (i = 0; i < iLeafCountR; i++){
346 mseq->seq[piLeafListR[i]] =
347 CKREALLOC(mseq->seq[piLeafListR[i]], iMaxLen);
350 /* attach sequences to profiles
352 for (i = 0; i < iLeafCountL; i++){
353 ppcProfile1[i] = mseq->seq[piLeafListL[i]];
355 ppcProfile1[i] = NULL;
357 for (i = 0; i < iLeafCountR; i++){
358 ppcProfile2[i] = mseq->seq[piLeafListR[i]];
360 ppcProfile2[i] = NULL;
363 /* remove terminal 'X' for single sequences:
364 * it is a quirk of hhalign() to delete 2 individual sequences
365 * if 2 terminal X's meet during alignment,
366 * just replace (one of) them.
367 * this can be undone at the end.
368 * profiles -consisting of more than 1 sequence-
369 * appear to be all-right.
370 * there seems to be no problem with B's and Z's
372 if ( (1 == iLeafCountL) && (1 == iLeafCountR) ){
374 if ( ('X' == ppcProfile1[0][0]) && ('X' == ppcProfile2[0][0]) ){
376 ppcProfile1[0][0] = NOTX; /* FIXME: arbitrary assignment */
377 ppcProfile2[0][0] = NOTX; /* FIXME: arbitrary assignment */
379 if ( ('X' == ppcProfile1[0][iLenL-1]) && ('X' == ppcProfile2[0][iLenR-1]) ){
380 ppcProfile1[0][iLenL-1] = NOTX; /* FIXME: arbitrary assignment */
381 ppcProfile2[0][iLenR-1] = NOTX; /* FIXME: arbitrary assignment */
385 /* obtain sequence weights
387 if (NULL != pdSeqWeights){
390 for (i = 0; i < iLeafCountL; i++){
391 register double dAux = pdSeqWeights[piLeafListL[i]];
394 Log(&rLog, LOG_DEBUG, "seq-weight %d = %f", piLeafListL[i], dAux);
397 pdWeightsL[i] = dAux;
399 } /* 0 <= i < iLeafCountL */
400 dWeightInv = 1.00 / dWeight;
401 for (i = 0; i < iLeafCountL; i++){
402 pdWeightsL[i] *= dWeightInv;
406 for (i = 0; i < iLeafCountR; i++){
407 register double dAux = pdSeqWeights[piLeafListR[i]];
410 Log(&rLog, LOG_DEBUG, "seq-weight %d = %f", piLeafListR[i], dAux);
413 pdWeightsR[i] = dAux;
415 } /* 0 <= i < iLeafCountL */
416 dWeightInv = 1.00 / dWeight;
417 for (i = 0; i < iLeafCountR; i++){
418 pdWeightsR[i] *= dWeightInv;
420 } /* (NULL != pdSeqWeights) */
422 pdWeightsL[0] = pdWeightsR[0] = -1.00;
426 for (i = 0; i < iLeafCountL; i++){
427 Log(&rLog, LOG_FORCED_DEBUG, "ppcProfile1[%d/%d] pointing to mseq %d = %s",
428 i, iLeafCountR, piLeafListL[i], ppcProfile1[i]);
430 for (i = 0; i < iLeafCountR; i++){
431 Log(&rLog, LOG_FORCED_DEBUG, "ppcProfile2[%d/%d] pointing to mseq %d = %s",
432 i, iLeafCountR, piLeafListR[i], ppcProfile2[i]);
438 } /*** end: PrepareAlignment() ***/
442 * @brief wrapper for hhalign. This is a frontend function to
443 * the ported hhalign code.
445 * @param[in,out] prMSeq
446 * holds the unaligned sequences [in] and the final alignment [out]
447 * @param[in] piOrderLR
448 * holds order in which sequences/profiles are to be aligned,
449 * even elements specify left nodes, odd elements right nodes,
450 * if even and odd are same then it is a leaf
451 * @param[in] pdSeqWeights
452 * Weight per sequence. No weights used if NULL
453 * @param[in] iNodeCount
454 * number of nodes in tree, piOrderLR has 2*iNodeCount elements
455 * @param[in] prHMMList
456 * List of background HMMs (transition/emission probabilities)
457 * @param[in] iHMMCount
458 * Number of input background HMMs
459 * @param[in] iProfProfSeparator
460 * Gives the number of sequences in the first profile, if in
461 * profile/profile alignment mode (iNodeCount==3). That assumes mseqs
462 * holds the sequences of profile 1 and profile 2.
463 * @param[in] rHhalignPara
464 * various parameters read from commandline
466 * @return score of the alignment FIXME what is this?
468 * @note complex function. could use some simplification, more and
469 * documentation and a struct'uring of piOrderLR
471 * @note HHalignWrapper can be entered in 2 different ways:
472 * (i) all sequences are un-aligned (ii) there are 2 (aligned) profiles.
473 * in the un-aligned case (i) the sequences come straight from Squid,
474 * that is, they have been sanitised, all non-alphabetic residues
475 * have been rendered as X's. In profile mode (ii) one profile may
476 * have been produced internally. In that case residues may have
477 * been translated back into their 'native' form, that is, they
478 * may contain un-sanitised residues. These will cause trouble
481 * @note: introduced argument hhalign_para rHhalignPara; FS, r240 -> r241
482 * @note: if hhalign() fails then try with Viterbi by setting MAC-RAM=0; FS, r241 -> r243
486 HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
487 double *pdSeqWeights, int iNodeCount,
488 hmm_light *prHMMList, int iHMMCount,
489 int iProfProfSeparator, hhalign_para rHhalignPara)
491 int iN; /* node iterator */
492 int *piLeafCount = NULL; /* number of leaves beneath a certain node */
493 int **ppiLeafList = NULL; /* list of leaves beneath a certain node */
494 char **ppcProfile1 = NULL; /* pointer to sequences in profile */
495 char **ppcProfile2 = NULL; /* pointer to sequences in profile */
496 char *pcReprsnt1 = NULL; /* representative of HMM aligned to left */
497 char *pcReprsnt2 = NULL; /* representative of HMM aligned to right */
498 char **ppcReprsnt1 = &pcReprsnt1; /* representative of HMM aligned to L */
499 char **ppcReprsnt2 = &pcReprsnt2; /* representative of HMM aligned to R */
500 char *pcConsens1 = NULL; /* copy of left sequence */
501 char *pcConsens2 = NULL; /* copy of right sequence */
502 char **ppcCopy1 = /*&pcCopy1*/NULL; /* copy of left sequences */
503 char **ppcCopy2 = /*&pcCopy2*/NULL; /* copy of right sequences */
504 double *pdScores = NULL; /* alignment scores (seq/HMM) */
505 double dScore = 0.0; /* alignment score (seq/HMM) */
507 char zcAux[10000] = {0};
508 char zcError[10000] = {0};
510 progress_t *prProgress;
511 int iAlnLen; /* alignment length */
512 double *pdWeightsL = NULL; /* sequence weights of left profile */
513 double *pdWeightsR = NULL; /* sequence weights of right profile */
514 int iMergeNodeCounter = 0;
515 hmm_light *prHMM = NULL;
516 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
518 char zcStopwatchMsg[1024];
519 Stopwatch_t *stopwatch = StopwatchCreate();
520 StopwatchZero(stopwatch);
521 StopwatchStart(stopwatch);
525 if (NULL != prHMMList) {
527 Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
529 prHMM = &(prHMMList[0]);
531 /* FIXME: prHMM not allowed to be NULL and therefore pseudo allocated here */
532 prHMM = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
535 assert(NULL!=prMSeq);
537 if (NULL==piOrderLR) {
538 assert(3==iNodeCount);
540 SanitiseUnknown(prMSeq);
542 /* hhalign was not made for DNA/RNA. So warn if sequences are not
545 if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
546 Log(&rLog, LOG_WARN, "Sequence type is %s. HHalign has only been tested on protein.",
547 SeqTypeToStr(prMSeq->seqtype));
550 /* hhalign produces funny results if sequences contain gaps, so
551 * dealign. Only way to use alignment info is to use it as a
554 if (TRUE == prMSeq->aligned) {
555 Log(&rLog, LOG_DEBUG, "Dealigning aligned sequences (inside %s)", __FUNCTION__);
560 Log(&rLog, LOG_FORCED_DEBUG, "iNodeCount = %d", iNodeCount);
564 /* allocate top-level memory for leaf tracking arrays and profiles,
565 * and sequence weights*/
566 piLeafCount = CKCALLOC(iNodeCount, sizeof(int));
567 ppiLeafList = CKCALLOC(iNodeCount, sizeof(int *));
568 ppcProfile1 = CKCALLOC(prMSeq->nseqs*2-1, sizeof(char *));
569 ppcProfile2 = CKCALLOC(prMSeq->nseqs*2-1, sizeof(char *));
570 pdScores = CKCALLOC(iNodeCount, sizeof(double));
571 pdWeightsL = CKCALLOC(iNodeCount, sizeof(double));
572 pdWeightsR = CKCALLOC(iNodeCount, sizeof(double));
574 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
575 "Progressive alignment progress", bPrintCR);
578 /* Profile-profile alignment? Then setup piLeafCount and
579 * piLeafList here. FIXME this is just an awful haaaack
581 if (iNodeCount==3 && NULL==piOrderLR) {
582 int iSizeProf1 = iProfProfSeparator;
583 int iSizeProf2 = prMSeq->nseqs - iProfProfSeparator;
585 piLeafCount[0] = iSizeProf1;
586 ppiLeafList[0] = (int *)CKMALLOC(iSizeProf1 * sizeof(int));
587 for (i=0;i<iSizeProf1;i++) {
588 ppiLeafList[0][i] = i;
590 piLeafCount[1] = iSizeProf2;
591 ppiLeafList[1] = (int *)CKMALLOC(iSizeProf2 * sizeof(int));
592 for (i=0;i<iSizeProf2;i++) {
593 ppiLeafList[1][i] = i+iSizeProf1;
596 /* awful hack inside an awful hack: we had to setup piLeafCount
597 * and piLeafList outside the node iteration. this which is
598 * normally done at leaf level inside the node iteration. to
599 * avoid overwriting the already setup vars set...
603 piOrderLR = (int *)CKCALLOC(DIFF_NODE, sizeof(int));
604 piOrderLR[LEFT_NODE] = 0;
605 piOrderLR[RGHT_NODE] = 1;
606 piOrderLR[PRNT_NODE] = 2;
611 iMergeNodeCounter = 0;
612 for (iN = 0; iN < iNodeCount; iN++){
614 register int riAux = DIFF_NODE * iN;
616 /*LOG_DEBUG("node %d ", iN);*/
618 if (piOrderLR[riAux+LEFT_NODE] == piOrderLR[riAux+RGHT_NODE]){
620 register int riLeaf = piOrderLR[riAux+LEFT_NODE];
622 if (NULL == pdSeqWeights) {
623 Log(&rLog, LOG_FORCED_DEBUG, "node %d is a leaf with entry %d (seq %s)",
624 iN, riLeaf, prMSeq->sqinfo[riLeaf].name);
626 Log(&rLog, LOG_FORCED_DEBUG, "node %d is a leaf with entry %d (seq %s) and weight %f",
627 iN, riLeaf, prMSeq->sqinfo[riLeaf].name, pdSeqWeights[riLeaf]);
630 /* left/right entry same, this is a leaf */
631 piLeafCount[piOrderLR[riAux+PRNT_NODE]] = 1; /* number of leaves is '1' */
632 ppiLeafList[piOrderLR[riAux+PRNT_NODE]] = (int *)CKMALLOC(1 * sizeof(int));
633 ppiLeafList[piOrderLR[riAux+PRNT_NODE]][0] = riLeaf;
637 int iL, iR, iP; /* ID of left/right nodes, parent */
640 Log(&rLog, LOG_DEBUG,
641 "merge profiles at node %d", iN, piOrderLR[riAux]);
643 /* iNodeCount - prMSeq->nseqs = total # of merge-nodes
644 * unless in profile/profile alignment mode
646 if (1 == iNodeCount) {
647 ProgressLog(prProgress, ++iMergeNodeCounter,
650 ProgressLog(prProgress, ++iMergeNodeCounter,
651 iNodeCount - prMSeq->nseqs, FALSE);
654 /* left/right entry are not same, this is a merge node */
655 iL = piOrderLR[riAux+LEFT_NODE];
656 iR = piOrderLR[riAux+RGHT_NODE];
657 iP = piOrderLR[riAux+PRNT_NODE];
658 piLeafCount[iP] = piLeafCount[iL] + piLeafCount[iR];
659 ppiLeafList[iP] = (int *)CKMALLOC(piLeafCount[iP] * sizeof(int));
661 for (i = j = 0; i < piLeafCount[iL]; i++, j++){
662 ppiLeafList[iP][j] = ppiLeafList[iL][i];
664 for (i = 0; i < piLeafCount[iR]; i++, j++){
665 ppiLeafList[iP][j] = ppiLeafList[iR][i];
668 /* prepare simulation arena:
669 * - make sure enough memory in sequences
670 * - attach sequence pointers to profiles
672 /* idea: switch template and query according to nseq? */
674 PrepareAlignment(prMSeq, ppcProfile1, ppcProfile2,
675 pdWeightsL, pdWeightsR, pdSeqWeights,
676 piLeafCount[iL], ppiLeafList[iL],
677 piLeafCount[iR], ppiLeafList[iR]);
678 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
680 FILE *fp = LogGetFP(&rLog, LOG_DEBUG);
681 Log(&rLog, LOG_DEBUG, "merging profiles %d & %d", iL, iR);
682 for (i = 0; i < piLeafCount[iL]; i++){
683 fprintf(fp, "L/#=%3d (ID=%3d, w=%f): %s\n",
684 i, ppiLeafList[iL][i], pdWeightsL[i], ppcProfile1[i]);
686 for (i = 0; i < piLeafCount[iR]; i++){
687 fprintf(fp, "R/#=%3d (ID=%3d, w=%f): %s\n",
688 i, ppiLeafList[iR][i], pdWeightsR[i], ppcProfile2[i]);
694 /* align individual sequences to HMM;
695 * - use representative sequence to get gapping
696 * - create copies of both, individual/representative sequences
697 * as we don't want to introduce gaps into original
699 * FIXME: representative sequence is crutch, should use
700 * full HMM but that does not seem to work at all
701 * -- try harder! Fail better!
703 if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
705 pcReprsnt1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
706 for (i = 0; i < prHMM->L; i++){
707 pcReprsnt1[i] = prHMM->seq[prHMM->ncons][i+1];
709 ppcCopy1 = CKCALLOC(piLeafCount[iL], sizeof(char *));
710 for (j = 0; j < piLeafCount[iL]; j++){
711 ppcCopy1[j] = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
712 for (i = 0; i < (int) strlen(ppcProfile1[0]); i++){
713 ppcCopy1[j][i] = ppcProfile1[j][i];
718 /* the size of the elements in the forward/backward matrices
719 depends very much on the lengths of the profiles _and_
720 in which position (1st/2nd) the longer/shorter profile/HMM is.
721 the matrix elements can easily exceed the size of a (long?) double
722 if the longer profile/HMM is associated with the query (q) and the
723 shorter with the target (t).
724 FIXME: however, pseudo-count adding may also depend on position,
725 this is only really tested for the HMM being in the 1st position (q)
726 MUST TEST THIS MORE THOROUGHLY
728 this switch appears to be most easily (although unelegantly)
729 effected here. Don't want to do it (upstairs) in PrepareAlignment()
730 because it might jumble up the nodes. Don't want to do it in hhalign()
731 either because ppcProfile1/2 and q/t may be used independently.
734 int iLenA = strlen(ppcCopy1[0]);
735 int iLenH = prHMM->L;
739 iHHret = hhalign(ppcReprsnt1, 0/* only one representative seq*/, NULL,
740 ppcCopy1, piLeafCount[iL], pdWeightsL,
742 NULL, NULL, NULL, NULL,
744 iAux_FS++, /* DEBUG ARGUMENT */
748 iHHret = hhalign(ppcCopy1, piLeafCount[iL], pdWeightsL,
749 ppcReprsnt1, 0/* only one representative seq*/, NULL,
751 NULL, NULL, NULL, NULL,
753 iAux_FS++, /* DEBUG ARGUMENT */
756 if (0 != iHHret){ /* FS, r255 -> */
757 fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
758 "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\tCARRY ON REGARDLESS\n",
759 __FUNCTION__, __FILE__, __LINE__, iHHret,
760 piLeafCount[iL], (int)strlen(ppcCopy1[0]), prMSeq->sqinfo[ppiLeafList[iL][0]].name, (int)strlen(ppcReprsnt1[0]));
763 pdScores[ppiLeafList[iL][0]] = dScore;
765 printf("score: %f\nL: %s\nH: %s\n",
766 dScore, ppcCopy1[0], ppcReprsnt1[0]);
768 /* assemble 'consensus';
769 * this is not a real consensus, it is more a gap indicator,
770 * for each position it consists of residues/gaps in the 1st sequences,
771 * or a residue (if any) of the other sequences.
772 * it only contains a gap if all sequences of the profile
773 * have a gap at this position
775 pcConsens1 = CKCALLOC(prHMM->L+strlen(ppcProfile1[0])+1, sizeof(char));
776 for (i = 0; i < prHMM->L; i++){
777 for (j = 0, pcConsens1[i] = '-'; (j < piLeafCount[iL]) && ('-' == pcConsens1[i]); j++){
778 pcConsens1[i] = ppcCopy1[j][i];
782 for (j = 0; (j < piLeafCount[iL]); j++){
783 printf("L%d:%s\n", j, ppcCopy1[j]);
785 printf("LC:%s\n", pcConsens1);
787 } /* ( (1 == piLeafCount[iL]) && (0 != prHMM->L) ) */
789 if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){
792 pcReprsnt2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
793 for (i = 0; i < prHMM->L; i++){
794 pcReprsnt2[i] = prHMM->seq[prHMM->ncons][i+1];
796 ppcCopy2 = CKCALLOC(piLeafCount[iR], sizeof(char *));
797 for (j = 0; j < piLeafCount[iR]; j++){
798 ppcCopy2[j] = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
799 for (i = 0; i < (int) strlen(ppcProfile2[0]); i++){
800 ppcCopy2[j][i] = ppcProfile2[j][i];
805 /* the size of the elements in the forward/backward matrices
806 depends very much on the lengths of the profiles _and_
807 in which position (1st/2nd) the longer/shorter profile/HMM is.
808 the matrix elements can easily exceed the size of a (long?) double
809 if the longer profile/HMM is associated with the query (q) and the
810 shorter with the target (t).
811 FIXME: however, pseudo-count adding may also depend on position,
812 this is only really tested for the HMM being in the 1st position (q)
813 MUST TEST THIS MORE THOROUGHLY
815 this switch appears to be most easily (although unelegantly)
816 effected here. Don't want to do it (upstairs) in PrepareAlignment()
817 because it might jumble up the nodes. Don't want to do it in hhalign()
818 either because ppcProfile1/2 and q/t may be used independently.
821 int iLenA = strlen(ppcCopy2[0]);
822 int iLenH = prHMM->L;
826 iHHret = hhalign(ppcReprsnt2, 0/* only one representative seq */, NULL,
827 ppcCopy2, piLeafCount[iR], pdWeightsR,
829 NULL, NULL, NULL, NULL,
831 iAux_FS++, /* DEBUG ARGUMENT */
835 iHHret = hhalign(ppcCopy2, piLeafCount[iR], pdWeightsR,
836 ppcReprsnt2, 0/* only one representative seq */, NULL,
838 NULL, NULL, NULL, NULL,
840 iAux_FS++, /* DEBUG ARGUMENT */
843 if (0 != iHHret){ /* FS, r255 -> */
844 fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
845 "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\tCARRY ON REGARDLESS\n",
846 __FUNCTION__, __FILE__, __LINE__, iHHret,
847 piLeafCount[iR], (int)strlen(ppcCopy2[0]), prMSeq->sqinfo[ppiLeafList[iR][0]].name, (int)strlen(ppcReprsnt2[0]));
850 pdScores[ppiLeafList[iR][0]] = dScore;
852 printf("H: %s\nR: %s\nscore: %f\n",
853 ppcReprsnt2[0], ppcCopy2[0], dScore);
855 /* assemble 'consensus';
856 * this is not a real consensus, it is more a gap indicator,
857 * for each position it consists of residues/gaps in the 1st sequences,
858 * or a residue (if any) of the other sequences.
859 * it only contains a gap if all sequences of the profile
860 * have a gap at this position
862 pcConsens2 = CKCALLOC(prHMM->L+strlen(ppcProfile2[0])+1, sizeof(char));
863 for (i = 0; i < prHMM->L; i++){
864 for (j = 0, pcConsens2[i] = '-'; (j < piLeafCount[iR]) && ('-' == pcConsens2[i]); j++){
865 pcConsens2[i] = ppcCopy2[j][i];
869 for (j = 0; (j < piLeafCount[iR]); j++){
870 printf("R%d:%s\n", j, ppcCopy2[j]);
872 printf("RC:%s\n", pcConsens2);
874 } /* ( (1 == piLeafCount[iR]) && (0 != prHMM->L) ) */
878 /* do alignment here (before free)
881 /* the size of the elements in the forward/backward matrices
882 depends very much on the lengths of the profiles _and_
883 in which position (1st/2nd) the longer/shorter profile is.
884 the matrix elements can easily exceed the size of a (long?) double
885 if the longer profile is associated with the query (q) and the
886 shorter with the target (t).
887 this switch appears to be most easily (although unelegantly)
888 effected here. Don't want to do it (upstairs) in PrepareAlignment()
889 because it might jumble up the nodes. Don't want to do it in hhalign()
890 either because ppcProfile1/2 and q/t may be used independently.
893 int iLen1 = strlen(ppcProfile1[0]);
894 int iLen2 = strlen(ppcProfile2[0]);
895 /* potential problem with empty profiles, FS, r249 -> r250 */
896 if ( (0 == iLen1) || (0 == iLen2) ){
897 Log(&rLog, LOG_FATAL, "strlen(prof1)=%d, strlen(prof2)=%d -- nothing to align\n",
904 int iOldMacRam = rHhalignPara.iMacRamMB;
905 iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
906 ppcProfile2, piLeafCount[iR], pdWeightsR,
908 pcConsens1, pcReprsnt1,
909 pcConsens2, pcReprsnt2,
911 iAux_FS++, /* DEBUG ARGUMENT */
914 if (RETURN_OK != iHHret){ /* FS, r241 -> */
916 "%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
917 "\thh-error-code=%d (mac-ram=%d)\n",
918 __FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
919 prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
920 iHHret, rHhalignPara.iMacRamMB);
921 /* at this stage hhalign() has failed,
922 the only thing we can do (easily) is to re-run it in Viterbi mode,
923 for this set MAC-RAM=0, set it back to its original value after 2nd try.
925 if (RETURN_FROM_MAC == iHHret){
926 /* Note: the default way to run hhalign() is to initially select MAC
927 by giving it all the memory it needs. MAC may fail due to overflow (repeats?).
928 alternatively, the problem may be (genuinely) too big for MAC.
929 in thses cases it is legitimate to switch to Viterbi.
930 However, selecting Viterbi from the outset is an abuse (abomination!),
931 should this 1st invocation of Viterbi fail, then we (FS) will overrule
932 the user and hammer the system with a massive memory request.
933 (Jos 2:19) If anyone goes outside your house into the street,
934 his blood will be on his own head; we will not be responsible. FS, r246 -> r247 */
935 rHhalignPara.iMacRamMB = 0;
938 rHhalignPara.iMacRamMB = REALLY_BIG_MEMORY_MB;
941 iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
942 ppcProfile2, piLeafCount[iR], pdWeightsR,
944 pcConsens1, pcReprsnt1,
945 pcConsens2, pcReprsnt2,
947 iAux_FS++, /* DEBUG ARGUMENT */
949 if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
950 1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
951 fprintf(stderr, "%s:%s:%d: problem in alignment, even Viterbi did not work\n"
952 "\thh-error-code=%d (mac-ram=%d)\n",
953 __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB);
954 Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
957 fprintf(stderr, "%s:%s:%d: 2nd attempt worked", __FUNCTION__, __FILE__, __LINE__);
959 rHhalignPara.iMacRamMB = iOldMacRam;
960 } /* 1st invocation failed */
962 } /* 1st profile was shorter than 2nd */
965 int iOldMacRam = rHhalignPara.iMacRamMB;
966 iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
967 ppcProfile1, piLeafCount[iL], pdWeightsL,
969 pcConsens2, pcReprsnt2,
970 pcConsens1, pcReprsnt1,
972 iAux_FS++, /* DEBUG ARGUMENT */
975 if (RETURN_OK != iHHret){ /* FS, r241 -> r243 */
977 "%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
978 "\thh-error-code=%d (mac-ram=%d)\n",
979 __FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
980 prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
981 iHHret, rHhalignPara.iMacRamMB);
982 /* at this stage hhalign() has failed,
983 the only thing we can do (easily) is to re-run it in Viterbi mode,
984 for this set MAC-RAM=0, set it back to its original value after 2nd try.
986 if (RETURN_FROM_MAC == iHHret){
988 rHhalignPara.iMacRamMB = 0;
991 rHhalignPara.iMacRamMB = REALLY_BIG_MEMORY_MB;
994 iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
995 ppcProfile1, piLeafCount[iL], pdWeightsL,
997 pcConsens2, pcReprsnt2,
998 pcConsens1, pcReprsnt1,
1000 iAux_FS++, /* DEBUG ARGUMENT */
1002 if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
1003 1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
1004 fprintf(stderr, "%s:%s:%d: problem in alignment, even Viterbi did not work\n"
1005 "\thh-error-code=%d (mac-ram=%d)\n",
1006 __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB);
1007 Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
1010 fprintf(stderr, "%s:%s:%d: 2nd attempt worked", __FUNCTION__, __FILE__, __LINE__);
1012 rHhalignPara.iMacRamMB = iOldMacRam;
1013 } /* 1st invocation failed */
1015 } /* 2nd profile was shorter than 1st */
1018 /* free left/right node lists,
1019 * after alignment left/right profiles no longer needed
1021 if (NULL != ppcCopy1){
1023 for (i = 0; i < piLeafCount[iL]; i++){
1024 CKFREE(ppcCopy1[i]);
1030 if (NULL != ppcCopy2){
1032 for (i = 0; i < piLeafCount[iR]; i++){
1033 CKFREE(ppcCopy2[i]);
1039 ppiLeafList[iL] = CKFREE(ppiLeafList[iL]);
1040 ppiLeafList[iR] = CKFREE(ppiLeafList[iR]);
1041 piLeafCount[iL] = piLeafCount[iR] = 0;
1043 } /* was a merge node */
1045 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1047 FILE *fp = LogGetFP(&rLog, LOG_DEBUG);
1048 for (i = 0; i < iNodeCount; i++){
1049 if (0 == piLeafCount[i]){
1052 fprintf(fp, "node %3d, #leaves=%d:\t", i, piLeafCount[i]);
1053 for (j = 0; ppiLeafList && (j < piLeafCount[i]); j++){
1054 fprintf(fp, "%d,", ppiLeafList[i][j]);
1061 } /* 0 <= iN < iNodeCount */
1062 ProgressDone(prProgress);
1065 /* check length and set length info
1067 iAlnLen = strlen(prMSeq->seq[0]);
1068 for (i=0; i<prMSeq->nseqs; i++) {
1070 Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: name %s; len %d; %s",
1071 i, prMSeq->sqinfo[i].name, strlen(prMSeq->seq[i]), prMSeq->seq[i]);
1075 assert(iAlnLen == strlen(prMSeq->seq[i]));
1077 prMSeq->sqinfo[i].len = iAlnLen;
1079 prMSeq->aligned = TRUE;
1082 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1085 Log(&rLog, LOG_DEBUG, "Alignment scores with HMM:");
1086 for (i = 0; /*pdScores[i] > 0.0*/i < prMSeq->nseqs; i++){
1087 Log(&rLog, LOG_DEBUG, "%2d:\t%f\n", i, pdScores[i]);
1093 /** translate back ambiguity residues
1094 * hhalign translates ambiguity codes (B,Z) into unknown residues (X).
1095 * as we still have the original input we can substitute them back
1097 TranslateUnknown2Ambiguity(prMSeq);
1098 ReAttachLeadingGaps(prMSeq, iProfProfSeparator);
1100 if (NULL == prHMMList){
1103 CKFREE(ppcProfile2);
1104 CKFREE(ppcProfile1);
1105 CKFREE(ppiLeafList[piOrderLR[DIFF_NODE*(iNodeCount-1)+PRNT_NODE]]);
1106 CKFREE(ppiLeafList);
1107 CKFREE(piLeafCount);
1109 FreeProgress(&prProgress);
1114 StopwatchStop(stopwatch);
1115 StopwatchDisplay(stdout, "Total time for HHalignWrapper():" , stopwatch);
1116 StopwatchFree(stopwatch);
1119 return dScore; /* FIXME alternative: return averaged pdScores */
1122 /*** end: HHalignWrapper() ***/