1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: hhalign_wrapper.c 306 2016-06-13 13:49:04Z fabian $
34 #include "hhalign/general.h"
35 #include "hhalign/hhfunc.h"
36 #include "hhalign/hhalign.h"
38 /* up to this level (from leaf) will background HMM info be applied */
39 #define APPLY_BG_HMM_UP_TO_TREE_DEPTH 10
48 * @note prHalignPara has to point to an already allocated instance
51 void SetDefaultHhalignPara(hhalign_para *prHhalignPara)
53 prHhalignPara->iMacRamMB = 8000; /* default|give just under 8GB to MAC algorithm, FS, 2016-04-18, went from 2GB to 8GB */
54 prHhalignPara->bIsDna = false; /* protein mode unless we say otherwise */
55 prHhalignPara->bIsRna = false;
56 prHhalignPara->pca = -UNITY;
57 prHhalignPara->pcb = -UNITY;
58 prHhalignPara->pcc = -UNITY;
59 prHhalignPara->pcw = -UNITY;
60 prHhalignPara->gapb = -UNITY;
61 prHhalignPara->gapd = -UNITY;
62 prHhalignPara->gape = -UNITY;
63 prHhalignPara->gapf = -UNITY;
64 prHhalignPara->gapg = -UNITY;
65 prHhalignPara->gaph = -UNITY;
66 prHhalignPara->gapi = -UNITY;
67 prHhalignPara->pcaV = -UNITY;
68 prHhalignPara->pcbV = -UNITY;
69 prHhalignPara->pccV = -UNITY;
70 prHhalignPara->pcwV = -UNITY;
71 prHhalignPara->gapbV = -UNITY;
72 prHhalignPara->gapdV = -UNITY;
73 prHhalignPara->gapeV = -UNITY;
74 prHhalignPara->gapfV = -UNITY;
75 prHhalignPara->gapgV = -UNITY;
76 prHhalignPara->gaphV = -UNITY;
77 prHhalignPara->gapiV = -UNITY;
80 /*** end: SetDefaultHhalignPara() ***/
85 * @brief get rid of unknown residues
87 * @note HHalignWrapper can be entered in 2 different ways: (i) all
88 * sequences are un-aligned (ii) there are 2 (aligned) profiles. in
89 * the un-aligned case (i) the sequences come straight from Squid,
90 * that is, they have been sanitised, all non-alphabetic residues
91 * have been rendered as X's. In profile mode (ii) one profile may
92 * have been produced internally. In that case residues may have
93 * been translated back into their 'native' form, that is, they may
94 * contain un-sanitised residues. These will cause trouble during
99 SanitiseUnknown(mseq_t *mseq)
102 int iS; /* iterator for sequence */
103 /*int iR;*/ /* iterator for residue */
104 /*int iLen;*/ /* length of sequence */
108 for (iS = 0; iS < mseq->nseqs; iS++){
110 for (pcRes = mseq->seq[iS]; '\0' != *pcRes; pcRes++){
113 /* don't like MSF gap characters ('~'),
114 sanitise them (and '.' and ' '); FS, r258 -> r259 */
119 if (mseq->seqtype==SEQTYPE_PROTEIN) {
120 if (NULL == strchr(AMINO_ALPHABET, toupper(*pcRes))) {
121 *pcRes = AMINOACID_ANY;
123 } else if (mseq->seqtype==SEQTYPE_DNA) {
124 if (NULL == strchr(DNA_ALPHABET, toupper(*pcRes))) {
125 *pcRes = NUCLEOTIDE_ANY;
127 } else if (mseq->seqtype==SEQTYPE_RNA) {
128 if (NULL == strchr(RNA_ALPHABET, toupper(*pcRes))) {
129 *pcRes = NUCLEOTIDE_ANY;
135 } /* 0 <= iS < mseq->nseqs */
139 } /*** end: SanitiseUnknown() ***/
142 * @brief translate unknown residues back to ambiguity codes;
143 * hhalign translates ambiguity codes (B,Z) into unknown residue (X).
144 * we still have the original (un-aligned) residue information,
145 * by iterating along the original and aligned sequences we can
146 * reconstruct where codes have been changed and restore them
147 * to their original value
149 * @param[in,out] mseq
150 * sequence/profile data, mseq->seq [in,out] is changed to conform
151 * with mseq->orig_seq [in]
155 TranslateUnknown2Ambiguity(mseq_t *mseq)
158 int iS; /* iterator for sequence */
159 int iR, iRo; /* iterator for residue (original) */
160 int iChange, iCase, iAmbi; /* counts how many replacements */
161 static int siOffset = 'a' - 'A';
163 for (iS = 0; iS < mseq->nseqs; iS++){
166 iChange = iCase = iAmbi = 0;
168 while(('\0' != mseq->seq[iS][iR]) &&
169 ('\0' != mseq->orig_seq[iS][iRo])) {
171 /* skip gaps in aligned sequences */
172 while(isgap(mseq->seq[iS][iR])) {
174 } /* was gap in unaligned seq
175 * this should probably not happen */
176 while(isgap(mseq->orig_seq[iS][iRo])) {
178 } /* was gap in aligned seq */
180 /* check if we reached the end of the sequence after
181 * skipping the gaps */
182 if ( ('\0' == mseq->seq[iS][iR]) ||
183 ('\0' == mseq->orig_seq[iS][iRo]) ){
188 if (mseq->seq[iS][iR] != mseq->orig_seq[iS][iRo]){
189 /* FIXME: count replacements, discard case changes */
191 if ( (mseq->seq[iS][iR] == mseq->orig_seq[iS][iRo]+siOffset) ||
192 (mseq->seq[iS][iR] == mseq->orig_seq[iS][iRo]-siOffset) ){
199 Log(&rLog, LOG_FORCED_DEBUG, "seq=%d, pos=(%d:%d), (%c:%c)",
201 mseq->seq[iS][iR], mseq->orig_seq[iS][iRo]);
203 mseq->seq[iS][iR] = mseq->orig_seq[iS][iRo];
211 Log(&rLog, LOG_DEBUG,
212 "in seq %d re-translated %d residue codes (%d true, %d case)",
213 iS, iChange, iAmbi, iCase);
215 /* assert that both sequences (un/aligned) have terminated */
216 /* skip gaps in aligned sequences */
217 while(isgap(mseq->seq[iS][iR])) {
219 } /* was gap in unaligned seq
220 * this should probably not happen */
221 while(isgap(mseq->orig_seq[iS][iRo])) {
223 } /* was gap in aligned seq */
226 if ( ('\0' != mseq->seq[iS][iR]) ||
227 ('\0' != mseq->orig_seq[iS][iRo]) ){
229 Log(&rLog, LOG_FATAL, "inconsistency in un/aligned sequence %d\n>%s\n>%s\n",
230 iS, mseq->seq[iS], mseq->orig_seq[iS]);
233 } /* 0 <= iS < mseq->nseqs */
235 } /*** end: TranslateUnknown2Ambiguity() ***/
239 * @brief re-attach leading and trailing gaps to alignment
241 * @param[in,out] prMSeq
242 * alignment structure (at this stage there should be no un-aligned sequences)
243 * @param[in] iProfProfSeparator
244 * gives sizes of input profiles, -1 if no input-profiles but un-aligned sequences
246 * @note leading and tailing profile columns
247 * that only contain gaps have no effect on the alignment
248 * and are removed during the alignment. if they are
249 * encountered a warning message is printed to screen.
250 * some users like to preserve these gap columns
254 ReAttachLeadingGaps(mseq_t *prMSeq, int iProfProfSeparator)
258 int iSize1 = 0; /* #seqs in 1st profile */
259 int iSize2 = 0; /* #seqs in 2nd profile */
260 int iPPS = iProfProfSeparator;
261 int iLeadO1 = 0; /* leading gaps in 1st seq of 1st profile */
262 int iLeadO2 = 0; /* leading gaps in 1st seq of 2nd profile */
263 int iLeadA1 = 0; /* leading gaps in 1st seq of final alignment */
264 int iLeadA2 = 0; /* leading gaps in PPS seq of final alignment */
265 int iTrailO1 = 0; /* trailing gaps in 1st seq of 1st profile */
266 int iTrailO2 = 0; /* trailing gaps in 1st seq of 2nd profile */
267 int iTrailA1 = 0; /* trailing gaps in 1st seq of final alignment */
268 int iTrailA2 = 0; /* trailing gaps in PPS seq of final alignment */
269 int iLen = 0; /* length of final alignment */
270 int iLen1 = 0; /* length of 1st profile */
271 int iLen2 = 0; /* length of 2nd profile */
272 int iCutHead = 0; /* make up truncation at head */
273 int iCutTail = 0; /* make up truncation at tail */
276 if (-1 == iProfProfSeparator){
280 assert(prMSeq->aligned);
281 assert(prMSeq->nseqs > iProfProfSeparator);
283 iSize1 = iProfProfSeparator;
284 iSize2 = prMSeq->nseqs - iProfProfSeparator;
285 iLen = strlen(prMSeq->seq[0]);
286 iLen1 = strlen(prMSeq->orig_seq[0]);
287 iLen2 = strlen(prMSeq->orig_seq[iPPS]);
289 /* count leading/trailing gaps in 1st sequence of 1st/2nd profile and final alignmant */
290 for (iLeadO1 = 0, pcIter = prMSeq->orig_seq[0]; isgap(*pcIter); pcIter++, iLeadO1++);
291 for (iLeadO2 = 0, pcIter = prMSeq->orig_seq[iPPS]; isgap(*pcIter); pcIter++, iLeadO2++);
292 for (iLeadA1 = 0, pcIter = prMSeq->seq[0]; isgap(*pcIter); pcIter++, iLeadA1++);
293 for (iLeadA2 = 0, pcIter = prMSeq->seq[iPPS]; isgap(*pcIter); pcIter++, iLeadA2++);
294 for (iTrailO1 = 0, pcIter = &prMSeq->orig_seq[0][iLen1-1]; isgap(*pcIter); pcIter--, iTrailO1++);
295 for (iTrailO2 = 0, pcIter = &prMSeq->orig_seq[iPPS][iLen2-1]; isgap(*pcIter); pcIter--, iTrailO2++);
296 for (iTrailA1 = 0, pcIter = &prMSeq->seq[0][iLen-1]; isgap(*pcIter); pcIter--, iTrailA1++);
297 for (iTrailA2 = 0, pcIter = &prMSeq->seq[iPPS][iLen-1]; isgap(*pcIter); pcIter--, iTrailA2++);
300 /* turn leading/trailing gaps into truncation */
301 iLeadO1 = iLeadO1 > iLeadA1 ? iLeadO1-iLeadA1 : 0;
302 iLeadO2 = iLeadO2 > iLeadA2 ? iLeadO2-iLeadA2 : 0;
303 iTrailO1 = iTrailO1 > iTrailA1 ? iTrailO1-iTrailA1 : 0;
304 iTrailO2 = iTrailO2 > iTrailA2 ? iTrailO2-iTrailA2 : 0;
305 iCutHead = iLeadO1 > iLeadO2 ? iLeadO1 : iLeadO2;
306 iCutTail = iTrailO1 > iTrailO2 ? iTrailO1 : iTrailO2;
308 /* re-allocate and shift memory */
309 if ( (iCutHead > 0) || (iCutTail > 0) ){ /* skip if no re-attachment, FS, r244 -> r245 */
310 for (i = 0; i < prMSeq->nseqs; i++){
312 CKREALLOC(prMSeq->seq[i], iLen+iCutHead+iCutTail+2);
313 if (iCutHead > 0){ /* skip if no re-attachment, FS, r244 -> r245 */
314 memmove(prMSeq->seq[i]+iCutHead, prMSeq->seq[i], iLen);
316 for (j = 0; j < iCutHead; j++){
317 prMSeq->seq[i][j] = '-';
319 for (j = iLen+iCutHead; j < iLen+iCutHead+iCutTail; j++){
320 prMSeq->seq[i][j] = '-';
322 prMSeq->seq[i][j] = '\0';
324 } /* (iCutHead > 0) || (iCutTail > 0) */
326 } /*** end: ReAttachLeadingGaps() ***/
329 * @brief reallocate enough memory for alignment and
330 * attach sequence pointers to profiles
332 * @param[in,out] mseq
333 * sequence/profile data, increase memory for sequences in profiles
334 * @param[out] ppcProfile1
335 * pointers to sequencese in 1st profile
336 * @param[out] ppcProfile2
337 * pointers to sequencese in 2nd profile
338 * @param[out] pdWeightsL
339 * weights (normalised to 1.0) for sequences in left profile
340 * @param[out] pdWeightsR
341 * weights (normalised to 1.0) for sequences in right profile
342 * @param[in] pdSeqWeights
343 * weights for _all_ sequences in alignment
344 * @param[in] iLeafCountL
345 * number of sequences in 1st profile
346 * @param[in] piLeafListL
347 * array of integer IDs of sequences in 1st profile
348 * @param[in] iLeafCountR
349 * number of sequences in 2nd profile
350 * @param[in] piLeafListR
351 * array of integer IDs of sequences in 2nd profile
355 PrepareAlignment(mseq_t *mseq, char **ppcProfile1, char **ppcProfile2,
356 double *pdWeightsL, double *pdWeightsR, double *pdSeqWeights,
357 int iLeafCountL, int *piLeafListL,
358 int iLeafCountR, int *piLeafListR)
361 int iLenL = 0; /* length of 1st profile */
362 int iLenR = 0; /* length of 2nd profile */
363 int iMaxLen = 0; /* maximum possible length of alignment */
365 double dWeight = 0.00;
366 double dWeightInv = 0.00;
369 assert(NULL!=ppcProfile1);
370 assert(NULL!=ppcProfile2);
371 assert(NULL!=piLeafListL);
372 assert(NULL!=piLeafListR);
374 /* get length of profiles,
375 * in a profile all sequences should have same length so only look at 1st
377 iLenL = strlen(mseq->seq[piLeafListL[0]]);
378 iLenR = strlen(mseq->seq[piLeafListR[0]]);
379 iMaxLen = iLenL + iLenR + 1;
381 /* reallocate enough memory for sequences in alignment (for adding
384 for (i = 0; i < iLeafCountL; i++){
385 mseq->seq[piLeafListL[i]] =
386 CKREALLOC(mseq->seq[piLeafListL[i]], iMaxLen);
388 for (i = 0; i < iLeafCountR; i++){
389 mseq->seq[piLeafListR[i]] =
390 CKREALLOC(mseq->seq[piLeafListR[i]], iMaxLen);
393 /* attach sequences to profiles
395 for (i = 0; i < iLeafCountL; i++){
396 ppcProfile1[i] = mseq->seq[piLeafListL[i]];
398 ppcProfile1[i] = NULL;
400 for (i = 0; i < iLeafCountR; i++){
401 ppcProfile2[i] = mseq->seq[piLeafListR[i]];
403 ppcProfile2[i] = NULL;
406 /* remove terminal 'X' for single sequences:
407 * it is a quirk of hhalign() to delete 2 individual sequences
408 * if 2 terminal X's meet during alignment,
409 * just replace (one of) them.
410 * this can be undone at the end.
411 * profiles -consisting of more than 1 sequence-
412 * appear to be all-right.
413 * there seems to be no problem with B's and Z's
415 if ( (1 == iLeafCountL) && (1 == iLeafCountR) ){
417 if ( ('X' == ppcProfile1[0][0]) && ('X' == ppcProfile2[0][0]) ){
419 ppcProfile1[0][0] = NOTX; /* FIXME: arbitrary assignment */
420 ppcProfile2[0][0] = NOTX; /* FIXME: arbitrary assignment */
422 if ( ('X' == ppcProfile1[0][iLenL-1]) && ('X' == ppcProfile2[0][iLenR-1]) ){
423 ppcProfile1[0][iLenL-1] = NOTX; /* FIXME: arbitrary assignment */
424 ppcProfile2[0][iLenR-1] = NOTX; /* FIXME: arbitrary assignment */
428 /* obtain sequence weights
430 if (NULL != pdSeqWeights){
433 for (i = 0; i < iLeafCountL; i++){
434 register double dAux = pdSeqWeights[piLeafListL[i]];
437 Log(&rLog, LOG_DEBUG, "seq-weight %d = %f", piLeafListL[i], dAux);
440 pdWeightsL[i] = dAux;
442 } /* 0 <= i < iLeafCountL */
443 dWeightInv = 1.00 / dWeight;
444 for (i = 0; i < iLeafCountL; i++){
445 pdWeightsL[i] *= dWeightInv;
449 for (i = 0; i < iLeafCountR; i++){
450 register double dAux = pdSeqWeights[piLeafListR[i]];
453 Log(&rLog, LOG_DEBUG, "seq-weight %d = %f", piLeafListR[i], dAux);
456 pdWeightsR[i] = dAux;
458 } /* 0 <= i < iLeafCountL */
459 dWeightInv = 1.00 / dWeight;
460 for (i = 0; i < iLeafCountR; i++){
461 pdWeightsR[i] *= dWeightInv;
463 } /* (NULL != pdSeqWeights) */
465 pdWeightsL[0] = pdWeightsR[0] = -1.00;
469 for (i = 0; i < iLeafCountL; i++){
470 Log(&rLog, LOG_FORCED_DEBUG, "ppcProfile1[%d/%d] pointing to mseq %d = %s",
471 i, iLeafCountR, piLeafListL[i], ppcProfile1[i]);
473 for (i = 0; i < iLeafCountR; i++){
474 Log(&rLog, LOG_FORCED_DEBUG, "ppcProfile2[%d/%d] pointing to mseq %d = %s",
475 i, iLeafCountR, piLeafListR[i], ppcProfile2[i]);
481 } /*** end: PrepareAlignment() ***/
485 * @brief PosteriorProbabilities() calculates posterior probabilities
486 * of aligning a single sequences on-to an alignment containing this sequence
489 * holds the aligned sequences [in]
490 * @param[in] rHMMalignment
491 * HMM of the alignment in prMSeq
492 * @param[in] rHhalignPara
493 * various parameters read from commandline, needed by hhalign()
494 * @param[in] pcPosteriorfile
495 * name of file into which posterior probability information is written
497 * @return score of the alignment FIXME what is this?
499 * @note the PP-loop can be parallelised easily FIXME
503 PosteriorProbabilities(mseq_t *prMSeq, hmm_light rHMMalignment, hhalign_para rHhalignPara, char *pcPosteriorfile)
508 int iS; /* iterator for sequences */
509 int iNseq = prMSeq->nseqs; /* number of sequences */
510 int iLenHMM = rHMMalignment.L; /* length of alignment */
511 int iViterbiCount = 0; /* counts how often Viterbi is triggered */
512 char **ppcCopy = NULL;
513 char **ppcRepresent = NULL;
514 char zcAux[10000] = {0};
515 char zcError[10000] = {0};
516 hhalign_scores *prHHscores = NULL;
519 pfPP = fopen(pcPosteriorfile, "w");
520 fprintf(pfPP, "#1.i\t2.name\t\t3.L1\t4.L2\t5.sum\t\t6.sum/L1\t7.HH\n");
523 prHHscores = CKCALLOC(iNseq, sizeof(hhalign_scores));
524 for (iS = 0; iS < iNseq; iS++){
525 memset(&(prHHscores[iS]), 0, sizeof(hhalign_scores));
528 ppcCopy = CKCALLOC(1, sizeof(char *));
529 ppcCopy[0] = CKCALLOC(2*iLenHMM, sizeof(char));
530 ppcRepresent = CKCALLOC(1, sizeof(char *));
531 ppcRepresent[0] = CKCALLOC(2*iLenHMM, sizeof(char));
533 for (i = 0; i < iLenHMM; i++){
534 ppcRepresent[0][i] = rHMMalignment.seq[rHMMalignment.ncons][i+1];
537 /* FIXME: this loop can be parallelised, FS r288 */
539 for (iS = 0; iS < iNseq; iS++){
541 /* single sequences may very well contain leading/trailing gaps,
542 * this will trigger warning in Transfer:hhalignment-C.h */
544 for (pcIter = prMSeq->orig_seq[iS]; ('-' == *pcIter) || ('.' == *pcIter); pcIter++);
545 strcpy(ppcCopy[0], pcIter/*prMSeq->orig_seq[iS]*/);
546 for (pcIter = &ppcCopy[0][strlen(ppcCopy[0])-1]; ('-' == *pcIter) || ('.' == *pcIter); pcIter--);
551 hhalign(ppcCopy, 1, NULL,
552 ppcRepresent, 0, NULL,
553 &dScore, &rHMMalignment, &rHMMalignment,
554 NULL, NULL, NULL, NULL,
555 rHhalignPara, &prHHscores[iS], 0/* debug */, FALSE /* Log-Level*/,
557 if (NULL != strstr(zcError, "Viterbi")){
561 } /* 0 <= iS < iNseq */
562 Log(&rLog, LOG_INFO, "Viterbi algorithm triggered %d times (out of %d)", iViterbiCount, iNseq-1);
564 for (iS = 0; iS < iNseq; iS++){
566 fprintf(pfPP, "%d\t%10s\t%3d"
567 , iS, prMSeq->sqinfo[iS].name, (int)(strlen(prMSeq->orig_seq[iS])));
568 fprintf(pfPP, "\t%3d\t%f\t%f\t%f",
569 prHHscores[iS].L, prHHscores[iS].sumPP, prHHscores[iS].sumPP/strlen(prMSeq->orig_seq[iS]), prHHscores[iS].hhScore);
571 } /* 0 <= iS < iNseq */
575 fclose(pfPP); pfPP = NULL;
576 free(ppcRepresent[0]); ppcRepresent[0] = NULL;
577 free(ppcCopy[0]); ppcCopy[0] = NULL;
578 free(ppcRepresent); ppcRepresent = NULL;
579 free(ppcCopy); ppcCopy = NULL;
580 free(prHHscores); prHHscores = NULL;
584 } /* this is the end of PosteriorProbabilities() */
588 * @brief sequentially align (chain) sequences
590 * @param[in,out] prMSeq
591 * holds the un-aligned sequences (in) and the final alignment (out)
592 * @param[in] rHhalignPara
593 * various parameters needed by hhalign()
594 * @param[in] iClustersize
595 * parameter that controls how often HMM is updated
597 * @note chained alignment takes much longer than balanced alignment
598 * because at every step ClustalO has to scan all previously aligned residues.
599 * for a balanced tree this takes N*log(N) time but for a chained tree
601 * This function has a short-cut, that the HMM need not be updated
602 * for every single alignment step, but the HMM from the previous
603 * step(s) can be re-cycled. The HMM is updated (i) at the very first step,
604 * (ii) if a gap has been inserted into the HMM during alignment or
605 * (iii) if the HMM has been used for too many steps without having
606 * been updated. This update-frequency is controlled by the input
607 * parameter iClustersize. iClustersize is the number of sequences used
608 * to build a HMM to allow for one non-updating step. For example,
609 * if iClustersize=100 and a HMM has been build from 100 sequences,
610 * then this HMM can be used once without updating. If the HMM has
611 * been built from 700 sequences (and iClustersize=100) then this
612 * HMM can be used 7-times without having to be updated, etc.
613 * For this reason the initial iClustersize sequences are always
614 * aligned with fully updated HMMs.
617 PileUp(mseq_t *prMSeq, hhalign_para rHhalignPara, int iClustersize)
619 /* @<variables local to PileUp> */
620 int iI; /* iterator for sequences */
621 int iCountPro = 0; /* count how many sequences are already in profile */
622 int *piLeafListSeq = NULL; /* lists required by PrepareAlignment() for attaching sequences pointers to ppcProfileSeq */
623 int *piLeafListPro = NULL; /* -"- ppcProfilePro */
624 double *pdWeightL = NULL; /* argument used by hhalign, not used here */
625 double *pdWeightR = NULL; /* -"- */
626 double *pdWeightP = NULL; /* -"- */
627 char **ppcProfileSeq = NULL; /* pointer to sequences in profile */
628 char **ppcProfilePro = NULL; /* pointer to sequences in profile */
629 double dScore = -1.0; /* alignment score, calculated by hhalign() */
630 hhalign_scores rHHscores = {0}; /* more explicit scores than dScore */
631 hmm_light *prHMM = NULL; /* HMM against which to align single sequence */
632 hmm_light rHMMprofile = {0}; /* HMM calculated externally from profile */
633 int iHHret = -1; /* return value of hhalign() */
634 char zcAux[10000] = {0}; /* auxilliary print string used by hhalign() */
635 char zcError[10000] = {0}; /* error message printed by hhalign() */
636 char *pcConsensPro = NULL, /* auxilliary sequence strings required by hhalign() */
637 *pcReprsntPro = NULL, /* not used here */
638 *pcConsensSeq = NULL, *pcReprsntSeq = NULL;
639 bool bHMM_stale = YES; /* flag if HMM has to be updated */
640 int iSinceLastUpdate = -1; /* counts how often HMM has been used since last update */
641 progress_t *prProgress; /* structure required for progress report */
642 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE; /* progress report */
643 char **ppcReprsnt = NULL; /* string used to represent HMM */
644 ppcReprsnt = CKCALLOC(1, sizeof(char *));
646 /* weights are not really used, but hhalign() expects them */
647 pdWeightL = (double *)CKMALLOC(prMSeq->nseqs * sizeof(double));
648 pdWeightR = (double *)CKMALLOC(prMSeq->nseqs * sizeof(double));
649 pdWeightP = (double *)CKMALLOC(prMSeq->nseqs * sizeof(double));
651 /* lists used for attaching ppcProfileSeq/ppcProfilePro to prMSeq */
652 piLeafListSeq = (int *)CKMALLOC(1 * sizeof(int));
653 piLeafListPro = (int *)CKMALLOC(prMSeq->nseqs * sizeof(int));
655 ppcProfileSeq = (char **)CKMALLOC(1 * sizeof(char *));
656 ppcProfileSeq[0] = NULL;
657 ppcProfilePro = (char **)CKMALLOC(prMSeq->nseqs * sizeof(char *));
658 for (iI = 0; iI < prMSeq->nseqs; iI++){
659 ppcProfilePro[iI] = NULL;
662 piLeafListPro[0] = 0; /* first sequences in profile is simply un-aligned sequence */
663 iCountPro = 1; /* 'profile' now contains 1 sequence */
665 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
666 "Progressive alignment progress", bPrintCR);
668 /* build up the initial alignment:
669 * sequences are chained but hhalign() is used normally,
670 * that is, one sequence is aligned to a profile */
671 for (iI = 1; iI < MIN(prMSeq->nseqs, iClustersize); iI++){
673 piLeafListSeq[0] = iI;
675 /* PrepareAlignment() connects ppcProfileSeq/ppcProfilePro and prMSeq */
676 PrepareAlignment(prMSeq, ppcProfilePro, ppcProfileSeq,
677 pdWeightL, pdWeightR, pdWeightP,
681 /* align 1 (5th argument) sequence to the growing profile
682 * (2nd argument number of sequences is iI),
683 * external HMM not used at this stage (prHMM=NULL) */
684 iHHret = hhalign(ppcProfilePro, iI, NULL/*pdWeightL*/,
685 ppcProfileSeq, 1, NULL/*pdWeightR*/,
686 &dScore, prHMM, prHMM,
687 pcConsensPro, pcReprsntPro,
688 pcConsensSeq, pcReprsntSeq,
689 rHhalignPara, &rHHscores,
690 CALL_FROM_REGULAR/* DEBUG ARGUMENT */, rLog.iLogLevelEnabled,
693 piLeafListPro[iI] = iI;
695 ProgressLog(prProgress, iI, prMSeq->nseqs-1, FALSE); /* FIXME: test! FS, r288 */
697 } /* 1 <= iI < MIN(prMSeq->nseqs, iClustersize) */
699 /* now align single sequences not to a profile but to a HMM of a potentially old profile */
700 while (iI < prMSeq->nseqs){
702 /* if HMM not up-to-date then create new HMM for profile:
703 * HMM is stale (i) at the beginning,
704 * (ii) if a gap has been inserted into the HMM during previous step,
705 * (iii) too many steps after the last up-date */
706 if ( (YES == bHMM_stale) ){
707 FreeHMMstruct(&rHMMprofile);
709 AlnToHMM2(&rHMMprofile, rHhalignPara, prMSeq->seq, iI)
711 Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will not proceed with chained alignment, step %d", iI);
714 iSinceLastUpdate = 0;
717 /* align single sequence to HMM */
718 piLeafListSeq[0] = iI;
720 PrepareAlignment(prMSeq, ppcProfilePro, ppcProfileSeq,
721 pdWeightL, pdWeightR, pdWeightP,
725 /* use representative sequence to track where gaps are inserted into HMM */
726 ppcReprsnt[0] = CKREALLOC(ppcReprsnt[0], strlen(ppcProfilePro[0])+strlen(ppcProfileSeq[0])+1);
727 memcpy(ppcReprsnt[0], rHMMprofile.seq[rHMMprofile.ncons]+1, rHMMprofile.L);
728 ppcReprsnt[0][rHMMprofile.L] = '\0';
730 /* align 1 (5th argument) sequence to a HMM (2nd argument number of sequences is '0',
731 * not iI as for a profile or '1' for one representative sequence),
732 * external HMM (rHMMprofile calculated by AlnToHMM2()) is used at this stage */
733 prHMM = &rHMMprofile;
734 hhalign(ppcReprsnt, 0, NULL,
735 ppcProfileSeq, 1, NULL,
736 &dScore, prHMM, prHMM,
737 pcConsensPro, pcReprsntPro,
738 pcConsensSeq, pcReprsntSeq,
739 rHhalignPara, &rHHscores,
740 CALL_FROM_ALN_HMM/* DEBUG ARGUMENT */, rLog.iLogLevelEnabled,
742 iSinceLastUpdate++; /* HMM has been used one more time */
743 ProgressLog(prProgress, iI, prMSeq->nseqs-1, FALSE); /* FIXME: test! FS, r288 */
745 /* check if gaps have been inserted into HMM.
746 * if this is the case then HMM is stale
747 * and the profile used to create the HMM must get gaps at the same positions.
748 * gaps are shown in the representative sequence */
749 if (rHMMprofile.L != strlen(ppcReprsnt[0])){
751 int iSeq; /* iterate through sequences of profile */
753 /* manually insert gaps into existing profile,
754 * there should already be enough memory */
756 /* all sequences in profile obtain gaps at the same position,
757 * can do this in parallel */
758 #pragma omp parallel for
760 for (iSeq = 0; iSeq < iI; iSeq++){
762 int iPtrRep, /* points to 'residue' in representative sequence */
763 iPtrPrf; /* points to the corresponding position in the profile */
765 for (iPtrRep = strlen(ppcReprsnt[0])-1, iPtrPrf = strlen(ppcProfilePro[iSeq])-1;
766 iPtrRep >= 0; iPtrRep--){
768 if ('-' == ppcReprsnt[0][iPtrRep]){ /* gaps are newly introduced into profile */
769 ppcProfilePro[iSeq][iPtrRep] = '-';
771 else { /* non-gap characters (residues) are shifted towards the back */
772 ppcProfilePro[iSeq][iPtrRep] = ppcProfilePro[iSeq][iPtrPrf];
775 if (iPtrRep == iPtrPrf){
776 /* if profile and representative seq point to same position then no more gaps */
779 } /* strlen(ppcReprsnt[0])-1 >= iPtrRep >= 0 */
780 ppcProfilePro[iSeq][strlen(ppcReprsnt[0])] = '\0';
782 } /* 0 <= iSeq < iI */
787 } /* strlen(represent) != HMM.L (gaps inserted into HMM) */
789 /* check if maximum number of sequences exceeded */
790 if ( (iClustersize <= 1) || (iSinceLastUpdate > (double)(rHMMprofile.N_in)/(double)(iClustersize)) ){
795 piLeafListPro[iI] = iI;
800 } /* iI < prMSeq->nseqs */
801 ProgressDone(prProgress);
802 FreeProgress(&prProgress);
804 FreeHMMstruct(&rHMMprofile);
805 CKFREE(pdWeightL); CKFREE(pdWeightR); CKFREE(pdWeightP);
806 if (NULL != ppcReprsnt){
807 if (NULL != ppcReprsnt[0]){
808 CKFREE(ppcReprsnt[0]);
812 CKFREE(piLeafListSeq); CKFREE(piLeafListPro);
816 } /* this is the end of PileUp() */
823 * @brief wrapper for hhalign. This is a frontend function to
824 * the ported hhalign code.
826 * @param[in,out] prMSeq
827 * holds the unaligned sequences [in] and the final alignment [out]
828 * @param[in] piOrderLR
829 * holds order in which sequences/profiles are to be aligned,
830 * even elements specify left nodes, odd elements right nodes,
831 * if even and odd are same then it is a leaf
832 * @param[in] pdSeqWeights
833 * Weight per sequence. No weights used if NULL
834 * @param[in] iNodeCount
835 * number of nodes in tree, piOrderLR has 2*iNodeCount elements
836 * @param[in] prHMMList
837 * List of background HMMs (transition/emission probabilities)
838 * @param[in] iHMMCount
839 * Number of input background HMMs
840 * @param[in] iProfProfSeparator
841 * Gives the number of sequences in the first profile, if in
842 * profile/profile alignment mode (iNodeCount==3). That assumes mseqs
843 * holds the sequences of profile 1 and profile 2.
844 * @param[in] rHhalignPara
845 * various parameters read from commandline
847 * @return score of the alignment FIXME what is this?
849 * @note complex function. could use some simplification, more and
850 * documentation and a struct'uring of piOrderLR
852 * @note HHalignWrapper can be entered in 2 different ways:
853 * (i) all sequences are un-aligned (ii) there are 2 (aligned) profiles.
854 * in the un-aligned case (i) the sequences come straight from Squid,
855 * that is, they have been sanitised, all non-alphabetic residues
856 * have been rendered as X's. In profile mode (ii) one profile may
857 * have been produced internally. In that case residues may have
858 * been translated back into their 'native' form, that is, they
859 * may contain un-sanitised residues. These will cause trouble
862 * @note: introduced argument hhalign_para rHhalignPara; FS, r240 -> r241
863 * @note: if hhalign() fails then try with Viterbi by setting MAC-RAM=0; FS, r241 -> r243
868 HHalignWrapper(mseq_t *prMSeq, int *piOrderLR,
869 double *pdSeqWeights, int iNodeCount,
870 hmm_light *prHMMList, int iHMMCount,
871 int iProfProfSeparator, hhalign_para rHhalignPara)
873 int iN; /* node iterator */
874 int *piLeafCount = NULL; /* number of leaves beneath a certain node */
875 int **ppiLeafList = NULL; /* list of leaves beneath a certain node */
876 char **ppcProfile1 = NULL; /* pointer to sequences in profile */
877 char **ppcProfile2 = NULL; /* pointer to sequences in profile */
878 char *pcReprsnt1 = NULL; /* representative of HMM aligned to left */
879 char *pcReprsnt2 = NULL; /* representative of HMM aligned to right */
880 char **ppcReprsnt1 = &pcReprsnt1; /* representative of HMM aligned to L */
881 char **ppcReprsnt2 = &pcReprsnt2; /* representative of HMM aligned to R */
882 char *pcConsens1 = NULL; /* copy of left sequence */
883 char *pcConsens2 = NULL; /* copy of right sequence */
884 char **ppcCopy1 = /*&pcCopy1*/NULL; /* copy of left sequences */
885 char **ppcCopy2 = /*&pcCopy2*/NULL; /* copy of right sequences */
886 double *pdScores = NULL; /* alignment scores (seq/HMM) */
887 double dScore = 0.0; /* alignment score (seq/HMM) */
889 char zcAux[10000] = {0};
890 char zcError[10000] = {0};
892 progress_t *prProgress;
893 int iAlnLen; /* alignment length */
894 double *pdWeightsL = NULL; /* sequence weights of left profile */
895 double *pdWeightsR = NULL; /* sequence weights of right profile */
896 int iMergeNodeCounter = 0;
897 hmm_light *prHMM = NULL;
898 hmm_light *prHMMleft = NULL;
899 hmm_light *prHMMrght = NULL;
900 hmm_light *prHMMnull = NULL;
901 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
903 char zcStopwatchMsg[1024];
904 Stopwatch_t *stopwatch = StopwatchCreate();
905 StopwatchZero(stopwatch);
906 StopwatchStart(stopwatch);
908 hhalign_scores rHHscores = {0};
910 #if 0 /* DEVEL 291 */
911 if (NULL != prHMMList) { /* FIXME DEVEL 291: replace this outer test with iHMMCount>0*/
913 Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
915 prHMM = &(prHMMList[0]); /* FIXME DEVEL 291: check for NULL */
917 /* FIXME: prHMM not allowed to be NULL and therefore pseudo allocated here */
918 prHMM = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
921 prHMMnull = (hmm_light *) CKCALLOC(1, sizeof(hmm_light));
922 if ( (iHMMCount > 0) && (NULL != prHMMList) ){
923 prHMM = &(prHMMList[0]);
925 Log(&rLog, LOG_WARN, "FIXME: Using only first of %u HMMs (needs implementation)", iHMMCount);
929 prHMM = prHMMnull; /* prHMM not allowed to be NULL and therefore assigned to pseudo allocated */
933 assert(NULL!=prMSeq);
935 if (NULL==piOrderLR) {
936 assert(3==iNodeCount);
938 SanitiseUnknown(prMSeq);
940 /* hhalign was not made for DNA/RNA. So warn if sequences are not
943 if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
944 /*Log(&rLog, LOG_WARN, "%s alignment is still experimental.",
945 SeqTypeToStr(prMSeq->seqtype));*/
946 if(prMSeq->seqtype == SEQTYPE_DNA)
947 rHhalignPara.bIsDna = true;
948 if(prMSeq->seqtype == SEQTYPE_RNA)
949 rHhalignPara.bIsRna = true;
952 /* hhalign produces funny results if sequences contain gaps, so
953 * dealign. Only way to use alignment info is to use it as a
956 if (TRUE == prMSeq->aligned) {
957 Log(&rLog, LOG_DEBUG, "Dealigning aligned sequences (inside %s)", __FUNCTION__);
962 Log(&rLog, LOG_FORCED_DEBUG, "iNodeCount = %d", iNodeCount);
966 /* allocate top-level memory for leaf tracking arrays and profiles,
967 * and sequence weights*/
968 piLeafCount = CKCALLOC(iNodeCount, sizeof(int));
969 ppiLeafList = CKCALLOC(iNodeCount, sizeof(int *));
970 ppcProfile1 = CKCALLOC(prMSeq->nseqs*2-1, sizeof(char *));
971 ppcProfile2 = CKCALLOC(prMSeq->nseqs*2-1, sizeof(char *));
972 pdScores = CKCALLOC(iNodeCount, sizeof(double));
973 pdWeightsL = CKCALLOC(iNodeCount, sizeof(double));
974 pdWeightsR = CKCALLOC(iNodeCount, sizeof(double));
976 NewProgress(&prProgress, LogGetFP(&rLog, LOG_INFO),
977 "Progressive alignment progress", bPrintCR);
981 /* Profile-profile alignment? Then setup piLeafCount and
982 * piLeafList here. FIXME this is just an awful haaaack
984 if (iNodeCount==3 && NULL==piOrderLR) {
985 int iSizeProf1 = iProfProfSeparator;
986 int iSizeProf2 = prMSeq->nseqs - iProfProfSeparator;
988 piLeafCount[0] = iSizeProf1;
989 ppiLeafList[0] = (int *)CKMALLOC(iSizeProf1 * sizeof(int));
990 for (i=0;i<iSizeProf1;i++) {
991 ppiLeafList[0][i] = i;
993 piLeafCount[1] = iSizeProf2;
994 ppiLeafList[1] = (int *)CKMALLOC(iSizeProf2 * sizeof(int));
995 for (i=0;i<iSizeProf2;i++) {
996 ppiLeafList[1][i] = i+iSizeProf1;
999 /* awful hack inside an awful hack: we had to setup piLeafCount
1000 * and piLeafList outside the node iteration. this which is
1001 * normally done at leaf level inside the node iteration. to
1002 * avoid overwriting the already setup vars set...
1006 piOrderLR = (int *)CKCALLOC(DIFF_NODE, sizeof(int));
1007 piOrderLR[LEFT_NODE] = 0;
1008 piOrderLR[RGHT_NODE] = 1;
1009 piOrderLR[PRNT_NODE] = 2;
1014 iMergeNodeCounter = 0;
1015 for (iN = 0; iN < iNodeCount; iN++){
1017 register int riAux = DIFF_NODE * iN;
1019 /*LOG_DEBUG("node %d ", iN);*/
1021 if (piOrderLR[riAux+LEFT_NODE] == piOrderLR[riAux+RGHT_NODE]){
1023 register int riLeaf = piOrderLR[riAux+LEFT_NODE];
1025 if (NULL == pdSeqWeights) {
1026 Log(&rLog, LOG_FORCED_DEBUG, "node %d is a leaf with entry %d (seq %s)",
1027 iN, riLeaf, prMSeq->sqinfo[riLeaf].name);
1029 Log(&rLog, LOG_FORCED_DEBUG, "node %d is a leaf with entry %d (seq %s) and weight %f",
1030 iN, riLeaf, prMSeq->sqinfo[riLeaf].name, pdSeqWeights[riLeaf]);
1033 /* left/right entry same, this is a leaf */
1034 piLeafCount[piOrderLR[riAux+PRNT_NODE]] = 1; /* number of leaves is '1' */
1035 ppiLeafList[piOrderLR[riAux+PRNT_NODE]] = (int *)CKMALLOC(1 * sizeof(int));
1036 ppiLeafList[piOrderLR[riAux+PRNT_NODE]][0] = riLeaf;
1040 int iL, iR, iP; /* ID of left/right nodes, parent */
1043 Log(&rLog, LOG_DEBUG,
1044 "merge profiles at node %d", iN, piOrderLR[riAux]);
1046 /* iNodeCount - prMSeq->nseqs = total # of merge-nodes
1047 * unless in profile/profile alignment mode
1049 if (1 == iNodeCount) {
1050 ProgressLog(prProgress, ++iMergeNodeCounter,
1053 ProgressLog(prProgress, ++iMergeNodeCounter,
1054 iNodeCount - prMSeq->nseqs, FALSE);
1057 /* left/right entry are not same, this is a merge node */
1058 iL = piOrderLR[riAux+LEFT_NODE];
1059 iR = piOrderLR[riAux+RGHT_NODE];
1060 iP = piOrderLR[riAux+PRNT_NODE];
1061 piLeafCount[iP] = piLeafCount[iL] + piLeafCount[iR];
1062 ppiLeafList[iP] = (int *)CKMALLOC(piLeafCount[iP] * sizeof(int));
1064 for (i = j = 0; i < piLeafCount[iL]; i++, j++){
1065 ppiLeafList[iP][j] = ppiLeafList[iL][i];
1067 for (i = 0; i < piLeafCount[iR]; i++, j++){
1068 ppiLeafList[iP][j] = ppiLeafList[iR][i];
1071 /* prepare simulation arena:
1072 * - make sure enough memory in sequences
1073 * - attach sequence pointers to profiles
1075 /* idea: switch template and query according to nseq? */
1077 PrepareAlignment(prMSeq, ppcProfile1, ppcProfile2,
1078 pdWeightsL, pdWeightsR, pdSeqWeights,
1079 piLeafCount[iL], ppiLeafList[iL],
1080 piLeafCount[iR], ppiLeafList[iR]);
1081 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1083 FILE *fp = LogGetFP(&rLog, LOG_DEBUG);
1084 Log(&rLog, LOG_DEBUG, "merging profiles %d & %d", iL, iR);
1085 for (i = 0; i < piLeafCount[iL]; i++){
1086 fprintf(fp, "L/#=%3d (ID=%3d, w=%f): %s\n",
1087 i, ppiLeafList[iL][i], pdWeightsL[i], ppcProfile1[i]);
1089 for (i = 0; i < piLeafCount[iR]; i++){
1090 fprintf(fp, "R/#=%3d (ID=%3d, w=%f): %s\n",
1091 i, ppiLeafList[iR][i], pdWeightsR[i], ppcProfile2[i]);
1095 #if 1 /* DEVEL 291 */
1097 * Note: if there is a HMM-batch file, then prMSeq->ppiHMMBindex is not NULL;
1098 * ppiLeafList[iL/iR][0] is the 'lead' sequence in a profile;
1099 * prMSeq->ppiHMMBindex[ppiLeafList[iL][0]] are the HMMs associated with the lead sequence;
1100 * this could be NULL if there are no HMMs associated with this particular sequence
1101 * at the moment only 1st HMM can be used, prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0];
1102 * the index of this HMM can be '-1' if the specified HMM file does not exist
1103 * Note: we only use prHMMleft/prHMMrght, even if global HMM (--hmm-in) is specified
1105 if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) &&
1106 (prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] > -1) ){
1107 prHMMleft = &(prHMMList[prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0]]);
1109 else if (iHMMCount > 0){
1113 prHMMleft = prHMMnull;
1116 if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) &&
1117 (prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] > -1) ){
1118 prHMMrght = &(prHMMList[prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0]]);
1120 else if (iHMMCount > 0){
1124 prHMMrght = prHMMnull;
1128 /* align individual sequences to HMM;
1129 * - use representative sequence to get gapping
1130 * - create copies of both, individual/representative sequences
1131 * as we don't want to introduce gaps into original
1133 * FIXME: representative sequence is crutch, should use
1134 * full HMM but that does not seem to work at all
1135 * -- try harder! Fail better!
1137 /*if ( (piLeafCount[iL] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){*/
1138 if (0 != prHMMleft->L){
1140 pcReprsnt1 = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
1141 for (i = 0; i < prHMMleft->L; i++){
1142 pcReprsnt1[i] = prHMMleft->seq[prHMMleft->ncons][i+1];
1144 ppcCopy1 = CKCALLOC(piLeafCount[iL], sizeof(char *));
1145 for (j = 0; j < piLeafCount[iL]; j++){
1146 ppcCopy1[j] = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
1147 for (i = 0; i < (int) strlen(ppcProfile1[0]); i++){
1148 ppcCopy1[j][i] = ppcProfile1[j][i];
1153 /* the size of the elements in the forward/backward matrices
1154 depends very much on the lengths of the profiles _and_
1155 in which position (1st/2nd) the longer/shorter profile/HMM is.
1156 the matrix elements can easily exceed the size of a (long?) double
1157 if the longer profile/HMM is associated with the query (q) and the
1158 shorter with the target (t).
1159 FIXME: however, pseudo-count adding may also depend on position,
1160 this is only really tested for the HMM being in the 1st position (q)
1161 MUST TEST THIS MORE THOROUGHLY
1163 this switch appears to be most easily (although unelegantly)
1164 effected here. Don't want to do it (upstairs) in PrepareAlignment()
1165 because it might jumble up the nodes. Don't want to do it in hhalign()
1166 either because ppcProfile1/2 and q/t may be used independently.
1169 int iLenA = strlen(ppcCopy1[0]);
1170 int iLenH = prHMMleft->L;
1174 iHHret = hhalign(ppcReprsnt1, 0/* only one representative seq*/, NULL,
1175 ppcCopy1, piLeafCount[iL], pdWeightsL,
1176 &dScore, prHMMleft, prHMMleft,
1177 NULL, NULL, NULL, NULL,
1178 rHhalignPara, &rHHscores,
1179 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1183 iHHret = hhalign(ppcCopy1, piLeafCount[iL], pdWeightsL,
1184 ppcReprsnt1, 0/* only one representative seq*/, NULL,
1185 &dScore, prHMMleft, prHMMleft,
1186 NULL, NULL, NULL, NULL,
1187 rHhalignPara, &rHHscores,
1188 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1191 if ( (0 != iHHret) && (rLog.iLogLevelEnabled <= LOG_VERBOSE) ){ /* FS, r255 -> */
1192 fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
1193 "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\n%s\nCARRY ON REGARDLESS\n",
1194 __FUNCTION__, __FILE__, __LINE__, iHHret,
1195 piLeafCount[iL], (int)strlen(ppcCopy1[0]), prMSeq->sqinfo[ppiLeafList[iL][0]].name,
1196 (int)strlen(ppcReprsnt1[0]), zcError);
1199 pdScores[ppiLeafList[iL][0]] = dScore;
1201 printf("score: %f\nL: %s\nH: %s\n",
1202 dScore, ppcCopy1[0], ppcReprsnt1[0]);
1204 /* assemble 'consensus';
1205 * this is not a real consensus, it is more a gap indicator,
1206 * for each position it consists of residues/gaps in the 1st sequences,
1207 * or a residue (if any) of the other sequences.
1208 * it only contains a gap if all sequences of the profile
1209 * have a gap at this position
1211 pcConsens1 = CKCALLOC(prHMMleft->L+strlen(ppcProfile1[0])+1, sizeof(char));
1212 for (i = 0; i < prHMMleft->L; i++){
1213 for (j = 0, pcConsens1[i] = '-'; (j < piLeafCount[iL]) && ('-' == pcConsens1[i]); j++){
1214 pcConsens1[i] = ppcCopy1[j][i];
1218 for (j = 0; (j < piLeafCount[iL]); j++){
1219 printf("L%d:%s\n", j, ppcCopy1[j]);
1221 printf("LC:%s\n", pcConsens1);
1223 } /* ( (1 == piLeafCount[iL]) && (0 != prHMM->L) ) */
1225 /*if ( (piLeafCount[iR] <= APPLY_BG_HMM_UP_TO_TREE_DEPTH) && (0 != prHMM->L) ){*/
1226 if (0 != prHMMrght->L){
1229 pcReprsnt2 = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
1230 for (i = 0; i < prHMMrght->L; i++){
1231 pcReprsnt2[i] = prHMMrght->seq[prHMMrght->ncons][i+1];
1233 ppcCopy2 = CKCALLOC(piLeafCount[iR], sizeof(char *));
1234 for (j = 0; j < piLeafCount[iR]; j++){
1235 ppcCopy2[j] = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
1236 for (i = 0; i < (int) strlen(ppcProfile2[0]); i++){
1237 ppcCopy2[j][i] = ppcProfile2[j][i];
1242 /* the size of the elements in the forward/backward matrices
1243 depends very much on the lengths of the profiles _and_
1244 in which position (1st/2nd) the longer/shorter profile/HMM is.
1245 the matrix elements can easily exceed the size of a (long?) double
1246 if the longer profile/HMM is associated with the query (q) and the
1247 shorter with the target (t).
1248 FIXME: however, pseudo-count adding may also depend on position,
1249 this is only really tested for the HMM being in the 1st position (q)
1250 MUST TEST THIS MORE THOROUGHLY
1252 this switch appears to be most easily (although unelegantly)
1253 effected here. Don't want to do it (upstairs) in PrepareAlignment()
1254 because it might jumble up the nodes. Don't want to do it in hhalign()
1255 either because ppcProfile1/2 and q/t may be used independently.
1258 int iLenA = strlen(ppcCopy2[0]);
1259 int iLenH = prHMMrght->L;
1263 iHHret = hhalign(ppcReprsnt2, 0/* only one representative seq */, NULL,
1264 ppcCopy2, piLeafCount[iR], pdWeightsR,
1265 &dScore, prHMMrght, prHMMrght,
1266 NULL, NULL, NULL, NULL,
1267 rHhalignPara, &rHHscores,
1268 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1272 iHHret = hhalign(ppcCopy2, piLeafCount[iR], pdWeightsR,
1273 ppcReprsnt2, 0/* only one representative seq */, NULL,
1274 &dScore, prHMMrght, prHMMrght,
1275 NULL, NULL, NULL, NULL,
1276 rHhalignPara, &rHHscores,
1277 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1280 if ( (0 != iHHret) && (rLog.iLogLevelEnabled <= LOG_VERBOSE) ){ /* FS, r255 -> */
1281 fprintf(stderr, "%s:%s:%d: (not essential) HMM pre-alignment failed, error %d, \n"
1282 "\t#=%d (len=%d), lead-seq=%s, len(HMM)=%d\n%s\nCARRY ON REGARDLESS\n",
1283 __FUNCTION__, __FILE__, __LINE__, iHHret,
1284 piLeafCount[iR], (int)strlen(ppcCopy2[0]), prMSeq->sqinfo[ppiLeafList[iR][0]].name,
1285 (int)strlen(ppcReprsnt2[0]), zcError);
1288 pdScores[ppiLeafList[iR][0]] = dScore;
1290 printf("H: %s\nR: %s\nscore: %f\n",
1291 ppcReprsnt2[0], ppcCopy2[0], dScore);
1293 /* assemble 'consensus';
1294 * this is not a real consensus, it is more a gap indicator,
1295 * for each position it consists of residues/gaps in the 1st sequences,
1296 * or a residue (if any) of the other sequences.
1297 * it only contains a gap if all sequences of the profile
1298 * have a gap at this position
1300 pcConsens2 = CKCALLOC(prHMMrght->L+strlen(ppcProfile2[0])+1, sizeof(char));
1301 for (i = 0; i < prHMMrght->L; i++){
1302 for (j = 0, pcConsens2[i] = '-'; (j < piLeafCount[iR]) && ('-' == pcConsens2[i]); j++){
1303 pcConsens2[i] = ppcCopy2[j][i];
1307 for (j = 0; (j < piLeafCount[iR]); j++){
1308 printf("R%d:%s\n", j, ppcCopy2[j]);
1310 printf("RC:%s\n", pcConsens2);
1312 } /* ( (1 == piLeafCount[iR]) && (0 != prHMM->L) ) */
1316 /* do alignment here (before free)
1319 /* the size of the elements in the forward/backward matrices
1320 depends very much on the lengths of the profiles _and_
1321 in which position (1st/2nd) the longer/shorter profile is.
1322 the matrix elements can easily exceed the size of a (long?) double
1323 if the longer profile is associated with the query (q) and the
1324 shorter with the target (t).
1325 this switch appears to be most easily (although unelegantly)
1326 effected here. Don't want to do it (upstairs) in PrepareAlignment()
1327 because it might jumble up the nodes. Don't want to do it in hhalign()
1328 either because ppcProfile1/2 and q/t may be used independently.
1331 int iLen1 = strlen(ppcProfile1[0]);
1332 int iLen2 = strlen(ppcProfile2[0]);
1333 /* potential problem with empty profiles, FS, r249 -> r250 */
1334 if ( (0 == iLen1) || (0 == iLen2) ){
1335 Log(&rLog, LOG_FATAL, "strlen(prof1)=%d, strlen(prof2)=%d -- nothing to align\n",
1341 int iOldMacRam = rHhalignPara.iMacRamMB;
1342 iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
1343 ppcProfile2, piLeafCount[iR], pdWeightsR,
1344 &dScore, prHMMleft, prHMMrght,
1345 pcConsens1, pcReprsnt1,
1346 pcConsens2, pcReprsnt2,
1347 rHhalignPara, &rHHscores,
1348 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1351 if (RETURN_OK != iHHret){ /* FS, r241 -> */
1352 /*fprintf(stderr, "%s:%d: emergency EXIT\n", __FILE__, __LINE__); exit(-1);*/
1354 "%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
1355 "\thh-error-code=%d (mac-ram=%d)\n%s",
1356 __FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
1357 prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
1358 iHHret, rHhalignPara.iMacRamMB, zcError);
1359 /* at this stage hhalign() has failed,
1360 the only thing we can do (easily) is to re-run it in Viterbi mode,
1361 for this set MAC-RAM=0, set it back to its original value after 2nd try.
1364 if (RETURN_FROM_MAC == iHHret){
1365 /* Note: the default way to run hhalign() is to initially select MAC
1366 by giving it all the memory it needs. MAC may fail due to overflow (repeats?).
1367 alternatively, the problem may be (genuinely) too big for MAC.
1368 in thses cases it is legitimate to switch to Viterbi.
1369 However, selecting Viterbi from the outset is an abuse (abomination!),
1370 should this 1st invocation of Viterbi fail, then we (FS) will overrule
1371 the user and hammer the system with a massive memory request.
1372 (Jos 2:19) If anyone goes outside your house into the street,
1373 his blood will be on his own head; we will not be responsible. FS, r246 -> r247 */
1374 rHhalignPara.iMacRamMB = 0;
1377 rHhalignPara.iMacRamMB = REALLY_BIG_MEMORY_MB;
1380 iHHret = hhalign(ppcProfile1, piLeafCount[iL], pdWeightsL,
1381 ppcProfile2, piLeafCount[iR], pdWeightsR,
1382 &dScore, prHMMleft, prHMMrght,
1383 pcConsens1, pcReprsnt1,
1384 pcConsens2, pcReprsnt2,
1385 rHhalignPara, &rHHscores,
1386 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1388 if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
1389 1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
1390 fprintf(stderr, "%s:%s:%d: problem in alignment, Viterbi did not work\n"
1391 "\thh-error-code=%d (mac-ram=%d)\n%s",
1392 __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB, zcError);
1393 Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
1396 fprintf(stderr, "%s:%s:%d: 2nd attempt worked", __FUNCTION__, __FILE__, __LINE__);
1398 rHhalignPara.iMacRamMB = iOldMacRam;
1399 } /* 1st invocation failed */
1401 } /* 1st profile was shorter than 2nd */
1404 int iOldMacRam = rHhalignPara.iMacRamMB;
1405 iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
1406 ppcProfile1, piLeafCount[iL], pdWeightsL,
1407 &dScore, prHMMrght, prHMMleft,
1408 pcConsens2, pcReprsnt2,
1409 pcConsens1, pcReprsnt1,
1410 rHhalignPara, &rHHscores,
1411 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1414 if (RETURN_OK != iHHret){ /* FS, r241 -> r243 */
1415 /*fprintf(stderr, "%s:%d: emergency EXIT\n", __FILE__, __LINE__); exit(-1);*/
1417 "%s:%s:%d: problem in alignment (profile sizes: %d + %d) (%s + %s), forcing Viterbi\n"
1418 "\thh-error-code=%d (mac-ram=%d)\n%s",
1419 __FUNCTION__, __FILE__, __LINE__, piLeafCount[iL], piLeafCount[iR],
1420 prMSeq->sqinfo[ppiLeafList[iL][0]].name, prMSeq->sqinfo[ppiLeafList[iR][0]].name,
1421 iHHret, rHhalignPara.iMacRamMB, zcError);
1422 /* at this stage hhalign() has failed,
1423 the only thing we can do (easily) is to re-run it in Viterbi mode,
1424 for this set MAC-RAM=0, set it back to its original value after 2nd try.
1427 if (RETURN_FROM_MAC == iHHret){
1429 rHhalignPara.iMacRamMB = 0;
1432 rHhalignPara.iMacRamMB = REALLY_BIG_MEMORY_MB;
1435 iHHret = hhalign(ppcProfile2, piLeafCount[iR], pdWeightsR,
1436 ppcProfile1, piLeafCount[iL], pdWeightsL,
1437 &dScore, prHMMrght, prHMMleft,
1438 pcConsens2, pcReprsnt2,
1439 pcConsens1, pcReprsnt1,
1440 rHhalignPara, &rHHscores,
1441 iAux_FS++, /* DEBUG ARGUMENT */ rLog.iLogLevelEnabled,
1443 if (RETURN_OK != iHHret){ /* at this stage hhalign() has failed twice,
1444 1st time MAC, 2nd time Viterbi, don't know what to do else. FS, r241 -> r243 */
1445 fprintf(stderr, "%s:%s:%d: problem in alignment, Viterbi did not work\n"
1446 "\thh-error-code=%d (mac-ram=%d)\n%s",
1447 __FUNCTION__, __FILE__, __LINE__, iHHret, rHhalignPara.iMacRamMB, zcError);
1448 Log(&rLog, LOG_FATAL, "could not perform alignment -- bailing out\n");
1451 fprintf(stderr, "%s:%s:%d: 2nd attempt worked", __FUNCTION__, __FILE__, __LINE__);
1453 rHhalignPara.iMacRamMB = iOldMacRam;
1454 } /* 1st invocation failed */
1456 } /* 2nd profile was shorter than 1st */
1459 * at this stage have performed alignment of 2 profiles/sequences.
1460 * if HMM batch information had been used then have choices:
1461 * (i) if HMM info only intended for initial alignment (of sequences) then make both HMMs stale;
1462 * (iia) if alignment of 2 profiles/sequences where same HMM used, then retain;
1463 * (iib) if alignment of 2 profiles/sequences where different HMMs used, then make both stale;
1464 * (iii) some majority voting
1466 #if 0 /* always make HMM batch stale (after 1st invocation) */
1467 if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) ){
1468 prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1;
1470 if ( (NULL != prMSeq->ppiHMMBindex) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) ){
1471 prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1;
1473 #else /* retain HMMs if they were the same for both profiles */
1474 if (NULL != prMSeq->ppiHMMBindex) {
1477 if ( (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iL][0]]) && (NULL != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]]) ){
1478 if ( prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] == -1){
1479 prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1; /* this is conservative, could do H[iL] = H[iR] */
1480 for (i = 0; i < piLeafCount[iR]; i++){
1481 prMSeq->ppiHMMBindex[ppiLeafList[iR][i]][0] = -1;
1484 else if ( prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] == -1){
1485 prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1; /* this is conservative, could do H[iR] = H[iL] */
1486 for (i = 0; i < piLeafCount[iL]; i++){
1487 prMSeq->ppiHMMBindex[ppiLeafList[iL][i]][0] = -1;
1490 else if (prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] != prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0]){
1491 prMSeq->ppiHMMBindex[ppiLeafList[iL][0]][0] = -1; /* this is NOT conservative, mandatory */
1492 for (i = 0; i < piLeafCount[iL]; i++){
1493 prMSeq->ppiHMMBindex[ppiLeafList[iL][i]][0] = -1;
1495 prMSeq->ppiHMMBindex[ppiLeafList[iR][0]][0] = -1; /* this is NOT conservative, mandatory */
1496 for (i = 0; i < piLeafCount[iR]; i++){
1497 prMSeq->ppiHMMBindex[ppiLeafList[iR][i]][0] = -1;
1501 /* void, HMMs should be same */
1504 } /* there was a HMM batch */
1507 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1510 printf("@@iL=%d, #(iL)=%d, iR=%d, #(iR)=%d\n", iL, piLeafCount[iL], iR, piLeafCount[iR]);
1511 for (i = 0; i < piLeafCount[iL]; i++){
1512 char *pc = ppcProfile1[i];
1513 printf("@@>%s\n", prMSeq->sqinfo[ppiLeafList[iL][i]].name);
1516 printf("%c", toupper(*pc));
1521 for (i = 0; i < piLeafCount[iR]; i++){
1522 char *pc = ppcProfile2[i];
1523 printf("@@>%s\n", prMSeq->sqinfo[ppiLeafList[iR][i]].name);
1526 printf("%c", toupper(*pc));
1535 /* free left/right node lists,
1536 * after alignment left/right profiles no longer needed
1538 if (NULL != ppcCopy1){
1540 for (i = 0; i < piLeafCount[iL]; i++){
1541 CKFREE(ppcCopy1[i]);
1547 if (NULL != ppcCopy2){
1549 for (i = 0; i < piLeafCount[iR]; i++){
1550 CKFREE(ppcCopy2[i]);
1556 ppiLeafList[iL] = CKFREE(ppiLeafList[iL]);
1557 ppiLeafList[iR] = CKFREE(ppiLeafList[iR]);
1558 piLeafCount[iL] = piLeafCount[iR] = 0;
1560 } /* was a merge node */
1562 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1564 FILE *fp = LogGetFP(&rLog, LOG_DEBUG);
1565 for (i = 0; i < iNodeCount; i++){
1566 if (0 == piLeafCount[i]){
1569 fprintf(fp, "node %3d, #leaves=%d:\t", i, piLeafCount[i]);
1570 for (j = 0; ppiLeafList && (j < piLeafCount[i]); j++){
1571 fprintf(fp, "%d,", ppiLeafList[i][j]);
1578 } /* 0 <= iN < iNodeCount */
1579 ProgressDone(prProgress);
1582 /* check length and set length info
1584 iAlnLen = strlen(prMSeq->seq[0]);
1585 for (i=0; i<prMSeq->nseqs; i++) {
1587 Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: name %s; len %d; %s",
1588 i, prMSeq->sqinfo[i].name, strlen(prMSeq->seq[i]), prMSeq->seq[i]);
1592 assert(iAlnLen == strlen(prMSeq->seq[i]));
1594 prMSeq->sqinfo[i].len = iAlnLen;
1596 prMSeq->aligned = TRUE;
1599 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1602 Log(&rLog, LOG_DEBUG, "Alignment scores with HMM:");
1603 for (i = 0; /*pdScores[i] > 0.0*/i < prMSeq->nseqs; i++){
1604 Log(&rLog, LOG_DEBUG, "%2d:\t%f\n", i, pdScores[i]);
1610 /** translate back ambiguity residues
1611 * hhalign translates ambiguity codes (B,Z) into unknown residues (X).
1612 * as we still have the original input we can substitute them back
1614 TranslateUnknown2Ambiguity(prMSeq);
1615 ReAttachLeadingGaps(prMSeq, iProfProfSeparator);
1617 #if 0 /* DEVEL 291 */
1618 if (NULL == prHMMList){
1624 CKFREE(ppcProfile2);
1625 CKFREE(ppcProfile1);
1626 CKFREE(ppiLeafList[piOrderLR[DIFF_NODE*(iNodeCount-1)+PRNT_NODE]]);
1627 CKFREE(ppiLeafList);
1628 CKFREE(piLeafCount);
1630 FreeProgress(&prProgress);
1635 StopwatchStop(stopwatch);
1636 StopwatchDisplay(stdout, "Total time for HHalignWrapper():" , stopwatch);
1637 StopwatchFree(stopwatch);
1640 return dScore; /* FIXME alternative: return averaged pdScores */
1643 /*** end: HHalignWrapper() ***/