1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: clustal-omega.c 304 2016-06-13 13:39:13Z fabian $
27 #include "clustal-omega.h"
28 #include "hhalign/general.h"
29 #include "clustal/hhalign_wrapper.h"
31 /* The following comment block contains the frontpage/mainpage of the doxygen
32 * documentation. Please add some more info. FIXME add more
37 * @mainpage Clustal-Omega Documentation
39 * @section intro_sec Introduction
41 * For more information see http://www.clustal.org/
43 * @section api_section API
45 * @subsection example_prog_subsection An Example Program
47 * To use libclustalo you will have to include the clustal-omega.h header and
48 * link against libclustalo. For linking against libclustalo you will have to
49 * use a C++ compiler, no matter if your program was written in C or C++. See
50 * below (\ref pkgconfig_subsubsec)) on how to figure out compiler flags with
53 * Assuming Clustal Omega was installed in system-wide default directory (e.g.
54 * /usr), first compile (don't link yet) your source (for example code see
55 * section \ref example_src_subsubsec) and then link against libclustalo:
58 * $ gcc -c -ansi -Wall clustalo-api-test.c
59 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o -lclustalo
62 * Voila! Now you have your own alignment program based on Clustal Omega which
66 * $ ./clustalo-api-test <your-sequence-input>
69 * It's best to use the same compiler that you used for compiling libclustal.
70 * If libclustal was compiled with OpenMP support, you will have to use OpenMP
71 * flags for you program as well.
74 * @subsubsection pkgconfig_subsubsec Using pkg-config / Figuring out compiler flags
76 * Clustal Omega comes with support for <a
77 * href="http://pkg-config.freedesktop.org">pkg-config</a>, which means you
81 * $ pkg-config --cflags --libs clustalo
84 * to figure out cflags and library flags needed to compile and link against
85 * libclustalo. This is especially handy if Clustal Omega was installed to a
86 * non-standard directory.
88 * You might have to change PKG_CONFIG_PATH. For example, if you used the prefix $HOME/local/ for
89 * installation then you will first need to set PKG_CONFIG_PATH:
92 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig
93 * $ pkg-config --cflags --libs clustalo
97 * To compile your source use as above but this time using proper flags:
100 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig
101 * $ gcc -c -ansi -Wall $(pkg-config --cflags clustalo) clustalo-api-test.c
102 * $ g++ -ansi -Wall -o clustalo-api-test $(pkg-config --libs clustalo) clustalo-api-test.o
106 * @subsubsection example_src_subsubsec Example Source Code
108 * @include "clustalo-api-test.c"
114 /* the following are temporary flags while the code is still under construction;
115 had problems internalising hhmake, so as temporary crutch
116 write alignment to file and get external hmmer/hhmake via system call
117 to read alignment and convert into HMM
118 All this will go, once hhmake is properly internalised */
119 #define INDIRECT_HMM 0 /* temp flag: (1) write aln to file, use system(hmmer/hhmake), (0) internal hhmake */
120 #define USEHMMER 1 /* temp flag: use system(hmmer) to build HMM */
121 #define USEHHMAKE (!USEHMMER) /* temp flag: use system(hhmake) to build HMM */
124 /* shuffle order of input sequences */
125 #define SHUFFLE_INPUT_SEQ_ORDER 0
127 /* sort input sequences by length */
128 #define SORT_INPUT_SEQS 0
131 int iNumberOfThreads;
133 /* broken, unused and lonely */
134 static const int ITERATION_SCORE_IMPROVEMENT_THRESHOLD = 0.01;
138 * @brief Print Long version information to pre-allocated char.
140 * @note short version
141 * information is equivalent to PACKAGE_VERSION
144 * char pointer to write to preallocated to hold iSize chars.
149 PrintLongVersion(char *pcStr, int iSize)
151 snprintf(pcStr, iSize, "version %s; code-name '%s'; build date %s",
152 PACKAGE_VERSION, PACKAGE_CODENAME, __DATE__);
154 /* end of PrintLongVersion() */
159 * @brief free aln opts members
163 FreeAlnOpts(opts_t *prAlnOpts) {
164 if (NULL != prAlnOpts->pcGuidetreeInfile) {
165 CKFREE(prAlnOpts->pcGuidetreeInfile);
167 if (NULL != prAlnOpts->pcGuidetreeOutfile) {
168 CKFREE(prAlnOpts->pcGuidetreeOutfile);
170 if (NULL != prAlnOpts->pcDistmatOutfile) {
171 CKFREE(prAlnOpts->pcDistmatOutfile);
173 if (NULL != prAlnOpts->pcDistmatInfile) {
174 CKFREE(prAlnOpts->pcDistmatInfile);
177 /* end of FreeAlnOpts() */
182 * @brief Sets members of given user opts struct to default values
185 * User opt struct to initialise
189 SetDefaultAlnOpts(opts_t *prOpts) {
190 prOpts->bAutoOptions = FALSE;
192 prOpts->pcDistmatInfile = NULL;
193 prOpts->pcDistmatOutfile = NULL;
195 prOpts->iClustersizes = 100; /* FS, r274 -> */
196 prOpts->iTransitivity = 0; /* FS, r290 -> */
197 prOpts->pcClustfile = NULL; /* FS, r274 -> */
198 prOpts->pcPosteriorfile = NULL; /* FS, r288 -> */
199 prOpts->iClusteringType = CLUSTERING_UPGMA;
200 prOpts->iPairDistType = PAIRDIST_KTUPLE;
201 prOpts->bUseMbed = TRUE; /* FS, r250 -> */
202 prOpts->bUseMbedForIteration = TRUE; /* FS, r250 -> */
203 prOpts->bPileup = FALSE; /* FS, r288 -> */
204 prOpts->pcGuidetreeOutfile = NULL;
205 prOpts->pcGuidetreeInfile = NULL;
206 prOpts->bPercID = FALSE;
208 prOpts->ppcHMMInput = NULL;
209 prOpts->iHMMInputFiles = 0;
210 prOpts->pcHMMBatch = NULL; /* FS, r291 -> */
212 prOpts->iNumIterations = 0;
213 prOpts->bIterationsAuto = FALSE;
214 prOpts->iMaxGuidetreeIterations = INT_MAX;
215 prOpts->iMaxHMMIterations = INT_MAX;
217 SetDefaultHhalignPara(& prOpts->rHhalignPara);
219 /* end of SetDefaultAlnOpts() */
224 * @brief Check logic of parsed user options. Will exit (call Log(&rLog, LOG_FATAL, ))
225 * on Fatal logic error
228 * Already parsed user options
232 AlnOptsLogicCheck(opts_t *prOpts)
234 /* guide-tree & distmat
237 if (prOpts->pcDistmatInfile && prOpts->pcGuidetreeInfile) {
238 Log(&rLog, LOG_FATAL, "Read distances *and* guide-tree from file doesn't make sense.");
241 if (prOpts->pcDistmatOutfile && prOpts->pcGuidetreeInfile) {
242 Log(&rLog, LOG_FATAL, "Won't be able to save distances to file, because I got a guide-tree as input.");
245 /* combination of options that don't make sense when not iterating
247 if (prOpts->iNumIterations==0 && prOpts->bIterationsAuto != TRUE) {
249 if (prOpts->pcGuidetreeInfile && prOpts->pcGuidetreeOutfile) {
250 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and output which doesn't make sense when not iterating.");
253 if (prOpts->pcGuidetreeInfile && prOpts->bUseMbed > 0) {
254 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and was requested to cluster with mBed, which doesn't make sense when not iterating.");
258 AW: bUseMbedForIteration default since at least R252
259 if (prOpts->bUseMbedForIteration > 0) {
260 Log(&rLog, LOG_FATAL, "No iteration requested, but mbed for iteration was set. Paranoia exit.");
265 if (prOpts->rHhalignPara.iMacRamMB < 512) {
266 Log(&rLog, LOG_WARN, "Memory for MAC Algorithm quite low, Viterbi Algorithm may be triggered.");
267 if (prOpts->rHhalignPara.iMacRamMB < 1) {
268 Log(&rLog, LOG_WARN, "Viterbi Algorithm always turned on, increase MAC-RAM to turn on MAC.");
274 /* end of AlnOptsLogicCheck() */
281 PrintAlnOpts(FILE *prFile, opts_t *prOpts)
286 /* keep in same order as struct */
287 fprintf(prFile, "option: auto-options = %d\n", prOpts->bAutoOptions);
288 fprintf(prFile, "option: distmat-infile = %s\n",
289 NULL != prOpts->pcDistmatInfile? prOpts->pcDistmatInfile: "(null)");
290 fprintf(prFile, "option: distmat-outfile = %s\n",
291 NULL != prOpts->pcDistmatOutfile? prOpts->pcDistmatOutfile: "(null)");
292 fprintf(prFile, "option: clustering-type = %d\n", prOpts->iClusteringType);
293 fprintf(prFile, "option: pair-dist-type = %d\n", prOpts->iPairDistType);
294 fprintf(prFile, "option: use-mbed = %d\n", prOpts->bUseMbed);
295 fprintf(prFile, "option: use-mbed-for-iteration = %d\n", prOpts->bUseMbedForIteration);
296 fprintf(prFile, "option: pile-up = %d\n", prOpts->bPileup);
297 fprintf(prFile, "option: guidetree-outfile = %s\n",
298 NULL != prOpts->pcGuidetreeOutfile? prOpts->pcGuidetreeOutfile: "(null)");
299 fprintf(prFile, "option: guidetree-infile = %s\n",
300 NULL != prOpts->pcGuidetreeInfile? prOpts->pcGuidetreeInfile: "(null)");
301 for (iAux=0; iAux<prOpts->iHMMInputFiles; iAux++) {
302 fprintf(prFile, "option: hmm-input no %d = %s\n", iAux, prOpts->ppcHMMInput[iAux]);
304 fprintf(prFile, "option: hmm-input-files = %d\n", prOpts->iHMMInputFiles);
305 fprintf(prFile, "option: num-iterations = %d\n", prOpts->iNumIterations);
306 fprintf(prFile, "option: iterations-auto = %d\n", prOpts->bIterationsAuto);
307 fprintf(prFile, "option: max-hmm-iterations = %d\n", prOpts->iMaxHMMIterations);
308 fprintf(prFile, "option: max-guidetree-iterations = %d\n", prOpts->iMaxGuidetreeIterations);
309 fprintf(prFile, "option: iMacRamMB = %d\n", prOpts->rHhalignPara.iMacRamMB);
310 fprintf(prFile, "option: percent-id = %d\n", prOpts->bPercID);
311 fprintf(prFile, "option: use-kimura = %d\n", prOpts->bUseKimura);
312 fprintf(prFile, "option: clustering-out = %s\n", prOpts->pcClustfile);
313 fprintf(prFile, "option: posterior-out = %s\n", prOpts->pcPosteriorfile);
316 /* end of PrintAlnOpts() */
321 * @brief Returns major version of HMMER. Whichever hmmbuild version
322 * is found first in your PATH will be used
324 * @return -1 on error, major hmmer version otherwise
330 char zcHmmerTestCall[] = "hmmbuild -h";
332 int iMajorVersion = 0;
335 if (NULL == (fp = popen(zcHmmerTestCall, "r"))) {
336 Log(&rLog, LOG_ERROR, "Couldn't exec %s", zcHmmerTestCall);
339 while (fgets(zcLine, sizeof(zcLine), fp)) {
341 if ((pcLocate = strstr(zcLine, "HMMER "))) {
342 iMajorVersion = atoi(&pcLocate[6]);
348 return iMajorVersion;
350 /* end of HmmerVersion() */
355 * @brief Create a HHM file from aligned sequences
357 * @warning Should be eliminated in the future
358 * as building routine should not create intermediate files
362 * @param[in] pcHMMOut
363 * HMM output file name
365 * @return Non-zero on error
369 AlnToHHMFile(mseq_t *prMSeq, char *pcHMMOut)
371 char *tmp_aln = NULL;
374 assert(NULL!=prMSeq);
375 assert(NULL!=pcHMMOut);
377 if (FALSE == prMSeq->aligned) {
378 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM");
382 /* Convert alignment to a2m, and call hhmake
384 * can't be static templates, or mktemp fails (at least on os x
385 * (with a bus error))
387 * gcc says we should use mkstemp to avoid race conditions,
388 * but that returns a file descriptor, which is of no use to
391 /* NOTE: the following won't work on windows: missing /tmp/ */
392 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX");
393 if (NULL == mktemp(tmp_aln)) {
394 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename");
396 goto cleanup_and_return;
399 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_A2M, LINE_WRAP, FALSE)) {
400 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln);
402 goto cleanup_and_return;
405 if (HHMake_Wrapper(tmp_aln, pcHMMOut)){
406 Log(&rLog, LOG_ERROR, "Could not convert alignment %s into HHM", tmp_aln);
408 goto cleanup_and_return;
414 if (NULL != tmp_aln) {
415 if (FileExists(tmp_aln)) {
416 if (remove(tmp_aln)) {
417 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln);
425 } /* end of AlnToHHMFile() */
430 * @brief Create a HMM file from aligned sequences
432 * @warning Should be replaced in the future by some internal HMM
433 * building routine that does not call external programs
437 * @param[in] pcHMMOut
438 * HMM output file name
440 * @return Non-zero on error
445 AlnToHMMFile(mseq_t *prMSeq, const char *pcHMMOut)
447 char *tmp_aln = NULL;
448 char *tmp_hmm = NULL; /* only needed for hmmer3 to hmmer2 conversion */
450 int iHmmerVersion = 0;
453 assert(NULL!=prMSeq);
454 assert(NULL!=pcHMMOut);
456 if (FALSE == prMSeq->aligned) {
457 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM");
461 iHmmerVersion = HmmerVersion();
462 if (2 != iHmmerVersion && 3 != iHmmerVersion) {
463 Log(&rLog, LOG_ERROR, "Could not find suitable HMMER binaries");
467 /* Convert alignment to stockholm, call hmmbuild and then
468 * either hmmconvert (hmmer3) or hmmcalibrate (hmmer2)
470 * can't be static templates, or mktemp fails (at least on os x
471 * (with a bus error))
473 * gcc says we should use mkstemp to avoid race conditions,
474 * but that returns a file descriptor, which is of no use to
477 /* NOTE: the following won't work on windows: missing /tmp/ */
478 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX");
479 if (NULL == mktemp(tmp_aln)) {
480 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename");
482 goto cleanup_and_return;
485 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_STOCKHOLM, LINE_WRAP, FALSE)) {
486 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln);
488 goto cleanup_and_return;
491 if (2 == iHmmerVersion) {
492 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmcalibrate %s >/dev/null",
493 pcHMMOut, tmp_aln, pcHMMOut);
494 if (system(cmdbuf)) {
495 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf);
497 goto cleanup_and_return;
499 } else if (3 == iHmmerVersion) {
500 /* NOTE: the following won't work on windows: missing /tmp/ */
501 tmp_hmm = CkStrdup("/tmp/clustalo_tmphmm2_XXXXXX");
502 if (NULL == mktemp(tmp_hmm)) {
503 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename");
505 goto cleanup_and_return;
507 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmconvert -2 %s > %s",
508 tmp_hmm, tmp_aln, tmp_hmm, pcHMMOut);
509 if (system(cmdbuf)) {
510 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf);
512 goto cleanup_and_return;
516 Log(&rLog, LOG_FATAL, "Internal error: Unknown Hmmer version %d", iHmmerVersion);
522 if (NULL != tmp_aln) {
523 if (FileExists(tmp_aln)) {
524 if (remove(tmp_aln)) {
525 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln);
530 if (NULL != tmp_hmm) {
531 if (FileExists(tmp_hmm)) {
532 if (remove(tmp_hmm)) {
533 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_hmm);
541 /* end of AlnToHMMFile() */
546 * @brief Convert a multiple sequence structure into a HMM
549 * Pointer to preallocted HMM which will be set here
551 * Pointer to an alignment
553 * @return 0 on error, non-0 otherwise
555 * @see AlnToHMMFile()
559 AlnToHMM(hmm_light *prHMM, mseq_t *prMSeq)
561 char *pcHMM; /* temp hmm file */
564 "Using HMMER version %d to calculate a new HMM.",
566 /* FIXME replace all this with internal HMM computation (HHmake) */
569 * @warning the following probably won't work on windows: missing
570 * /tmp/. Should be ok on Cygwin though
572 pcHMM = CkStrdup("/tmp/clustalo-hmm-iter_XXXXXX");
573 if (NULL == mktemp(pcHMM)) {
574 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename");
579 /* Create a HMM representing the current alignment
582 if (AlnToHMMFile(prMSeq, pcHMM)) {
583 Log(&rLog, LOG_ERROR, "AlnToHMMFile() on %s failed.", pcHMM);
588 if (AlnToHHMFile(prMSeq, pcHMM)) {
589 Log(&rLog, LOG_ERROR, "AlnToHHMFile() on %s failed.", pcHMM);
593 /* Log(&rLog, LOG_FATAL, "Method to create HHM (HMM using hhmake) not installed yet"); */
595 Log(&rLog, LOG_FATAL, "Unknown method to create temporary HMM");
598 /* Read HMM information
600 if (OK != readHMMWrapper(prHMM, pcHMM)){
601 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMM);
607 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", pcHMM);
613 /* end of AlnToHMM() */
622 InitClustalOmega(int iNumThreadsRequested)
626 iNumberOfThreads = iNumThreadsRequested;
627 omp_set_num_threads(iNumberOfThreads);
629 if (iNumThreadsRequested>1) {
630 Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.",
631 iNumThreadsRequested, PACKAGE_NAME);
633 iNumberOfThreads = 1; /* need to set this, even if build without support */
636 Log(&rLog, LOG_INFO, "Using %d threads",
640 /* end of InitClustalOmega() */
645 * @brief Defines an alignment order, which adds sequences
646 * sequentially, i.e. one at a time starting with seq 1 & 2
648 * @param[out] piOrderLR_p
649 * order in which nodes/profiles are to be merged/aligned
651 * Number of sequences
653 * @see TraverseTree()
657 SequentialAlignmentOrder(int **piOrderLR_p, int iNumSeq)
659 unsigned int uNodes = iNumSeq*2-1;
660 unsigned int uNodeCounter = 0;
661 unsigned int uSeqCounter = 0;
663 Log(&rLog, LOG_FATAL, "FIXME: Untested...");
665 (*piOrderLR_p) = (int *)CKCALLOC(DIFF_NODE * uNodes, sizeof(int));
666 /* loop over merge nodes, which have per definition even indices
667 * and set up children which have odd indices
670 for (uNodeCounter=iNumSeq; uNodeCounter<uNodes; uNodeCounter+=1) {
671 unsigned int uLeftChildNodeIndex = uNodeCounter-1;
672 unsigned int uRightChildNodeIndex = uNodeCounter-iNumSeq+1;
673 unsigned int uParentNodeIndex = uNodeCounter+1;
675 /* merge node setup */
676 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = uLeftChildNodeIndex;
677 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE] = uRightChildNodeIndex;
678 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE] = uParentNodeIndex;
679 /* only setup left child if at first merge node, all other left childs
680 * should be merge nodes that are already set up. also correct
681 * left node number here.
683 if (uNodeCounter==iNumSeq) {
684 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = 0;
686 (*piOrderLR_p)[0+LEFT_NODE] = 0;
687 (*piOrderLR_p)[0+RGHT_NODE] = 0;
688 (*piOrderLR_p)[0+PRNT_NODE] = uNodeCounter;
691 Log(&rLog, LOG_FORCED_DEBUG, "Set up first leaf with node counter %d: left=%d right=%d parent=%d",
693 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+LEFT_NODE],
694 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+RGHT_NODE],
695 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+PRNT_NODE]);
697 Log(&rLog, LOG_FORCED_DEBUG, "Set up merge node with node counter %d: left=%d right=%d parent=%d",
698 uNodeCounter, (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE],
699 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE],
700 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE]);
703 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE] = uSeqCounter;
704 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE] = uSeqCounter;
705 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE] = uNodeCounter;
708 Log(&rLog, LOG_FORCED_DEBUG, "Set up leaf with node counter %d: left=%d right=%d parent=%d",
709 uRightChildNodeIndex, (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE],
710 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE],
711 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE]);
714 /* end of SequentialAlignmentOrder() */
719 * @brief Defines the alignment order by calculating a guide tree. In
720 * a first-step pairwise distances will be calculated (or read from a
721 * file). In a second step those distances will be clustered and a
722 * guide-tree created. Steps 1 and 2 will be skipped if a guide-tree
723 * file was given, in which case the guide-tree will be just read from
726 * @param[out] piOrderLR_p
727 * order in which nodes/profiles are to be merged/aligned
728 * @param[out] pdSeqWeights_p
730 * @param[out] pdSeqWeights_p
733 * The sequences from which the alignment order is to be calculated
734 * @param[in] iPairDistType
735 * Method of pairwise distance comparison
736 * @param[in] pcDistmatInfile
737 * If not NULL distances will be read from this file instead of being
739 * @param[in] pcDistmatOutfile
740 * If not NULL computed pairwise distances will be written to this file
741 * @param[in] iClusteringType
742 * Clustering method to be used to cluster the pairwise distances
743 * @param[in] pcGuidetreeInfile
744 * If not NULL guidetree will be read from this file. Skips pairwise
745 * distance and guidetree computation
746 * @param[in] pcGuidetreeOutfile
747 * If not NULL computed guidetree will be written to this file
748 * @param[in] bUseMbed
749 * If TRUE, fast mBed guidetree computation will be employed
751 * @return Non-zero on error
755 AlignmentOrder(int **piOrderLR_p, double **pdSeqWeights_p, mseq_t *prMSeq,
756 int iPairDistType, char *pcDistmatInfile, char *pcDistmatOutfile,
757 int iClusteringType, int iClustersizes,
758 char *pcGuidetreeInfile, char *pcGuidetreeOutfile, char *pcClusterFile,
759 bool bUseMbed, bool bPercID)
761 /* pairwise distance matrix (tmat in 1.83) */
762 symmatrix_t *distmat = NULL;
764 tree_t *prTree = NULL;
768 /* Shortcut for only two sequences: Do not compute k-tuple
769 * distances. Use the same logic as in TraverseTree() to setup
770 * piOrderLR_p. Changes there will have to be reflected here as
772 if (2==prMSeq->nseqs) {
773 Log(&rLog, LOG_VERBOSE,
774 "Have only two sequences: No need to compute pairwise score and compute a tree.");
776 if (NULL != pcDistmatOutfile){
777 Log(&rLog, LOG_WARN, "Have only two sequences: Will not calculate/print distance matrix.");
780 (*piOrderLR_p) = (int*) CKMALLOC(DIFF_NODE * 3 * sizeof(int));
781 (*piOrderLR_p)[DIFF_NODE*0+LEFT_NODE] = 0;
782 (*piOrderLR_p)[DIFF_NODE*0+RGHT_NODE] = 0;
783 (*piOrderLR_p)[DIFF_NODE*0+PRNT_NODE] = 0;
785 (*piOrderLR_p)[DIFF_NODE*1+LEFT_NODE] = 1;
786 (*piOrderLR_p)[DIFF_NODE*1+RGHT_NODE] = 1;
787 (*piOrderLR_p)[DIFF_NODE*1+PRNT_NODE] = 1;
790 (*piOrderLR_p)[DIFF_NODE*2+LEFT_NODE] = 0;
791 (*piOrderLR_p)[DIFF_NODE*2+RGHT_NODE] = 1;
792 (*piOrderLR_p)[DIFF_NODE*2+PRNT_NODE] = 2;
794 /* Same logic as CalcClustalWeights(). Changes there will
795 have to be reflected here as well. */
797 (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double));
798 (*pdWeights_p)[0] = 0.5;
799 (*pdWeights_p)[1] = 0.5;
806 /* compute distance & guide tree, alternatively read distances or
807 * guide tree from file
810 if (NULL != pcGuidetreeInfile) {
811 Log(&rLog, LOG_INFO, "Reading guide-tree from %s", pcGuidetreeInfile);
812 if (GuideTreeFromFile(&prTree, prMSeq, pcGuidetreeInfile)) {
813 Log(&rLog, LOG_ERROR, "Reading of guide tree %s failed.", pcGuidetreeInfile);
820 if (NULL != pcDistmatInfile) {
821 Log(&rLog, LOG_ERROR, "Can't input distance matrix when in mbed mode.");
824 if (Mbed(&prTree, prMSeq, iPairDistType, pcGuidetreeOutfile, iClustersizes, pcClusterFile)) {
825 Log(&rLog, LOG_ERROR, "mbed execution failed.");
828 Log(&rLog, LOG_INFO, "Guide-tree computation (mBed) done.");
829 if (NULL != pcDistmatOutfile) {
831 "Ignoring request to write distance matrix (am in mBed mode)");
835 if (PairDistances(&distmat, prMSeq, iPairDistType, bPercID,
836 0, prMSeq->nseqs, 0, prMSeq->nseqs,
837 pcDistmatInfile, pcDistmatOutfile)) {
838 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
842 /* clustering of distances to get guide tree
844 if (CLUSTERING_UPGMA == iClusteringType) {
846 labels = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char*));
847 for (i=0; i<prMSeq->nseqs; i++) {
848 labels[i] = prMSeq->sqinfo[i].name;
851 GuideTreeUpgma(&prTree, labels, distmat, pcGuidetreeOutfile);
852 Log(&rLog, LOG_INFO, "Guide-tree computation done.");
856 Log(&rLog, LOG_FATAL, "INTERNAL ERROR %s",
857 "clustering method should have been checked before");
859 } /* did not use mBed */
860 } /* had to calculate tree (did not read from file) */
864 /* derive sequence weights from tree
867 Log(&rLog, LOG_INFO, "Calculating sequence weights");
868 CalcClustalWeights(pdSeqWeights_p, prTree);
869 for (i = 0; i < GetLeafCount(prTree); i++) {
870 Log(&rLog, LOG_VERBOSE,
871 "Weight for seq no %d: %s = %f",
872 i, prMSeq->sqinfo[i].name, (*pdSeqWeights_p)[i]);
875 Log(&rLog, LOG_DEBUG, "Not using weights");
879 /* define traversing order of tree
882 TraverseTree(piOrderLR_p, prTree, prMSeq);
883 if (rLog.iLogLevelEnabled <= LOG_DEBUG) {
884 /* FIXME: debug only, FS */
886 FILE *fp = LogGetFP(&rLog, LOG_INFO);
887 Log(&rLog, LOG_DEBUG, "left/right order after tree traversal");
888 for (uNodeIndex = 0; uNodeIndex < GetNodeCount(prTree); uNodeIndex++) {
889 fprintf(fp, "%3d:\t%2d/%2d -> %d\n", i,
890 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+LEFT_NODE],
891 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+RGHT_NODE],
892 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+PRNT_NODE]);
896 FreeMuscleTree(prTree);
897 FreeSymMatrix(&distmat);
900 Log(&rLog, LOG_FATAL, "DEBUG EXIT before leaving %s", __FUNCTION__);
904 /* end of AlignmentOrder() */
909 * @brief Set some options automatically based on number of sequences. Might
910 * overwrite some user-set options.
913 * Pointer to alignment options structure
915 * Number of sequences to align
918 SetAutoOptions(opts_t *prOpts, int iNumSeq) {
921 "Setting options automatically based on input sequence characteristics (might overwrite some of your options).");
923 /* AW: new version of mbed is always good (uses subclusters) */
924 if (FALSE == prOpts->bUseMbed) {
925 Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed.");
926 prOpts->bUseMbed = TRUE;
929 if (iNumSeq >= 1000) {
930 if (0 != prOpts->iNumIterations) {
931 Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations.");
932 prOpts->iNumIterations = 0;
935 } else if (iNumSeq < 1000) {
936 if (1 != prOpts->iNumIterations) {
937 Log(&rLog, LOG_INFO, "Auto settings: Setting iteration to 1.");
938 prOpts->iNumIterations = 1;
947 * @brief The main alignment function which wraps everything else.
950 * *the* multiple sequences structure
951 * @param[in] prMSeqProfile
952 * optional profile to align against
954 * alignment options to use
956 * @return 0 on success, -1 on failure
960 Align(mseq_t *prMSeq,
961 mseq_t *prMSeqProfile,
962 opts_t *prOpts) { /* Note DEVEL 291: at this stage pppcHMMBNames is set but ppiHMMBindex is not */
966 /* structs with pseudocounts etc; one for each HMM infile, i.e.
967 * index range: 0..iHMMInputFiles */
968 hmm_light *prHMMs = NULL;
970 /* MSA order in which nodes/profiles are to be merged/aligned
971 (order of nodes in guide tree (left/right)*/
972 int *piOrderLR = NULL;
974 /* weights per sequence */
975 double *pdSeqWeights = NULL;
979 int iIterationCounter = 0;
981 /* last dAlnScore for iteration */
982 double dLastAlnScore = -666.666;
985 char **ppcHMMbatch = NULL; /* names of unique HMM files */
986 int iHMMbatch = 0; /* number of unique HMM files */
990 assert(NULL != prMSeq);
991 if (NULL != prMSeqProfile) {
992 assert(TRUE == prMSeqProfile->aligned);
996 /* automatic setting of options
999 if (prOpts->bAutoOptions) {
1000 SetAutoOptions(prOpts, prMSeq->nseqs);
1004 #if SHUFFLE_INPUT_SEQ_ORDER
1006 * shuffle input: only useful for testing/debugging
1008 Log(&rLog, LOG_WARN, "Shuffling input sequences! (Will also change output order)");
1009 ShuffleMSeq(prMSeq);
1017 * would ensure we *always* (unless we get into the mbed k-means stage)
1018 * get the same answer. usually you don't, because most pairwise alignment
1019 * scores are in theory not symmetric, therefore sequence ordering might
1020 * have an effect on the guide-tree. Sorting by length should get rid of
1021 * this (and takes no time even for 100k seqs). Benchmark results on
1022 * Balibase show almost no difference after sorting.
1024 Log(&rLog, LOG_WARN, "Sorting input seq by length! This will also change the output order");
1025 SortMSeqByLength(prMSeq, 'd');
1029 if (TRUE == prOpts->bPileup){
1030 PileUp(prMSeq, prOpts->rHhalignPara, prOpts->iClustersizes);
1035 /* Read backgrounds HMMs and store in prHMMs (Devel 291)
1038 if (NULL != prOpts->pcHMMBatch){
1041 for (i = 0; i < prMSeq->nseqs; i++){
1043 if (NULL != prMSeq->pppcHMMBNames[i]){
1044 for (j = 0; NULL != prMSeq->pppcHMMBNames[i][j]; j++){
1046 for (k = 0; k < iHMMbatch; k++){
1047 if (0 == strcmp(ppcHMMbatch[k], prMSeq->pppcHMMBNames[i][j])){
1048 prMSeq->ppiHMMBindex[i][j] = k;
1049 break; /* HMM already registered */
1051 } /* went through HMM batch files already identified */
1052 if (k == iHMMbatch){
1054 if (NULL == (pfHMM = fopen(prMSeq->pppcHMMBNames[i][j], "r"))){
1055 prMSeq->ppiHMMBindex[i][j] = -1;
1056 Log(&rLog, LOG_WARN, "Background HMM %s for %s (%d/%d) does not exist",
1057 prMSeq->pppcHMMBNames[i][j], prMSeq->sqinfo[i].name, i, j);
1060 fclose(pfHMM); pfHMM = NULL;
1061 ppcHMMbatch = (char **)realloc(ppcHMMbatch, (iHMMbatch+1)*sizeof(char *));
1062 ppcHMMbatch[iHMMbatch] = strdup(prMSeq->pppcHMMBNames[i][j]);
1063 prMSeq->ppiHMMBindex[i][j] = k;
1068 } /* j = 0; NULL != prMSeq->pppcHMMBNames[i][j] */
1069 } /* NULL != prMSeq->pppcHMMBNames[i] */
1073 } /* 0 <= i < prMSeq->nseqs */
1075 } /* there was a HMM batch file */
1078 if (0 < prOpts->iHMMInputFiles) {
1079 int iHMMInfileIndex;
1082 * @warning old structure used to be initialised like this:
1083 * hmm_light rHMM = {0};
1085 prHMMs = (hmm_light *) CKMALLOC( (prOpts->iHMMInputFiles) * sizeof(hmm_light));
1087 for (iHMMInfileIndex=0; iHMMInfileIndex<prOpts->iHMMInputFiles; iHMMInfileIndex++) {
1088 char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex];
1089 if (OK != readHMMWrapper(&prHMMs[iHMMInfileIndex], pcHMMInput)){
1090 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput);
1095 Log(&rLog, LOG_FORCED_DEBUG, "HMM length is %d", prHMMs[iHMMInfileIndex].L);
1096 Log(&rLog, LOG_FORCED_DEBUG, "n-display is %d", prHMMs[iHMMInfileIndex].n_display);
1097 for (i = 0; NULL != prHMMs[prOpts->iHMMInputFiles].seq[i]; i++){
1098 printf("seq[%d]: %s\n", i, prHMMs[iHMMInfileIndex].seq[i]);
1100 Log(&rLog, LOG_FORCED_DEBUG, "Neff_HMM is %f", prHMMs[iHMMInfileIndex].Neff_HMM);
1102 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1103 Log(&rLog, LOG_DEBUG, "print frequencies");
1104 for (i = 0; i < prHMMs[iHMMInfileIndex].L; i++){
1105 #define PRINT_TAIL 5
1106 if ( (PRINT_TAIL+1 == i) && (prHMMs[iHMMInfileIndex].L-PRINT_TAIL != i) ){
1109 if ( (i > PRINT_TAIL) && (i < prHMMs[iHMMInfileIndex].L-PRINT_TAIL) ){
1113 for (j = 0; j < 20; j++){
1114 printf("\t%1.3f", prHMMs[iHMMInfileIndex].f[i][j]);
1118 } /* debug print block */
1120 CKFREE(prOpts->ppcHMMInput[iHMMInfileIndex]);
1121 } /* for each background HMM file */
1122 CKFREE(prOpts->ppcHMMInput);
1123 } /* there were background HMM files */
1125 /** read HMMs specific to individual sequences
1130 prHMMs = (hmm_light *) realloc( prHMMs, (prOpts->iHMMInputFiles + iHMMbatch + 1) * sizeof(hmm_light));
1132 for (i = 0; i < iHMMbatch; i++){
1133 char *pcHMMInput = ppcHMMbatch[i];
1135 if (OK != readHMMWrapper(&prHMMs[i + prOpts->iHMMInputFiles], pcHMMInput)){
1136 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput);
1140 } /* 0 <= i < iHMMbatch */
1142 } /* there were HMM batch files */
1145 /* If the input ("non-profile") sequences are aligned, then turn
1146 * the alignment into a HMM and add to the list of background HMMs
1149 if (TRUE == prMSeq->aligned) {
1150 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */
1151 hmm_light rHMMLocal = {0};
1153 Log(&rLog, LOG_INFO,
1154 "Input sequences are aligned. Will turn alignment into HMM and add it to the user provided background HMMs.");
1155 /* certain gap parameters ('~' MSF) cause problems,
1156 sanitise them; FS, r258 -> r259 */
1157 SanitiseUnknown(prMSeq);
1160 AlnToHMM(&rHMMLocal, prMSeq)
1162 AlnToHMM2(&rHMMLocal, prOpts->rHhalignPara, prMSeq->seq, prMSeq->nseqs)
1165 Log(&rLog, LOG_ERROR, "Couldn't convert aligned input sequences to HMM. Will try to continue");
1167 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light)));
1168 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light));
1169 prOpts->iHMMInputFiles++;
1174 /* If we have a profile turn it into a HMM and add to
1175 * the list of background HMMs.
1178 if (NULL != prMSeqProfile) {
1179 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */
1180 hmm_light rHMMLocal = {0};
1181 Log(&rLog, LOG_INFO,
1182 "Turning profile1 into HMM and will use it during progressive alignment.");
1185 AlnToHMM(&rHMMLocal, prMSeqProfile)
1187 AlnToHMM2(&rHMMLocal, prOpts->rHhalignPara, prMSeqProfile->seq, prMSeqProfile->nseqs)
1190 Log(&rLog, LOG_ERROR, "Couldn't convert profile1 to HMM. Will try to continue");
1192 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light)));
1193 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light));
1194 prOpts->iHMMInputFiles++;
1199 /* Now do a first alignment of the input sequences (prMSeq) adding
1200 * all collected background HMMs
1203 /* Determine progressive alignment order
1205 if (TRUE == prMSeq->aligned) {
1206 if ( (SEQTYPE_PROTEIN == prMSeq->seqtype) && (TRUE == prOpts->bUseKimura) ){
1207 Log(&rLog, LOG_INFO, "%s %s",
1208 "Input sequences are aligned.",
1209 "Will use Kimura distances of aligned sequences.");
1210 prOpts->iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1213 prOpts->iPairDistType = PAIRDIST_SQUIDID;
1218 Log(&rLog, LOG_WARN, "Using a sequential alignment order.");
1219 SequentialAlignmentOrder(&piOrderLR, prMSeq->nseqs);
1221 if (OK != AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq,
1222 prOpts->iPairDistType,
1223 prOpts->pcDistmatInfile, prOpts->pcDistmatOutfile,
1224 prOpts->iClusteringType, prOpts->iClustersizes,
1225 prOpts->pcGuidetreeInfile, prOpts->pcGuidetreeOutfile, prOpts->pcClustfile,
1226 prOpts->bUseMbed, prOpts->bPercID)) {
1227 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue");
1232 /* if max-hmm-iter is set < 0 then do not perform alignment
1233 * there is a problem/feature(?) that the un-aligned sequences are output
1235 if (prOpts->iMaxHMMIterations < 0){
1236 Log(&rLog, LOG_VERBOSE,
1237 "iMaxHMMIterations < 0 (%d), will not perform alignment", prOpts->iMaxHMMIterations);
1238 if (NULL != piOrderLR){
1239 free(piOrderLR); piOrderLR = NULL;
1245 /* Progressive alignment of input sequences. Order defined by
1246 * branching of guide tree (piOrderLR). Use optional
1247 * background HMM information (prHMMs[0..prOpts->iHMMInputFiles-1])
1250 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights,
1251 2*prMSeq->nseqs -1/* nodes */,
1252 prHMMs, prOpts->iHMMInputFiles, -1, prOpts->rHhalignPara);
1253 dLastAlnScore = dAlnScore;
1254 Log(&rLog, LOG_VERBOSE,
1255 "Alignment score for first alignment = %f", dAlnScore);
1260 /* ------------------------------------------------------------
1262 * prMSeq is aligned now. Now start iterations if requested and save the
1263 * alignment at the very end.
1265 * @note We discard the background HMM information at this point,
1266 * because it was already used. Could consider to make this choice
1269 * ------------------------------------------------------------ */
1272 /* iteration after first alignment was computed (if not profile-profile
1276 for (iIterationCounter=0;
1277 (iIterationCounter < prOpts->iNumIterations || prOpts->bIterationsAuto);
1278 iIterationCounter++) {
1280 hmm_light rHMMLocal = {0};
1281 /* FIXME Keep copy of old alignment in case new one sucks? */
1284 if (iIterationCounter >= prOpts->iMaxHMMIterations
1286 iIterationCounter >= prOpts->iMaxGuidetreeIterations) {
1287 Log(&rLog, LOG_VERBOSE, "Reached maximum number of HMM and guide-tree iterations");
1291 if (! prOpts->bIterationsAuto) {
1292 Log(&rLog, LOG_INFO, "Iteration step %d out of %d",
1293 iIterationCounter+1, prOpts->iNumIterations);
1295 Log(&rLog, LOG_INFO, "Iteration step %d out of <auto>",
1296 iIterationCounter+1);
1299 if (rLog.iLogLevelEnabled <= LOG_VERBOSE) {
1300 char zcIntermediate[1000] = {0};
1301 char *pcFormat = "fasta";
1302 sprintf(zcIntermediate, "clustalo-aln-iter~%d~", iIterationCounter);
1303 #define LINE_WRAP 60
1304 if (WriteAlignment(prMSeq, zcIntermediate, MSAFILE_A2M, LINE_WRAP)) {
1305 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", zcIntermediate);
1315 if (iIterationCounter < prOpts->iMaxGuidetreeIterations) {
1316 /* determine progressive alignment order
1318 * few things are different now when calling AlignmentOrder:
1319 * - we have to ignore prOpts->pcDistmatInfile and pcGuidetreeInfile
1320 * as they were used before
1321 * - the corresponding outfiles are still valid though
1323 /* Free stuff that has already been allocated by or further
1324 * downstream of AlignmentOrder()
1326 if (NULL != piOrderLR)
1328 if (NULL != pdSeqWeights)
1329 CKFREE(pdSeqWeights);
1330 Log(&rLog, LOG_INFO, "Computing new guide tree (iteration step %d)");
1331 if (AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq,
1332 ((SEQTYPE_PROTEIN == prMSeq->seqtype) && (TRUE == prOpts->bUseKimura)) ? PAIRDIST_SQUIDID_KIMURA : PAIRDIST_SQUIDID,
1333 NULL, prOpts->pcDistmatOutfile,
1334 prOpts->iClusteringType, prOpts->iClustersizes,
1335 NULL, prOpts->pcGuidetreeOutfile, prOpts->pcClustfile,
1336 prOpts->bUseMbedForIteration, prOpts->bPercID)) {
1337 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue");
1341 Log(&rLog, LOG_INFO, "Skipping guide-tree iteration at iteration step %d (reached maximum)",
1346 /* new local hmm iteration
1349 /* non-residue/gap characters will crash AlnToHMM2(),
1350 therefore sanitise unknown characters, FS, r259 -> r260 */
1351 SanitiseUnknown(prMSeq);
1352 if (iIterationCounter < prOpts->iMaxHMMIterations) {
1353 Log(&rLog, LOG_INFO, "Computing HMM from alignment");
1357 AlnToHMM(&rHMMLocal, prMSeq)
1359 AlnToHMM2(&rHMMLocal, prOpts->rHhalignPara, prMSeq->seq, prMSeq->nseqs)
1362 Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will stop iterating now...");
1366 Log(&rLog, LOG_INFO, "Skipping HMM iteration at iteration step %d (reached maximum)",
1371 /* align the sequences (again)
1373 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights,
1374 2*prMSeq->nseqs -1/* nodes */, &rHMMLocal, 1, -1,
1375 prOpts->rHhalignPara);
1376 Log(&rLog, LOG_VERBOSE,
1377 "Alignment score for alignmnent in hmm-iteration no %d = %f (last score = %f)",
1378 iIterationCounter+1, dAlnScore, dLastAlnScore);
1381 FreeHMMstruct(&rHMMLocal);
1384 /* FIXME: need a better score for automatic iteration */
1385 if (prOpts->bIterationsAuto) {
1386 /* automatic iteration: break if score improvement was not
1389 double dScoreImprovement = (dAlnScore-dLastAlnScore)/dLastAlnScore;
1390 if (dScoreImprovement < ITERATION_SCORE_IMPROVEMENT_THRESHOLD) {
1391 Log(&rLog, LOG_INFO,
1392 "Stopping after %d guide-tree iterations. No further alignment score improvement achieved.",
1393 iIterationCounter+1);
1394 /* use previous alignment */
1396 Log(&rLog, LOG_FORCED_DEBUG, "FIXME: %s", "CopyMSeq breaks things in this context");
1397 CopyMSeq(&prMSeq, prMSeqCopy);
1398 /* FIXME: prOpts->pcDistmatOutfile and pcGuidetreeOutfile
1399 * might have been updated, but then discarded here?
1403 Log(&rLog, LOG_INFO,
1404 "Got a %d%% better score in iteration step %d",
1405 (int)dScoreImprovement*100, iIterationCounter+1);
1406 FreeMSeq(&prMSeqCopy);
1409 dLastAlnScore = dAlnScore;
1413 /* end of iterations */
1417 /* Last step: if a profile was also provided then align now-aligned mseq
1420 * Don't use the backgrounds HMMs anymore and don't iterate.
1421 * (which was done before).
1424 if (NULL != prMSeqProfile) {
1425 if (AlignProfiles(prMSeq, prMSeqProfile, prOpts->rHhalignPara)) {
1426 Log(&rLog, LOG_ERROR, "An error occured during the profile/profile alignment");
1431 if (NULL != prOpts->pcPosteriorfile){
1433 hmm_light rHMMLocal = {0};
1437 AlnToHMM(&rHMMLocal, prMSeq)
1439 AlnToHMM2(&rHMMLocal, prOpts->rHhalignPara, prMSeq->seq, prMSeq->nseqs)
1442 Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will not do posterior probabilities...");
1444 PosteriorProbabilities(prMSeq, rHMMLocal, prOpts->rHhalignPara, prOpts->pcPosteriorfile);
1445 FreeHMMstruct(&rHMMLocal);
1448 if (NULL != piOrderLR) {
1451 if (NULL != pdSeqWeights) {
1452 CKFREE(pdSeqWeights);
1454 if (0 < prOpts->iHMMInputFiles) {
1455 for (i=0; i<prOpts->iHMMInputFiles; i++) {
1456 FreeHMMstruct(&prHMMs[i]);
1463 /* end of Align() */
1469 * @brief Align two profiles, ie two sets of prealigned sequences. Already
1470 * aligned columns won't be changed.
1472 * @param[out] prMSeqProfile1
1473 * First profile/aligned set of sequences. Merged alignment will be found in
1475 * @param[in] prMSeqProfile2
1476 * First profile/aligned set of sequences
1477 * @param[in] rHhalignPara
1480 * @return 0 on success, -1 on failure
1484 AlignProfiles(mseq_t *prMSeqProfile1,
1485 mseq_t *prMSeqProfile2, hhalign_para rHhalignPara) {
1489 /* number of seqs in first half of joined profile */
1490 int iProfProfSeparator = prMSeqProfile1->nseqs;
1492 assert(TRUE == prMSeqProfile1->aligned);
1493 assert(TRUE == prMSeqProfile2->aligned);
1495 Log(&rLog, LOG_INFO, "Performing profile/profile alignment");
1497 /* Combine the available mseqs into prMSeq
1498 * which will be aligned afterwards.
1500 JoinMSeqs(&prMSeqProfile1, prMSeqProfile2);
1503 /* set alignment flag explicitly to FALSE */
1504 prMSeqProfile1->aligned = FALSE;
1506 dAlnScore = HHalignWrapper(prMSeqProfile1,
1507 NULL, /* no order */
1508 NULL, /* no weights */
1509 3, /* nodes: root+2profiles */
1510 NULL, 0 /* no bg-hmms */,
1511 iProfProfSeparator, rHhalignPara);
1513 Log(&rLog, LOG_VERBOSE, "Alignment score is = %f", dAlnScore);
1517 /* end of AlignProfiles() */
1520 * @brief read pseudo-count 'fudge' parameters from file
1522 * @param[out] rHhalignPara_p
1523 * structure that holds several hhalign parameters
1524 * @param[in] pcPseudoFile
1525 * name of file with pseudo-count information.
1526 * format must be collection of pairs of lines where one line specifies name of parameter
1527 * (gapb,gapd,gape,gapf,gapg,gaph,gapi,pca,pcb,pcc,gapbV,gapdV,gapeV,gapfV,gapgV,gaphV,gapiV,pcaV,pcbV,pccV)
1528 * followed by second line with the (double) value of this parameter.
1530 * order of parameters is not fixed, not all parameters have to be set
1532 int ReadPseudoCountParams(hhalign_para *rHhalignPara_p, char *pcPseudoFile){
1536 char zcLine[1000] = {0};
1537 char zcFudge[1000] = {0};
1539 char *pcToken = NULL;
1542 if (NULL == (pfIn = fopen(pcPseudoFile, "r"))){
1543 Log(&rLog, LOG_FATAL, "File %s with pseudo-count parameters does not exist", pcPseudoFile);
1546 while (NULL != fgets(zcLine, 1000, pfIn)){
1548 if (NULL == (pcToken = strtok(zcLine, " \040\t\n"))){
1549 Log(&rLog, LOG_FATAL, "no token specifying pseudo-count parameter in file %s\n"
1553 strcpy(zcFudge, pcToken);
1555 if (NULL == fgets(zcLine, 1000, pfIn)){
1556 Log(&rLog, LOG_FATAL, "no line with parameter in file %s associated with %s\n"
1557 , pcPseudoFile, zcFudge
1561 if (NULL == (pcToken = strtok(zcLine, " \040\t\n"))){
1562 Log(&rLog, LOG_FATAL, "no token in 2nd line in file %s associated with %s\n"
1563 , pcPseudoFile, zcFudge
1567 dVal = (double)strtod(pcToken, &pcEnd);
1568 if ('\0' != *pcEnd){
1569 Log(&rLog, LOG_FATAL, "token in file %s associated with %s not valid double (%s)\n"
1570 , pcPseudoFile, zcFudge, pcToken
1576 printf("%s:%s:%d: read file %s: fudge = %s, val = %f\n"
1577 , __FUNCTION__, __FILE__, __LINE__
1578 , pcPseudoFile, zcFudge, dVal
1582 if (0 == strcmp(zcFudge, "gapb")){
1583 rHhalignPara_p->gapb = dVal;
1585 else if (0 == strcmp(zcFudge, "gapd")){
1586 rHhalignPara_p->gapd = dVal;
1588 else if (0 == strcmp(zcFudge, "gape")){
1589 rHhalignPara_p->gape = dVal;
1591 else if (0 == strcmp(zcFudge, "gapf")){
1592 rHhalignPara_p->gapf = dVal;
1594 else if (0 == strcmp(zcFudge, "gapg")){
1595 rHhalignPara_p->gapg = dVal;
1597 else if (0 == strcmp(zcFudge, "gaph")){
1598 rHhalignPara_p->gaph = dVal;
1600 else if (0 == strcmp(zcFudge, "gapi")){
1601 rHhalignPara_p->gapi = dVal;
1603 else if (0 == strcmp(zcFudge, "pca")){
1604 rHhalignPara_p->pca = dVal;
1606 else if (0 == strcmp(zcFudge, "pcb")){
1607 rHhalignPara_p->pcb = dVal;
1609 else if (0 == strcmp(zcFudge, "pcc")){
1610 rHhalignPara_p->pcc = dVal;
1612 else if (0 == strcmp(zcFudge, "gapbV")){
1613 rHhalignPara_p->gapbV = dVal;
1615 else if (0 == strcmp(zcFudge, "gapdV")){
1616 rHhalignPara_p->gapdV = dVal;
1618 else if (0 == strcmp(zcFudge, "gapeV")){
1619 rHhalignPara_p->gapeV = dVal;
1621 else if (0 == strcmp(zcFudge, "gapfV")){
1622 rHhalignPara_p->gapfV = dVal;
1624 else if (0 == strcmp(zcFudge, "gapgV")){
1625 rHhalignPara_p->gapgV = dVal;
1627 else if (0 == strcmp(zcFudge, "gaphV")){
1628 rHhalignPara_p->gaphV = dVal;
1630 else if (0 == strcmp(zcFudge, "gapiV")){
1631 rHhalignPara_p->gapiV = dVal;
1633 else if (0 == strcmp(zcFudge, "pcaV")){
1634 rHhalignPara_p->pcaV = dVal;
1636 else if (0 == strcmp(zcFudge, "pcbV")){
1637 rHhalignPara_p->pcbV = dVal;
1639 else if (0 == strcmp(zcFudge, "pccV")){
1640 rHhalignPara_p->pccV = dVal;
1643 Log(&rLog, LOG_FATAL, "%s not a valid pseudo-count parameter\n"
1644 "must be one of [gapb,gapd,gape,gapf,gapg,gaph,gapi,pca,pcb,pcc,gapbV,gapdV,gapeV,gapfV,gapgV,gaphV,gapiV,pcaV,pcbV,pccV]\n"
1646 } /* switched between parameters */
1649 fclose(pfIn); pfIn = NULL;
1651 } /* there was a parameter file */
1653 printf("%s:%s:%d: gapb=%g,gapd=%g,gape=%g,gapf=%g,gapg=%g,gaph=%g,gapi=%g,pca=%g,pcb=%g,pcc=%g\n"
1654 , __FUNCTION__, __FILE__, __LINE__
1655 , rHhalignPara_p->gapb, rHhalignPara_p->gapd, rHhalignPara_p->gape, rHhalignPara_p->gapf, rHhalignPara_p->gapg, rHhalignPara_p->gaph, rHhalignPara_p->gapi, rHhalignPara_p->pca, rHhalignPara_p->pcb, rHhalignPara_p->pcc
1659 printf("%s:%s:%d: gapbV=%g,gapdV=%g,gapeV=%g,gapfV=%g,gapgV=%g,gaphV=%g,gapiV=%g,pcaV=%g,pcbV=%g,pccV=%g\n"
1660 , __FUNCTION__, __FILE__, __LINE__
1661 , rHhalignPara_p->gapbV, rHhalignPara_p->gapdV, rHhalignPara_p->gapeV, rHhalignPara_p->gapfV, rHhalignPara_p->gapgV, rHhalignPara_p->gaphV, rHhalignPara_p->gapiV, rHhalignPara_p->pcaV, rHhalignPara_p->pcbV, rHhalignPara_p->pccV
1668 } /* this is the end of ReadPseudoCountParams() */
1672 * EOF clustal-omega.c