X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=binaries%2Fsrc%2Fclustalo%2Fsrc%2Fclustal-omega.c;fp=binaries%2Fsrc%2Fclustalo%2Fsrc%2Fclustal-omega.c;h=376e0781e996e89784434fabbe318068a2c38baa;hb=35947b9e0b6ae1be74f17d15cd6a65ceadc6d310;hp=0000000000000000000000000000000000000000;hpb=e7bf02d1c203bc8d88b9095e89d5ca6cce21a882;p=jabaws.git diff --git a/binaries/src/clustalo/src/clustal-omega.c b/binaries/src/clustalo/src/clustal-omega.c new file mode 100644 index 0000000..376e078 --- /dev/null +++ b/binaries/src/clustalo/src/clustal-omega.c @@ -0,0 +1,1385 @@ +/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ + +/********************************************************************* + * Clustal Omega - Multiple sequence alignment + * + * Copyright (C) 2010 University College Dublin + * + * Clustal-Omega is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is part of Clustal-Omega. + * + ********************************************************************/ + +/* + * RCS $Id: clustal-omega.c 254 2011-06-21 13:07:50Z andreas $ + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#include "clustal-omega.h" +#include "hhalign/general.h" + +/* The following comment block contains the frontpage/mainpage of the doxygen + * documentation. Please add some more info. FIXME add more + */ + +/** + * + * @mainpage Clustal-Omega Documentation + * + * @section intro_sec Introduction + * + * For more information see http://www.clustal.org/ + * + * @section api_section API + * + * @subsection example_prog_subsection An Example Program + * + * To use libclustalo you will have to include the clustal-omega.h header and + * link against libclustalo. For linking against libclustalo you will have to + * use a C++ compiler, no matter if your program was written in C or C++. + * + * First compile (no linking) your source (for an example see section "\ref + * example_src_subsubsec"): + * + * @code + * $ gcc -c -ansi -Wall clustalo-api-test.c + * @endcode + * + * Then link against libclustalo (we recommend the use of pkg-config as + * explained in \ref pkgconfig_subsubsec). Assuming Clustal Omega was installed + * in system-wide default directory (e.g. /usr) just type: + * + * @code + * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o -lclustalo + * @endcode + * + * Voila! Now you have your own alignment program which can be run with + * + * @code + * $ ./clustalo-api-test + * @endcode + * + * It's best to use the same compiler that you used for compiling libclustal. + * If libclustal was compiled with OpenMP support, you will have to use OpenMP + * flags for you program as well. + * + * + * @subsubsection pkgconfig_subsubsec Using pkg-config / Figuring out compiler flags + * + * Clustal Omega comes with support for pkg-config, which means you + * can run + * + * @code + * $ pkg-config --cflags --libs clustalo + * @endcode + * + * to figure out cflags and library flags needed to compile and link against + * libclustalo. This is especially handy if Clustal Omega was installed to a + * non-standard directory. + * + * You might have to change PKG_CONFIG_PATH. For example, if you used the prefix $HOME/local/ for + * installation then you will first need to set PKG_CONFIG_PATH: + * + * @code + * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig + * $ pkg-config --cflags --libs clustalo + * @endcode + * + * + * To compile your source use: + * + * @code + * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig + * $ gcc -c -ansi -Wall clustalo-api-test.c $(pkg-config --cflags clustalo) + * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o $(pkg-config --libs clustalo) + * @endcode + * + * + * @subsubsection example_src_subsubsec Example Source Code + * + * @include "clustalo-api-test.c" + * + * + */ + + + + +/* FIXME: doc */ +/* the following are temporary flags while the code is still under construction; + had problems internalising hhmake, so as temporary crutch + write alignment to file and get external hmmer/hhmake via system call + to read alignment and convert into HMM + All this will go, once hhmake is properly internalised */ +#define INDIRECT_HMM 0 /* temp flag: (1) write aln to file, use system(hmmer/hhmake), (0) internal hhmake */ +#define USEHMMER 1 /* temp flag: use system(hmmer) to build HMM */ +#define USEHHMAKE (!USEHMMER) /* temp flag: use system(hhmake) to build HMM */ + + +/* shuffle order of input sequences */ +#define SHUFFLE_INPUT_SEQ_ORDER 0 + +/* sort input sequences by length */ +#define SORT_INPUT_SEQS 0 + + +int iNumberOfThreads; + +/* broken, unused and lonely */ +static const int ITERATION_SCORE_IMPROVEMENT_THRESHOLD = 0.01; + + +/** + * @brief Print Long version information to pre-allocated char. + * + * @note short version + * information is equivalent to PACKAGE_VERSION + * + * @param[out] pcStr + * char pointer to write to preallocated to hold iSize chars. + * @param[in] iSize + * size of pcStr + */ +void +PrintLongVersion(char *pcStr, int iSize) +{ + snprintf(pcStr, iSize, "version %s; code-name '%s'; build date %s", + PACKAGE_VERSION, PACKAGE_CODENAME, __DATE__); +} +/* end of PrintLongVersion() */ + + + +/** + * @brief free aln opts members + * + */ +void +FreeAlnOpts(opts_t *prAlnOpts) { + if (NULL != prAlnOpts->pcGuidetreeInfile) { + CKFREE(prAlnOpts->pcGuidetreeInfile); + } + if (NULL != prAlnOpts->pcGuidetreeOutfile) { + CKFREE(prAlnOpts->pcGuidetreeOutfile); + } + if (NULL != prAlnOpts->pcDistmatOutfile) { + CKFREE(prAlnOpts->pcDistmatOutfile); + } + if (NULL != prAlnOpts->pcDistmatInfile) { + CKFREE(prAlnOpts->pcDistmatInfile); + } +} +/* end of FreeAlnOpts() */ + + + +/** + * @brief Sets members of given user opts struct to default values + * + * @param[out] prOpts + * User opt struct to initialise + * + */ +void +SetDefaultAlnOpts(opts_t *prOpts) { + prOpts->bAutoOptions = FALSE; + + prOpts->pcDistmatInfile = NULL; + prOpts->pcDistmatOutfile = NULL; + + prOpts->iClusteringType = CLUSTERING_UPGMA; + prOpts->iPairDistType = PAIRDIST_KTUPLE; + prOpts->bUseMbed = TRUE; /* FS, r250 -> */ + prOpts->bUseMbedForIteration = TRUE; /* FS, r250 -> */ + prOpts->pcGuidetreeOutfile = NULL; + prOpts->pcGuidetreeInfile = NULL; + + prOpts->ppcHMMInput = NULL; + prOpts->iHMMInputFiles = 0; + + prOpts->iNumIterations = 0; + prOpts->bIterationsAuto = FALSE; + prOpts->iMaxGuidetreeIterations = INT_MAX; + prOpts->iMaxHMMIterations = INT_MAX; + prOpts->iMacRam = 2048; /* give 2GB to MAC algorithm. FS, r240 -> r241 */ + } +/* end of SetDefaultAlnOpts() */ + + + +/** + * @brief Check logic of parsed user options. Will exit (call Log(&rLog, LOG_FATAL, )) + * on Fatal logic error + * + * @param[in] prOpts + * Already parsed user options + * + */ +void +AlnOptsLogicCheck(opts_t *prOpts) +{ + /* guide-tree & distmat + * + */ + if (prOpts->pcDistmatInfile && prOpts->pcGuidetreeInfile) { + Log(&rLog, LOG_FATAL, "Read distances *and* guide-tree from file doesn't make sense."); + } + + if (prOpts->pcDistmatOutfile && prOpts->pcGuidetreeInfile) { + Log(&rLog, LOG_FATAL, "Won't be able to save distances to file, because I got a guide-tree as input."); + } + + /* combination of options that don't make sense when not iterating + */ + if (prOpts->iNumIterations==0 && prOpts->bIterationsAuto != TRUE) { + + if (prOpts->pcGuidetreeInfile && prOpts->pcGuidetreeOutfile) { + Log(&rLog, LOG_FATAL, "Got a guide-tree as input and output which doesn't make sense when not iterating."); + } + /* + if (prOpts->pcGuidetreeInfile && prOpts->bUseMbed > 0) { + Log(&rLog, LOG_FATAL, "Got a guide-tree as input and was requested to cluster with mBed, which doesn't make sense when not iterating."); + } + */ + /* + AW: bUseMbedForIteration default since at least R252 + if (prOpts->bUseMbedForIteration > 0) { + Log(&rLog, LOG_FATAL, "No iteration requested, but mbed for iteration was set. Paranoia exit."); + } + */ + } + + if (prOpts->iMacRam < 512) { + + Log(&rLog, LOG_INFO, "Memory for MAC Algorithm quite low, Viterbi Algorithm may be triggered."); + + if (prOpts->iMacRam < 1) { + + Log(&rLog, LOG_WARN, "Viterbi Algorithm always turned on, increase MAC-RAM to turn on MAC."); + } + } + + return; +} +/* end of AlnOptsLogicCheck() */ + + +/** + * @brief FIXME doc + */ +void +PrintAlnOpts(FILE *prFile, opts_t *prOpts) +{ + int iAux; + + + /* keep in same order as struct */ + fprintf(prFile, "option: auto-options = %d\n", prOpts->bAutoOptions); + fprintf(prFile, "option: distmat-infile = %s\n", + NULL != prOpts->pcDistmatInfile? prOpts->pcDistmatInfile: "(null)"); + fprintf(prFile, "option: distmat-outfile = %s\n", + NULL != prOpts->pcDistmatOutfile? prOpts->pcDistmatOutfile: "(null)"); + fprintf(prFile, "option: clustering-type = %d\n", prOpts->iClusteringType); + fprintf(prFile, "option: pair-dist-type = %d\n", prOpts->iPairDistType); + fprintf(prFile, "option: use-mbed = %d\n", prOpts->bUseMbed); + fprintf(prFile, "option: use-mbed-for-iteration = %d\n", prOpts->bUseMbedForIteration); + fprintf(prFile, "option: guidetree-outfile = %s\n", + NULL != prOpts->pcGuidetreeOutfile? prOpts->pcGuidetreeOutfile: "(null)"); + fprintf(prFile, "option: guidetree-infile = %s\n", + NULL != prOpts->pcGuidetreeInfile? prOpts->pcGuidetreeInfile: "(null)"); + for (iAux=0; iAuxiHMMInputFiles; iAux++) { + fprintf(prFile, "option: hmm-input no %d = %s\n", iAux, prOpts->ppcHMMInput[iAux]); + } + fprintf(prFile, "option: hmm-input-files = %d\n", prOpts->iHMMInputFiles); + fprintf(prFile, "option: num-iterations = %d\n", prOpts->iNumIterations); + fprintf(prFile, "option: iterations-auto = %d\n", prOpts->bIterationsAuto); + fprintf(prFile, "option: max-hmm-iterations = %d\n", prOpts->iMaxHMMIterations); + fprintf(prFile, "option: max-guidetree-iterations = %d\n", prOpts->iMaxGuidetreeIterations); +} +/* end of PrintAlnOpts() */ + + + +/** + * @brief Returns major version of HMMER. Whichever hmmbuild version + * is found first in your PATH will be used + * + * @return -1 on error, major hmmer version otherwise + * + */ +int +HmmerVersion() +{ + char zcHmmerTestCall[] = "hmmbuild -h"; + FILE *fp = NULL; + int iMajorVersion = 0; + char zcLine[16384]; + + if (NULL == (fp = popen(zcHmmerTestCall, "r"))) { + Log(&rLog, LOG_ERROR, "Couldn't exec %s", zcHmmerTestCall); + return -1; + } + while (fgets(zcLine, sizeof(zcLine), fp)) { + char *pcLocate; + if ((pcLocate = strstr(zcLine, "HMMER "))) { + iMajorVersion = atoi(&pcLocate[6]); + break; + } + } + pclose(fp); + + return iMajorVersion; +} +/* end of HmmerVersion() */ + + + +/** + * @brief Create a HHM file from aligned sequences + * + * @warning Should be eliminated in the future + * as building routine should not create intermediate files + * + * @param[in] prMSeq + * Aligned mseq_t + * @param[in] pcHMMOut + * HMM output file name + * + * @return Non-zero on error + * + */ +int +AlnToHHMFile(mseq_t *prMSeq, char *pcHMMOut) +{ + char *tmp_aln = NULL; + int retcode = OK; + + assert(NULL!=prMSeq); + assert(NULL!=pcHMMOut); + + if (FALSE == prMSeq->aligned) { + Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM"); + return FAILURE; + } + + /* Convert alignment to a2m, and call hhmake + * + * can't be static templates, or mktemp fails (at least on os x + * (with a bus error)) + * + * gcc says we should use mkstemp to avoid race conditions, + * but that returns a file descriptor, which is of no use to + * us + */ + /* NOTE: the following won't work on windows: missing /tmp/ */ + tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX"); + if (NULL == mktemp(tmp_aln)) { + Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename"); + retcode = FAILURE; + goto cleanup_and_return; + } + if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_A2M)) { + Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln); + retcode = FAILURE; + goto cleanup_and_return; + } + + if (HHMake_Wrapper(tmp_aln, pcHMMOut)){ + Log(&rLog, LOG_ERROR, "Could not convert alignment %s into HHM", tmp_aln); + retcode = FAILURE; + goto cleanup_and_return; + } + + + cleanup_and_return: + + if (NULL != tmp_aln) { + if (FileExists(tmp_aln)) { + if (remove(tmp_aln)) { + Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln); + } + } + CKFREE(tmp_aln); + } + + return retcode; + +} /* end of AlnToHHMFile() */ + + + +/** + * @brief Create a HMM file from aligned sequences + * + * @warning Should be replaced in the future by some internal HMM + * building routine that does not call external programs + * + * @param[in] prMSeq + * Aligned mseq_t + * @param[in] pcHMMOut + * HMM output file name + * + * @return Non-zero on error + * + + */ +int +AlnToHMMFile(mseq_t *prMSeq, const char *pcHMMOut) +{ + char *tmp_aln = NULL; + char *tmp_hmm = NULL; /* only needed for hmmer3 to hmmer2 conversion */ + char cmdbuf[16384]; + int iHmmerVersion = 0; + int retcode = OK; + + assert(NULL!=prMSeq); + assert(NULL!=pcHMMOut); + + if (FALSE == prMSeq->aligned) { + Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM"); + return FAILURE; + } + + iHmmerVersion = HmmerVersion(); + if (2 != iHmmerVersion && 3 != iHmmerVersion) { + Log(&rLog, LOG_ERROR, "Could not find suitable HMMER binaries"); + return FAILURE; + } + + /* Convert alignment to stockholm, call hmmbuild and then + * either hmmconvert (hmmer3) or hmmcalibrate (hmmer2) + * + * can't be static templates, or mktemp fails (at least on os x + * (with a bus error)) + * + * gcc says we should use mkstemp to avoid race conditions, + * but that returns a file descriptor, which is of no use to + * us + */ + /* NOTE: the following won't work on windows: missing /tmp/ */ + tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX"); + if (NULL == mktemp(tmp_aln)) { + Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename"); + retcode = FAILURE; + goto cleanup_and_return; + } + if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_STOCKHOLM)) { + Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln); + retcode = FAILURE; + goto cleanup_and_return; + } + + if (2 == iHmmerVersion) { + sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmcalibrate %s >/dev/null", + pcHMMOut, tmp_aln, pcHMMOut); + if (system(cmdbuf)) { + Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf); + retcode = FAILURE; + goto cleanup_and_return; + } + } else if (3 == iHmmerVersion) { + /* NOTE: the following won't work on windows: missing /tmp/ */ + tmp_hmm = CkStrdup("/tmp/clustalo_tmphmm2_XXXXXX"); + if (NULL == mktemp(tmp_hmm)) { + Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename"); + retcode = FAILURE; + goto cleanup_and_return; + } + sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmconvert -2 %s > %s", + tmp_hmm, tmp_aln, tmp_hmm, pcHMMOut); + if (system(cmdbuf)) { + Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf); + retcode = FAILURE; + goto cleanup_and_return; + } + } else { + CKFREE(tmp_aln); + Log(&rLog, LOG_FATAL, "Internal error: Unknown Hmmer version %d", iHmmerVersion); + } + + + cleanup_and_return: + + if (NULL != tmp_aln) { + if (FileExists(tmp_aln)) { + if (remove(tmp_aln)) { + Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln); + } + } + CKFREE(tmp_aln); + } + if (NULL != tmp_hmm) { + if (FileExists(tmp_hmm)) { + if (remove(tmp_hmm)) { + Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_hmm); + } + } + CKFREE(tmp_hmm); + } + + return retcode; +} +/* end of AlnToHMMFile() */ + + + +/** + * @brief Convert a multiple sequence structure into a HMM + * + * @param[out] prHMM + * Pointer to preallocted HMM which will be set here + * @param[in] prMSeq + * Pointer to an alignment + * + * @return 0 on error, non-0 otherwise + * + * @see AlnToHMMFile() + * + */ +int +AlnToHMM(hmm_light *prHMM, mseq_t *prMSeq) +{ + char *pcHMM; /* temp hmm file */ + + Log(&rLog, LOG_INFO, + "Using HMMER version %d to calculate a new HMM.", + HmmerVersion()); + /* FIXME replace all this with internal HMM computation (HHmake) */ + + /** + * @warning the following probably won't work on windows: missing + * /tmp/. Should be ok on Cygwin though + */ + pcHMM = CkStrdup("/tmp/clustalo-hmm-iter_XXXXXX"); + if (NULL == mktemp(pcHMM)) { + Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename"); + CKFREE(pcHMM); + return FAILURE; + } + + /* Create a HMM representing the current alignment + */ +#if USEHMMER + if (AlnToHMMFile(prMSeq, pcHMM)) { + Log(&rLog, LOG_ERROR, "AlnToHMMFile() on %s failed.", pcHMM); + CKFREE(pcHMM); + return FAILURE; + } +#elif USEHHMAKE + if (AlnToHHMFile(prMSeq, pcHMM)) { + Log(&rLog, LOG_ERROR, "AlnToHHMFile() on %s failed.", pcHMM); + CKFREE(pcHMM); + return FAILURE; + } + /* Log(&rLog, LOG_FATAL, "Method to create HHM (HMM using hhmake) not installed yet"); */ +#else + Log(&rLog, LOG_FATAL, "Unknown method to create temporary HMM"); +#endif + + /* Read HMM information + */ + if (OK != readHMMWrapper(prHMM, pcHMM)){ + Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMM); + CKFREE(pcHMM); + return FAILURE; + } + + if (remove(pcHMM)) { + Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", pcHMM); + } + CKFREE(pcHMM); + + return OK; +} +/* end of AlnToHMM() */ + + + +/** + * @brief FIXME + * + */ +void +InitClustalOmega(int iNumThreadsRequested) +{ + +#ifdef HAVE_OPENMP + iNumberOfThreads = iNumThreadsRequested; + omp_set_num_threads(iNumberOfThreads); +#else + if (iNumThreadsRequested>1) { + Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.", + iNumThreadsRequested, PACKAGE_NAME); + } + iNumberOfThreads = 1; /* need to set this, even if build without support */ +#endif + + Log(&rLog, LOG_INFO, "Using %d threads", + iNumberOfThreads); + +} +/* end of InitClustalOmega() */ + + + +/** + * @brief Defines an alignment order, which adds sequences + * sequentially, i.e. one at a time starting with seq 1 & 2 + * + * @param[out] piOrderLR_p + * order in which nodes/profiles are to be merged/aligned + * @param[in] iNumSeq + * Number of sequences + * + * @see TraverseTree() + * + */ +void +SequentialAlignmentOrder(int **piOrderLR_p, int iNumSeq) +{ + unsigned int uNodes = iNumSeq*2-1; + unsigned int uNodeCounter = 0; + unsigned int uSeqCounter = 0; + + Log(&rLog, LOG_FATAL, "FIXME: Untested..."); + + (*piOrderLR_p) = (int *)CKCALLOC(DIFF_NODE * uNodes, sizeof(int)); + /* loop over merge nodes, which have per definition even indices + * and set up children which have odd indices + */ + uSeqCounter = 0; + for (uNodeCounter=iNumSeq; uNodeCounternseqs) { + Log(&rLog, LOG_VERBOSE, + "Have only two sequences: No need to compute pairwise score and compute a tree."); + + (*piOrderLR_p) = (int*) CKMALLOC(DIFF_NODE * 3 * sizeof(int)); + (*piOrderLR_p)[DIFF_NODE*0+LEFT_NODE] = 0; + (*piOrderLR_p)[DIFF_NODE*0+RGHT_NODE] = 0; + (*piOrderLR_p)[DIFF_NODE*0+PRNT_NODE] = 0; + + (*piOrderLR_p)[DIFF_NODE*1+LEFT_NODE] = 1; + (*piOrderLR_p)[DIFF_NODE*1+RGHT_NODE] = 1; + (*piOrderLR_p)[DIFF_NODE*1+PRNT_NODE] = 1; + + /* root */ + (*piOrderLR_p)[DIFF_NODE*2+LEFT_NODE] = 0; + (*piOrderLR_p)[DIFF_NODE*2+RGHT_NODE] = 1; + (*piOrderLR_p)[DIFF_NODE*2+PRNT_NODE] = 2; + + /* Same logic as CalcClustalWeights(). Changes there will + have to be reflected here as well. */ +#if USE_WEIGHTS + (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double)); + (*pdWeights_p)[0] = 0.5; + (*pdWeights_p)[1] = 0.5; +#endif + + return OK; + } + + + /* compute distance & guide tree, alternatively read distances or + * guide tree from file + * + */ + if (NULL != pcGuidetreeInfile) { + Log(&rLog, LOG_INFO, "Reading guide-tree from %s", pcGuidetreeInfile); + if (GuideTreeFromFile(&prTree, prMSeq, pcGuidetreeInfile)) { + Log(&rLog, LOG_ERROR, "Reading of guide tree %s failed.", pcGuidetreeInfile); + return FAILURE; + } + + } else { + + if (bUseMbed) { + if (Mbed(&prTree, prMSeq, iPairDistType, pcGuidetreeOutfile)) { + Log(&rLog, LOG_ERROR, "mbed execution failed."); + return FAILURE; + } + Log(&rLog, LOG_INFO, "Guide-tree computation (mBed) done."); + if (NULL != pcDistmatOutfile) { + Log(&rLog, LOG_INFO, + "Ignoring request to write distance matrix (am in mBed mode)"); + } + } else { + + if (PairDistances(&distmat, prMSeq, iPairDistType, + 0, prMSeq->nseqs, 0, prMSeq->nseqs, + pcDistmatInfile, pcDistmatOutfile)) { + Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); + return FAILURE; + } + + /* clustering of distances to get guide tree + */ + if (CLUSTERING_UPGMA == iClusteringType) { + char **labels; + labels = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char*)); + for (i=0; inseqs; i++) { + labels[i] = prMSeq->sqinfo[i].name; + } + + GuideTreeUpgma(&prTree, labels, distmat, pcGuidetreeOutfile); + Log(&rLog, LOG_INFO, "Guide-tree computation done."); + + CKFREE(labels); + } else { + Log(&rLog, LOG_FATAL, "INTERNAL ERROR %s", + "clustering method should have been checked before"); + } + } + } + +#if USE_WEIGHTS + /* derive sequence weights from tree + * + */ + Log(&rLog, LOG_INFO, "Calculating sequence weights"); + CalcClustalWeights(pdSeqWeights_p, prTree); + for (i = 0; i < GetLeafCount(prTree); i++) { + Log(&rLog, LOG_VERBOSE, + "Weight for seq no %d: %s = %f", + i, prMSeq->sqinfo[i].name, (*pdSeqWeights_p)[i]); + } +#else + Log(&rLog, LOG_DEBUG, "Not using weights"); +#endif + + + /* define traversing order of tree + * + */ + TraverseTree(piOrderLR_p, prTree, prMSeq); + if (rLog.iLogLevelEnabled <= LOG_DEBUG) { + /* FIXME: debug only, FS */ + uint uNodeIndex; + FILE *fp = LogGetFP(&rLog, LOG_INFO); + Log(&rLog, LOG_DEBUG, "left/right order after tree traversal"); + for (uNodeIndex = 0; uNodeIndex < GetNodeCount(prTree); uNodeIndex++) { + fprintf(fp, "%3d:\t%2d/%2d -> %d\n", i, + (*piOrderLR_p)[DIFF_NODE*uNodeIndex+LEFT_NODE], + (*piOrderLR_p)[DIFF_NODE*uNodeIndex+RGHT_NODE], + (*piOrderLR_p)[DIFF_NODE*uNodeIndex+PRNT_NODE]); + } + } + + FreeMuscleTree(prTree); + FreeSymMatrix(&distmat); + +#if 0 + Log(&rLog, LOG_FATAL, "DEBUG EXIT before leaving %s", __FUNCTION__); +#endif + return OK; +} +/* end of AlignmentOrder() */ + + + +/** + * @brief Set some options automatically based on number of sequences. Might + * overwrite some user-set options. + * + * @param[out] prOpts + * Pointer to alignment options structure + * @param[in] iNumSeq + * Number of sequences to align + */ +void +SetAutoOptions(opts_t *prOpts, int iNumSeq) { + + Log(&rLog, LOG_INFO, + "Setting options automatically based on input sequence characteristics (might overwrite some of your options)."); + + /* AW: new version of mbed is always good (uses subclusters) */ + if (FALSE == prOpts->bUseMbed) { + Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed."); + prOpts->bUseMbed = TRUE; + } + + if (iNumSeq >= 1000) { + if (0 != prOpts->iNumIterations) { + Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations."); + prOpts->iNumIterations = 0; + } + + } else if (iNumSeq < 1000) { + if (1 != prOpts->iNumIterations) { + Log(&rLog, LOG_INFO, "Auto settings: Setting iteration to 1."); + prOpts->iNumIterations = 1; + } + } +} +/* end of */ + + + +/** + * @brief The main alignment function which wraps everything else. + * + * @param[out] prMSeq + * *the* multiple sequences structure + * @param[in] prMSeqProfile + * optional profile to align against + * @param[in] prOpts + * alignmemnt options to use + * + * @return 0 on success, -1 on failure + * + */ +int +Align(mseq_t *prMSeq, + mseq_t *prMSeqProfile, + opts_t *prOpts, + hhalign_para rHhalignPara) { + + /* HMM + */ + /* structs with pseudocounts etc; one for each HMM infile, i.e. + * index range: 0..iHMMInputFiles */ + hmm_light *prHMMs = NULL; + + /* MSA order in which nodes/profiles are to be merged/aligned + (order of nodes in guide tree (left/right)*/ + int *piOrderLR = NULL; + + /* weights per sequence */ + double *pdSeqWeights = NULL; + + /* Iteration + */ + int iIterationCounter = 0; + double dAlnScore; + /* last dAlnScore for iteration */ + double dLastAlnScore = -666.666; + + int i, j; /* aux */ + + assert(NULL != prMSeq); + if (NULL != prMSeqProfile) { + assert(TRUE == prMSeqProfile->aligned); + } + + + /* automatic setting of options + * + */ + if (prOpts->bAutoOptions) { + SetAutoOptions(prOpts, prMSeq->nseqs); + } + + +#if SHUFFLE_INPUT_SEQ_ORDER + /* + * shuffle input: only useful for testing/debugging + */ + Log(&rLog, LOG_WARN, "Shuffling input sequences! (Will also change output order)"); + ShuffleMSeq(prMSeq); +#endif + + +#if SORT_INPUT_SEQS + /* + * sort input: + * + * would ensure we *always* (unless we get into the mbed k-means stage) + * get the same answer. usually you don't, because most pairwise alignment + * scores are in theory not symmetric, therefore sequence ordering might + * have an effect on the guide-tree. Sorting by length should get rid of + * this (and takes no time even for 100k seqs). Benchmark results on + * Balibase show almost no difference after sorting. + */ + Log(&rLog, LOG_WARN, "Sorting input seq by length! This will also change the output order"); + SortMSeqByLength(prMSeq, 'd'); + +#endif + + + /* Read backgrounds HMMs and store in prHMMs + * + */ + if (0 < prOpts->iHMMInputFiles) { + int iHMMInfileIndex; + + /** + * @warning old structure used to be initialised like this: + * hmm_light rHMM = {0}; + */ + prHMMs = (hmm_light *) CKMALLOC(prOpts->iHMMInputFiles * sizeof(hmm_light)); + + for (iHMMInfileIndex=0; iHMMInfileIndexiHMMInputFiles; iHMMInfileIndex++) { + char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex]; + if (OK != readHMMWrapper(&prHMMs[iHMMInfileIndex], pcHMMInput)){ + Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput); + return -1; + } + +#if 0 + Log(&rLog, LOG_FORCED_DEBUG, "HMM length is %d", prHMMs[iHMMInfileIndex].L); + Log(&rLog, LOG_FORCED_DEBUG, "n-display is %d", prHMMs[iHMMInfileIndex].n_display); + for (i = 0; NULL != prHMMs[prOpts->iHMMInputFiles].seq[i]; i++){ + printf("seq[%d]: %s\n", i, prHMMs[iHMMInfileIndex].seq[i]); + } + Log(&rLog, LOG_FORCED_DEBUG, "Neff_HMM is %f", prHMMs[iHMMInfileIndex].Neff_HMM); +#endif + if (rLog.iLogLevelEnabled <= LOG_DEBUG){ + Log(&rLog, LOG_DEBUG, "print frequencies"); + for (i = 0; i < prHMMs[iHMMInfileIndex].L; i++){ +#define PRINT_TAIL 5 + if ( (PRINT_TAIL+1 == i) && (prHMMs[iHMMInfileIndex].L-PRINT_TAIL != i) ){ + printf("....\n"); + } + if ( (i > PRINT_TAIL) && (i < prHMMs[iHMMInfileIndex].L-PRINT_TAIL) ){ + continue; + } + printf("%3d:", i); + for (j = 0; j < 20; j++){ + printf("\t%1.3f", prHMMs[iHMMInfileIndex].f[i][j]); + } + printf("\n"); + } + } /* debug print block */ + + CKFREE(prOpts->ppcHMMInput[iHMMInfileIndex]); + } /* for each background HMM file */ + CKFREE(prOpts->ppcHMMInput); + } /* there were background HMM files */ + + + + /* If the input ("non-profile") sequences are aligned, then turn + * the alignment into a HMM and add to the list of background HMMs + * + */ + if (TRUE == prMSeq->aligned) { + /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */ + hmm_light rHMMLocal = {0}; + + Log(&rLog, LOG_INFO, + "Input sequences are aligned. Will turn alignment into HMM and add it to the user provided background HMMs."); + if (OK != +#if INDIRECT_HMM + AlnToHMM(&rHMMLocal, prMSeq) +#else + AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs) +#endif + ) { + Log(&rLog, LOG_ERROR, "Couldn't convert aligned input sequences to HMM. Will try to continue"); + } else { + prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light))); + memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light)); + prOpts->iHMMInputFiles++; + } + } + + + /* If we have a profile turn it into a HMM and add to + * the list of background HMMs. + * + */ + if (NULL != prMSeqProfile) { + /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */ + hmm_light rHMMLocal = {0}; + Log(&rLog, LOG_INFO, + "Turning profile1 into HMM and will use it during progressive alignment."); + if (OK != +#if INDIRECT_HMM + AlnToHMM(&rHMMLocal, prMSeqProfile) +#else + AlnToHMM2(&rHMMLocal, prMSeqProfile->seq, prMSeqProfile->nseqs) +#endif + ) { + Log(&rLog, LOG_ERROR, "Couldn't convert profile1 to HMM. Will try to continue"); + } else { + prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light))); + memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light)); + prOpts->iHMMInputFiles++; + } + } + + + /* Now do a first alignment of the input sequences (prMSeq) adding + * all collected background HMMs + * + */ + /* Determine progressive alignment order + */ + if (TRUE == prMSeq->aligned) { + Log(&rLog, LOG_INFO, "%s %s", + "Input sequences are aligned.", + "Will use Kimura distances of aligned sequences."); + prOpts->iPairDistType = PAIRDIST_SQUIDID_KIMURA; + } + +#if 0 + Log(&rLog, LOG_WARN, "Using a sequential alignment order."); + SequentialAlignmentOrder(&piOrderLR, prMSeq->nseqs); +#else + if (OK != AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq, + prOpts->iPairDistType, + prOpts->pcDistmatInfile, prOpts->pcDistmatOutfile, + prOpts->iClusteringType, + prOpts->pcGuidetreeInfile, prOpts->pcGuidetreeOutfile, + prOpts->bUseMbed)) { + Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue"); + return -1; + } +#endif + + /* Progressive alignment of input sequences. Order defined by + * branching of guide tree (piOrderLR). Use optional + * background HMM information (prHMMs[0..prOpts->iHMMInputFiles-1]) + * + */ + dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights, + 2*prMSeq->nseqs -1/* nodes */, + prHMMs, prOpts->iHMMInputFiles, -1, rHhalignPara); + dLastAlnScore = dAlnScore; + Log(&rLog, LOG_VERBOSE, + "Alignment score for first alignment = %f", dAlnScore); + + + + + /* ------------------------------------------------------------ + * + * prMSeq is aligned now. Now start iterations if requested and save the + * alignment at the very end. + * + * @note We discard the background HMM information at this point, + * because it was already used. Could consider to make this choice + * optional. FIXME + * + * ------------------------------------------------------------ */ + + + /* iteration after first alignment was computed (if not profile-profile + * alignment) + * + */ + for (iIterationCounter=0; + (iIterationCounter < prOpts->iNumIterations || prOpts->bIterationsAuto); + iIterationCounter++) { + + hmm_light rHMMLocal = {0}; + /* FIXME Keep copy of old alignment in case new one sucks? */ + + + if (iIterationCounter >= prOpts->iMaxHMMIterations + && + iIterationCounter >= prOpts->iMaxGuidetreeIterations) { + Log(&rLog, LOG_VERBOSE, "Reached maximum number of HMM and guide-tree iterations"); + break; + } + + if (! prOpts->bIterationsAuto) { + Log(&rLog, LOG_INFO, "Iteration step %d out of %d", + iIterationCounter+1, prOpts->iNumIterations); + } else { + Log(&rLog, LOG_INFO, "Iteration step %d out of ", + iIterationCounter+1); + } +#if 0 + if (rLog.iLogLevelEnabled <= LOG_VERBOSE) { + char zcIntermediate[1000] = {0}; + char *pcFormat = "fasta"; + sprintf(zcIntermediate, "clustalo-aln-iter~%d~", iIterationCounter); + if (WriteAlignment(prMSeq, zcIntermediate, MSAFILE_A2M)) { + Log(&rLog, LOG_ERROR, "Could not save alignment to %s", zcIntermediate); + return -1; + } + } +#endif + + + /* new guide-tree + * + */ + if (iIterationCounter < prOpts->iMaxGuidetreeIterations) { + /* determine progressive alignment order + * + * few things are different now when calling AlignmentOrder: + * - we have to ignore prOpts->pcDistmatInfile and pcGuidetreeInfile + * as they were used before + * - the corresponding outfiles are still valid though + */ + /* Free stuff that has already been allocated by or further + * downstream of AlignmentOrder() + */ + if (NULL != piOrderLR) + CKFREE(piOrderLR); + if (NULL != pdSeqWeights) + CKFREE(pdSeqWeights); + if (AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq, + PAIRDIST_SQUIDID_KIMURA /* override */, NULL, prOpts->pcDistmatOutfile, + prOpts->iClusteringType, NULL, prOpts->pcGuidetreeOutfile, + prOpts->bUseMbedForIteration)) { + Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue"); + return -1; + } + } else { + Log(&rLog, LOG_INFO, "Skipping guide-tree iteration at iteration step %d (reached maximum)", + iIterationCounter); + } + + + /* new local hmm iteration + * + */ + if (iIterationCounter < prOpts->iMaxHMMIterations) { + if (OK != +#if INDIRECT_HMM + AlnToHMM(&rHMMLocal, prMSeq) +#else + AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs) +#endif + ) { + Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will stop iterating now..."); + break; + } + } else { + Log(&rLog, LOG_INFO, "Skipping HMM iteration at iteration step %d (reached maximum)", + iIterationCounter); + } + + + /* align the sequences (again) + */ + dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights, + 2*prMSeq->nseqs -1/* nodes */, &rHMMLocal, 1, -1, rHhalignPara); + Log(&rLog, LOG_VERBOSE, + "Alignment score for alignmnent in hmm-iteration no %d = %f (last score = %f)", + iIterationCounter+1, dAlnScore, dLastAlnScore); + + + FreeHMMstruct(&rHMMLocal); + +#if 0 + /* FIXME: need a better score for automatic iteration */ + if (prOpts->bIterationsAuto) { + /* automatic iteration: break if score improvement was not + * big enough + */ + double dScoreImprovement = (dAlnScore-dLastAlnScore)/dLastAlnScore; + if (dScoreImprovement < ITERATION_SCORE_IMPROVEMENT_THRESHOLD) { + Log(&rLog, LOG_INFO, + "Stopping after %d guide-tree iterations. No further alignment score improvement achieved.", + iIterationCounter+1); + /* use previous alignment */ + FreeMSeq(&prMSeq); + Log(&rLog, LOG_FORCED_DEBUG, "FIXME: %s", "CopyMSeq breaks things in this context"); + CopyMSeq(&prMSeq, prMSeqCopy); + /* FIXME: prOpts->pcDistmatOutfile and pcGuidetreeOutfile + * might have been updated, but then discarded here? + */ + break; + } else { + Log(&rLog, LOG_INFO, + "Got a %d%% better score in iteration step %d", + (int)dScoreImprovement*100, iIterationCounter+1); + FreeMSeq(&prMSeqCopy); + } + } + dLastAlnScore = dAlnScore; +#endif + + } + /* end of iterations */ + + + + /* Last step: if a profile was also provided then align now-aligned mseq + * with this profile + * + * Don't use the backgrounds HMMs anymore and don't iterate. + * (which was done before). + * + */ + if (NULL != prMSeqProfile) { + if (AlignProfiles(prMSeq, prMSeqProfile, rHhalignPara)) { + Log(&rLog, LOG_ERROR, "An error occured during the profile/profile alignment"); + return -1; + } + } + + + if (NULL != piOrderLR) { + CKFREE(piOrderLR); + } + if (NULL != pdSeqWeights) { + CKFREE(pdSeqWeights); + } + if (0 < prOpts->iHMMInputFiles) { + for (i=0; iiHMMInputFiles; i++) { + FreeHMMstruct(&prHMMs[i]); + } + CKFREE(prHMMs); + } + + return 0; +} +/* end of Align() */ + + + + +/** + * @brief Align two profiles, ie two sets of prealigned sequences. Already + * aligned columns won't be changed. + * + * @param[out] prMSeqProfile1 + * First profile/aligned set of sequences. Merged alignment will be found in + * here. + * @param[in] prMSeqProfile2 + * First profile/aligned set of sequences + * + * @return 0 on success, -1 on failure + * + */ +int +AlignProfiles(mseq_t *prMSeqProfile1, + mseq_t *prMSeqProfile2, hhalign_para rHhalignPara) { + + double dAlnScore; + + /* number of seqs in first half of joined profile */ + int iProfProfSeparator = prMSeqProfile1->nseqs; + + assert(TRUE == prMSeqProfile1->aligned); + assert(TRUE == prMSeqProfile2->aligned); + + Log(&rLog, LOG_INFO, "Performing profile/profile alignment"); + + /* Combine the available mseqs into prMSeq + * which will be aligned afterwards. + */ + JoinMSeqs(&prMSeqProfile1, prMSeqProfile2); + + + /* set alignment flag explicitly to FALSE */ + prMSeqProfile1->aligned = FALSE; + + dAlnScore = HHalignWrapper(prMSeqProfile1, + NULL, /* no order */ + NULL, /* no weights */ + 3, /* nodes: root+2profiles */ + NULL, 0 /* no bg-hmms */, + iProfProfSeparator, rHhalignPara); + + Log(&rLog, LOG_VERBOSE, "Alignment score is = %f", dAlnScore); + + return 0; +} +/* end of AlignProfiles() */ + +