From: pvtroshin Date: Mon, 20 Jun 2011 16:37:41 +0000 (+0000) Subject: new version of muscle 3.8.31 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=b67f9b9dd77fa8f57d515842b8382b88d85cc029;p=jabaws.git new version of muscle 3.8.31 binaries for various systems, sources, help file and configuration is updated git-svn-id: link to svn.lifesci.dundee.ac.uk/svn/barton/ptroshin/JABA2@4283 e3abac25-378b-4346-85de-24260fe3988d --- diff --git a/TODO.txt b/TODO.txt index 0e7c5d0..7ca2625 100644 --- a/TODO.txt +++ b/TODO.txt @@ -2,7 +2,8 @@ TODO write some help on the executable.properties within this file! check that after binaries relocation build tasks point to correct locations! GET rid of binaries/help directory! update binaries -make sure conf files are optional! + + muscle binary version 3.8.31 for win/lin32/lin64 and sources, docs in website/prog_docs ++ make sure conf files are optional! TODO: Registry 1 week diff --git a/binaries/linuxAMD64/muscle b/binaries/linuxAMD64/muscle index 0728a18..569be49 100644 Binary files a/binaries/linuxAMD64/muscle and b/binaries/linuxAMD64/muscle differ diff --git a/binaries/linuxIA32/muscle b/binaries/linuxIA32/muscle index 4a0f181..da71ba3 100644 Binary files a/binaries/linuxIA32/muscle and b/binaries/linuxIA32/muscle differ diff --git a/binaries/src/muscle/Makefile b/binaries/src/muscle/Makefile index fb62886..b93f11d 100644 --- a/binaries/src/muscle/Makefile +++ b/binaries/src/muscle/Makefile @@ -1,37 +1,3 @@ -# Porting notes: -# For Solaris and other platforms where the logf function -# is missing from the math library, add the following line -# to the end of muscle.h: -# #define logf(x) ((float) log(x)) -# Using -static increases the executable size and thus gives a very -# small increase in start time, but is more portable (the binding -# to dynamic libraries often breaks when a new library is released). -# On OSX, using -static gives the error "ld: can't locate file for: -lcrt0.o", -# this is fixed by deleting "-static" from the LDLIBS line. - -CFLAGS = -O3 -funroll-loops -Winline -DNDEBUG=1 -# LDLIBS = -lm -static -LDLIBS = -lm - -OBJ = .o -EXE = - -RM = rm -f -CP = cp - -GPP = g++ -LD = $(GPP) $(CFLAGS) -CPP = $(GPP) -c $(CFLAGS) - - -all: muscle - -CPPSRC = $(sort $(wildcard *.cpp)) -CPPOBJ = $(subst .cpp,.o,$(CPPSRC)) - -$(CPPOBJ): %.o: %.cpp - $(CPP) $< -o $@ - -muscle: $(CPPOBJ) - $(LD) -o muscle $(CPPOBJ) $(LDLIBS) - strip muscle +muscle: + chmod +x ./mk + ./mk diff --git a/binaries/src/muscle/README.txt b/binaries/src/muscle/README.txt new file mode 100644 index 0000000..8e4afd2 --- /dev/null +++ b/binaries/src/muscle/README.txt @@ -0,0 +1,27 @@ +MUSCLE v3.0 source code README +------------------------------ + +http://www.drive5.com/muscle + +This version of MUSCLE was built and tested on two platforms: +Windows XP and Red Hat Linux 8.0. + +On Windows, I used Microsoft Visual C++ .Net, which I find +to be the best C++ compile / edit / test environment I've +tried on any platform. The Microsoft project file is +muscle.vcproj. + +The Linux make file is Makefile. This is a very simple-minded +make file (because I am a Linux development novice), so should +be easy to understand. By default, it uses shared libraries, +but I found this to give problems when copying between +different Linux versions. The fix was to use the linker +flag -lm static (commented out), which gives a much bigger +but more portable binary. The posted binary was linked with +static libraries. + +The source code was not written to be maintained by anyone +but me, so the usual apologies and caveats apply. + +Bob Edgar, +January 2004 diff --git a/binaries/src/muscle/aln.cpp b/binaries/src/muscle/aln.cpp index fe2053f..4452bc8 100644 --- a/binaries/src/muscle/aln.cpp +++ b/binaries/src/muscle/aln.cpp @@ -17,7 +17,7 @@ void MSA::ToAlnFile(TextFile &File) const else { File.PutString("MUSCLE (" - MUSCLE_MAJOR_VERSION "." MUSCLE_MINOR_VERSION ")" + SHORT_VERSION ")" " multiple sequence alignment\n"); File.PutString("\n"); } diff --git a/binaries/src/muscle/color.cpp b/binaries/src/muscle/color.cpp index e32a787..f0aa00f 100644 --- a/binaries/src/muscle/color.cpp +++ b/binaries/src/muscle/color.cpp @@ -55,7 +55,7 @@ static int toi_tab[26] = 15, // R 16, // S 17, // T - -1, // U + 17, // U 18, // V 19, // W 20, // X diff --git a/binaries/src/muscle/domuscle.cpp b/binaries/src/muscle/domuscle.cpp index c7a843d..73c5534 100644 --- a/binaries/src/muscle/domuscle.cpp +++ b/binaries/src/muscle/domuscle.cpp @@ -120,7 +120,7 @@ void DoMuscle() { // Discourage users... if (!g_bUseTreeNoWarn) - fprintf(stderr, g_strUseTreeWarning); + fprintf(stderr, "%s", g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName); diff --git a/binaries/src/muscle/gatest.cpp b/binaries/src/muscle/gatest.cpp new file mode 100644 index 0000000..35b6ffc --- /dev/null +++ b/binaries/src/muscle/gatest.cpp @@ -0,0 +1,32 @@ +#include "muscle.h" +#include "pwpath.h" +#include "timing.h" +#include "textfile.h" +#include "msa.h" +#include "profile.h" + +SCORE GlobalAlign(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + if (g_bDiags) + return GlobalAlignDiags(PA, uLengthA, PB, uLengthB, Path); + else + return GlobalAlignNoDiags(PA, uLengthA, PB, uLengthB, Path); + } + +SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + switch (g_PPScore) + { + case PPSCORE_LE: + return GlobalAlignLA(PA, uLengthA, PB, uLengthB, Path); + + case PPSCORE_SP: + return GlobalAlignNS(PA, uLengthA, PB, uLengthB, Path); + + case PPSCORE_SV: + return GlobalAlignSimple(PA, uLengthA, PB, uLengthB, Path); + } + return 0; + } diff --git a/binaries/src/muscle/glbalignla.cpp b/binaries/src/muscle/glbalignla.cpp new file mode 100644 index 0000000..f28f487 --- /dev/null +++ b/binaries/src/muscle/glbalignla.cpp @@ -0,0 +1,432 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +#define OCC 1 + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; +#if OCC + FCOUNT *OccA; + FCOUNT *OccB; +#endif + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. + uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 20; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; +#if OCC + delete[] DPM.OccA; + delete[] DPM.OccB; +#endif + } + + DPM.uLength = uLength; + + DPM.GapOpenA = new SCORE[uLength]; + DPM.GapOpenB = new SCORE[uLength]; + DPM.GapCloseA = new SCORE[uLength]; + DPM.GapCloseB = new SCORE[uLength]; +#if OCC + DPM.OccA = new FCOUNT[uLength]; + DPM.OccB = new FCOUNT[uLength]; +#endif + + DPM.SortOrderA = new unsigned*[uLength]; + DPM.FreqsA = new FCOUNT*[uLength]; + DPM.ScoreMxB = new SCORE*[20]; + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; + DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + DPM.ScoreMxB[uLetter] = new SCORE[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + { + DPM.SortOrderA[i] = new unsigned[20]; + DPM.FreqsA[i] = new FCOUNT[20]; + DPM.TraceBack[i] = new int[uLength]; + } + } + +SCORE GlobalAlignLA(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + +#if OCC + FCOUNT *OccA = DPM.OccA; + FCOUNT *OccB = DPM.OccB; +#endif + + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; +#if OCC + OccA[i] = PA[i].m_fOcc; +#endif + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; +#if OCC + OccB[j] = PB[j].m_fOcc; +#endif + } + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == &(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + if (0 == scoreSum) + MPrev[0] = -2.5; + else + { +#if OCC + MPrev[0] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[0]; +#else + MPrev[0] = (logf(scoreSum) - g_scoreCenter); +#endif + } + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + if (0 == scoreSum) + MPrev[j] = -2.5; + else + { +#if OCC + MPrev[j] = (logf(scoreSum) - g_scoreCenter)*OccA[0]*OccB[j] + + GapOpenB[0] + GapCloseB[j-1]; +#else + MPrev[j] = (logf(scoreSum) - g_scoreCenter) + + GapOpenB[0] + GapCloseB[j-1]; +#endif + } + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. + DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + +#if OCC + const FCOUNT OccAi = OccA[i]; +#endif + for (unsigned j = 0; j < uLengthB; ++j) + { + if (MCurr[j] == 0) + MCurr[j] = -2.5; + else +#if OCC + MCurr[j] = (logf(MCurr[j]) - g_scoreCenter)*OccAi*OccB[j]; +#else + MCurr[j] = (logf(MCurr[j]) - g_scoreCenter); +#endif + } + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. + assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos; + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/binaries/src/muscle/glbalignns.cpp b/binaries/src/muscle/glbalignns.cpp new file mode 100644 index 0000000..45827db --- /dev/null +++ b/binaries/src/muscle/glbalignns.cpp @@ -0,0 +1,374 @@ +#include "muscle.h" +#include "profile.h" +#include "pwpath.h" + +struct DP_MEMORY + { + unsigned uLength; + SCORE *GapOpenA; + SCORE *GapOpenB; + SCORE *GapCloseA; + SCORE *GapCloseB; + SCORE *MPrev; + SCORE *MCurr; + SCORE *MWork; + SCORE *DPrev; + SCORE *DCurr; + SCORE *DWork; + SCORE **ScoreMxB; + unsigned **SortOrderA; + unsigned *uDeletePos; + FCOUNT **FreqsA; + int **TraceBack; + }; + +static struct DP_MEMORY DPM; + +static void AllocDPMem(unsigned uLengthA, unsigned uLengthB) + { +// Max prefix length + unsigned uLength = (uLengthA > uLengthB ? uLengthA : uLengthB) + 1; + if (uLength < DPM.uLength) + return; + +// Add 256 to allow for future expansion and +// round up to next multiple of 32. + uLength += 256; + uLength += 32 - uLength%32; + + const unsigned uOldLength = DPM.uLength; + if (uOldLength > 0) + { + for (unsigned i = 0; i < uOldLength; ++i) + { + delete[] DPM.TraceBack[i]; + delete[] DPM.FreqsA[i]; + delete[] DPM.SortOrderA[i]; + } + for (unsigned n = 0; n < 20; ++n) + delete[] DPM.ScoreMxB[n]; + + delete[] DPM.MPrev; + delete[] DPM.MCurr; + delete[] DPM.MWork; + delete[] DPM.DPrev; + delete[] DPM.DCurr; + delete[] DPM.DWork; + delete[] DPM.uDeletePos; + delete[] DPM.GapOpenA; + delete[] DPM.GapOpenB; + delete[] DPM.GapCloseA; + delete[] DPM.GapCloseB; + delete[] DPM.SortOrderA; + delete[] DPM.FreqsA; + delete[] DPM.ScoreMxB; + delete[] DPM.TraceBack; + } + + DPM.uLength = uLength; + + DPM.GapOpenA = new SCORE[uLength]; + DPM.GapOpenB = new SCORE[uLength]; + DPM.GapCloseA = new SCORE[uLength]; + DPM.GapCloseB = new SCORE[uLength]; + + DPM.SortOrderA = new unsigned*[uLength]; + DPM.FreqsA = new FCOUNT*[uLength]; + DPM.ScoreMxB = new SCORE*[20]; + DPM.MPrev = new SCORE[uLength]; + DPM.MCurr = new SCORE[uLength]; + DPM.MWork = new SCORE[uLength]; + + DPM.DPrev = new SCORE[uLength]; + DPM.DCurr = new SCORE[uLength]; + DPM.DWork = new SCORE[uLength]; + DPM.uDeletePos = new unsigned[uLength]; + + DPM.TraceBack = new int*[uLength]; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + DPM.ScoreMxB[uLetter] = new SCORE[uLength]; + + for (unsigned i = 0; i < uLength; ++i) + { + DPM.SortOrderA[i] = new unsigned[20]; + DPM.FreqsA[i] = new FCOUNT[20]; + DPM.TraceBack[i] = new int[uLength]; + } + } + +SCORE GlobalAlignNS(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, + unsigned uLengthB, PWPath &Path) + { + const unsigned uPrefixCountA = uLengthA + 1; + const unsigned uPrefixCountB = uLengthB + 1; + + AllocDPMem(uLengthA, uLengthB); + + SCORE *GapOpenA = DPM.GapOpenA; + SCORE *GapOpenB = DPM.GapOpenB; + SCORE *GapCloseA = DPM.GapCloseA; + SCORE *GapCloseB = DPM.GapCloseB; + + unsigned **SortOrderA = DPM.SortOrderA; + FCOUNT **FreqsA = DPM.FreqsA; + SCORE **ScoreMxB = DPM.ScoreMxB; + SCORE *MPrev = DPM.MPrev; + SCORE *MCurr = DPM.MCurr; + SCORE *MWork = DPM.MWork; + + SCORE *DPrev = DPM.DPrev; + SCORE *DCurr = DPM.DCurr; + SCORE *DWork = DPM.DWork; + unsigned *uDeletePos = DPM.uDeletePos; + + int **TraceBack = DPM.TraceBack; + + for (unsigned i = 0; i < uLengthA; ++i) + { + GapOpenA[i] = PA[i].m_scoreGapOpen; + GapCloseA[i] = PA[i].m_scoreGapClose; + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + SortOrderA[i][uLetter] = PA[i].m_uSortOrder[uLetter]; + FreqsA[i][uLetter] = PA[i].m_fcCounts[uLetter]; + } + } + + for (unsigned j = 0; j < uLengthB; ++j) + { + GapOpenB[j] = PB[j].m_scoreGapOpen; + GapCloseB[j] = PB[j].m_scoreGapClose; + } + + for (unsigned uLetter = 0; uLetter < 20; ++uLetter) + { + for (unsigned j = 0; j < uLengthB; ++j) + ScoreMxB[uLetter][j] = PB[j].m_AAScores[uLetter]; + } + + for (unsigned i = 0; i < uPrefixCountA; ++i) + memset(TraceBack[i], 0, uPrefixCountB*sizeof(int)); + +// Special case for i=0 + unsigned **ptrSortOrderA = SortOrderA; + FCOUNT **ptrFreqsA = FreqsA; + assert(ptrSortOrderA == &(SortOrderA[0])); + assert(ptrFreqsA == &(FreqsA[0])); + TraceBack[0][0] = 0; + + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][0]; + } + MPrev[0] = scoreSum - g_scoreCenter; + +// D(0,0) is -infinity (requires I->D). + DPrev[0] = MINUS_INFINITY; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Only way to get M(0, j) looks like this: + // A ----X + // B XXXXX + // 0 j + // So gap-open at j=0, gap-close at j-1. + SCORE scoreSum = 0; + unsigned *ptrSortOrderAi = SortOrderA[0]; + const unsigned *ptrSortOrderAEnd = ptrSortOrderAi + 20; + FCOUNT *ptrFreqsAi = FreqsA[0]; + for (; ptrSortOrderAi != ptrSortOrderAEnd; ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + const FCOUNT fcLetter = ptrFreqsAi[uLetter]; + if (0 == fcLetter) + break; + scoreSum += fcLetter*ScoreMxB[uLetter][j]; + } + MPrev[j] = scoreSum - g_scoreCenter + GapOpenB[0] + GapCloseB[j-1]; + TraceBack[0][j] = -(int) j; + + // Assume no D->I transitions, then can't be a delete if only + // one letter from A. + DPrev[j] = MINUS_INFINITY; + } + + SCORE IPrev_j_1; + for (unsigned i = 1; i < uLengthA; ++i) + { + ++ptrSortOrderA; + ++ptrFreqsA; + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + SCORE *ptrMCurr_j = MCurr; + memset(ptrMCurr_j, 0, uLengthB*sizeof(SCORE)); + const FCOUNT *FreqsAi = *ptrFreqsA; + + const unsigned *SortOrderAi = *ptrSortOrderA; + const unsigned *ptrSortOrderAiEnd = SortOrderAi + 20; + const SCORE *ptrMCurrMax = MCurr + uLengthB; + for (const unsigned *ptrSortOrderAi = SortOrderAi; + ptrSortOrderAi != ptrSortOrderAiEnd; + ++ptrSortOrderAi) + { + const unsigned uLetter = *ptrSortOrderAi; + SCORE *NSBR_Letter = ScoreMxB[uLetter]; + const FCOUNT fcLetter = FreqsAi[uLetter]; + if (0 == fcLetter) + break; + SCORE *ptrNSBR = NSBR_Letter; + for (SCORE *ptrMCurr = MCurr; ptrMCurr != ptrMCurrMax; ++ptrMCurr) + *ptrMCurr += fcLetter*(*ptrNSBR++); + } + + for (unsigned j = 0; j < uLengthB; ++j) + MCurr[j] -= g_scoreCenter; + + ptrMCurr_j = MCurr; + unsigned *ptrDeletePos = uDeletePos; + + // Special case for j=0 + // Only way to get M(i, 0) looks like this: + // 0 i + // A XXXXX + // B ----X + // So gap-open at i=0, gap-close at i-1. + assert(ptrMCurr_j == &(MCurr[0])); + *ptrMCurr_j += GapOpenA[0] + GapCloseA[i-1]; + + ++ptrMCurr_j; + + int *ptrTraceBack_ij = TraceBack[i]; + *ptrTraceBack_ij++ = (int) i; + + SCORE *ptrMPrev_j = MPrev; + SCORE *ptrDPrev = DPrev; + SCORE d = *ptrDPrev; + SCORE DNew = *ptrMPrev_j + GapOpenA[i]; + if (DNew > d) + { + d = DNew; + *ptrDeletePos = i; + } + + SCORE *ptrDCurr = DCurr; + + assert(ptrDCurr == &(DCurr[0])); + *ptrDCurr = d; + + // Can't have an insert if no letters from B + IPrev_j_1 = MINUS_INFINITY; + + unsigned uInsertPos; + const SCORE scoreGapOpenAi = GapOpenA[i]; + const SCORE scoreGapCloseAi_1 = GapCloseA[i-1]; + + for (unsigned j = 1; j < uLengthB; ++j) + { + // Here, MPrev_j is preserved from previous + // iteration so with current i,j is M[i-1][j-1] + SCORE MPrev_j = *ptrMPrev_j; + SCORE INew = MPrev_j + GapOpenB[j]; + if (INew > IPrev_j_1) + { + IPrev_j_1 = INew; + uInsertPos = j; + } + + SCORE scoreMax = MPrev_j; + + assert(ptrDPrev == &(DPrev[j-1])); + SCORE scoreD = *ptrDPrev++ + scoreGapCloseAi_1; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + assert(ptrDeletePos == &(uDeletePos[j-1])); + *ptrTraceBack_ij = (int) i - (int) *ptrDeletePos; + assert(*ptrTraceBack_ij > 0); + } + ++ptrDeletePos; + + SCORE scoreI = IPrev_j_1 + GapCloseB[j-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + *ptrTraceBack_ij = (int) uInsertPos - (int) j; + assert(*ptrTraceBack_ij < 0); + } + + assert(ptrSortOrderA == &(SortOrderA[i])); + assert(ptrFreqsA == &(FreqsA[i])); + + *ptrMCurr_j += scoreMax; + assert(ptrMCurr_j == &(MCurr[j])); + ++ptrMCurr_j; + + MPrev_j = *(++ptrMPrev_j); + assert(ptrDPrev == &(DPrev[j])); + SCORE d = *ptrDPrev; + SCORE DNew = MPrev_j + scoreGapOpenAi; + if (DNew > d) + { + d = DNew; + assert(ptrDeletePos == &uDeletePos[j]); + *ptrDeletePos = i; + } + assert(ptrDCurr + 1 == &(DCurr[j])); + *(++ptrDCurr) = d; + + ++ptrTraceBack_ij; + } + + Rotate(MPrev, MCurr, MWork); + Rotate(DPrev, DCurr, DWork); + } + +// Special case for i=uLengthA + SCORE IPrev = MINUS_INFINITY; + + unsigned uInsertPos; + + for (unsigned j = 1; j < uLengthB; ++j) + { + SCORE INew = MPrev[j-1] + GapOpenB[j]; + if (INew > IPrev) + { + uInsertPos = j; + IPrev = INew; + } + } + +// Special case for i=uLengthA, j=uLengthB + SCORE scoreMax = MPrev[uLengthB-1]; + int iTraceBack = 0; + + SCORE scoreD = DPrev[uLengthB-1] + GapCloseA[uLengthA-1]; + if (scoreD > scoreMax) + { + scoreMax = scoreD; + iTraceBack = (int) uLengthA - (int) uDeletePos[uLengthB-1]; + } + + SCORE scoreI = IPrev + GapCloseB[uLengthB-1]; + if (scoreI > scoreMax) + { + scoreMax = scoreI; + iTraceBack = (int) uInsertPos - (int) uLengthB; + } + + TraceBack[uLengthA][uLengthB] = iTraceBack; + + TraceBackToPath(TraceBack, uLengthA, uLengthB, Path); + + return scoreMax; + } diff --git a/binaries/src/muscle/globals.cpp b/binaries/src/muscle/globals.cpp index ca56fa0..b1a51a0 100644 --- a/binaries/src/muscle/globals.cpp +++ b/binaries/src/muscle/globals.cpp @@ -45,7 +45,7 @@ void Log(const char szFormat[], ...) return; static FILE *f = NULL; - char *mode; + const char *mode; if (g_bListFileAppend) mode = "a"; else diff --git a/binaries/src/muscle/globalsosx.cpp b/binaries/src/muscle/globalsosx.cpp new file mode 100644 index 0000000..a93385e --- /dev/null +++ b/binaries/src/muscle/globalsosx.cpp @@ -0,0 +1,92 @@ +#ifdef __MACH__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const double DEFAULT_RAM = 1e9; +const double DEFAULT_MEM_USE = 1e6; + +double GetNAN() + { + static unsigned long nan[2]={0xffffffff, 0x7fffffff}; + double dNAN = *( double* )nan; + return dNAN; + } + +double g_dNAN = GetNAN(); + + +double GetRAMSize() + { + static double CACHED_RAM = 0; + if (CACHED_RAM != 0) + return CACHED_RAM; + + uint64_t MemPages = 0; + size_t Len = sizeof(MemPages); + if (sysctlbyname("hw.memsize", &MemPages, &Len, NULL, 0) < 0) + return DEFAULT_RAM; + return (double) MemPages; + } + +double GetRAMSizeMB() + { + return GetRAMSize()/1e6; + } + +static double g_uPeakMemUseBytes; + +double GetMaxMemUseBytes() + { + return g_uPeakMemUseBytes; + } + +double GetPeakMemUseBytes() + { + return GetMaxMemUseBytes(); + } + +double GetMemUseBytes() + { + task_t mytask = mach_task_self(); + struct task_basic_info ti; + memset((void *) &ti, 0, sizeof(ti)); + mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT; + kern_return_t ok = task_info(mytask, TASK_BASIC_INFO, (task_info_t) &ti, &count); + if (ok == KERN_INVALID_ARGUMENT) + return DEFAULT_MEM_USE; + + if (ok != KERN_SUCCESS) + return DEFAULT_MEM_USE; + + double uBytes = (double ) ti.resident_size; + if (uBytes > g_uPeakMemUseBytes) + g_uPeakMemUseBytes = uBytes; + return uBytes; + } + +double GetMemUseMB() + { + return GetMemUseBytes()/1e6; + } + +void OSInit() + { + } + +#endif // __MACH__ diff --git a/binaries/src/muscle/globalsother.cpp b/binaries/src/muscle/globalsother.cpp index ba3517d..a1acf0a 100644 --- a/binaries/src/muscle/globalsother.cpp +++ b/binaries/src/muscle/globalsother.cpp @@ -1,6 +1,6 @@ #include "muscle.h" -#if !defined(__linux__) && !defined(_MSC_VER) +#if !defined(__linux__) && !defined(_MSC_VER) && !defined(__MACH__) double GetNAN() { diff --git a/binaries/src/muscle/intmath.cpp b/binaries/src/muscle/intmath.cpp index 5a09dd4..40c25bb 100644 --- a/binaries/src/muscle/intmath.cpp +++ b/binaries/src/muscle/intmath.cpp @@ -8,31 +8,33 @@ PROB ScoreToProb(SCORE Score) return (PROB) pow(2.0, (double) Score/INTSCALE); } -static const double log2e = log2(exp(1.0)); - -double lnTolog2(double ln) - { - return ln*log2e; - } - -double log2(double x) - { - if (0 == x) - return MINUS_INFINITY; - - static const double dInvLn2 = 1.0/log(2.0); -// Multiply by inverse of log(2) just in case multiplication -// is faster than division. - return log(x)*dInvLn2; - } - -SCORE ProbToScore(PROB Prob) - { - if (0.0 == Prob) - return MINUS_INFINITY; -// return (SCORE) floor(INTSCALE*log2(Prob)); - return (SCORE) log2(Prob); - } +//#if 0 +//static const double log2e = log2(exp(1.0)); +// +//double lnTolog2(double ln) +// { +// return ln*log2e; +// } +// +//double log2(double x) +// { +// if (0 == x) +// return MINUS_INFINITY; +// +// static const double dInvLn2 = 1.0/log(2.0); +//// Multiply by inverse of log(2) just in case multiplication +//// is faster than division. +// return log(x)*dInvLn2; +// } +//#endif + +//SCORE ProbToScore(PROB Prob) +// { +// if (0.0 == Prob) +// return MINUS_INFINITY; +//// return (SCORE) floor(INTSCALE*log2(Prob)); +// return (SCORE) log2(Prob); +// } WEIGHT DoubleToWeight(double d) { @@ -69,97 +71,97 @@ bool BTEq(double b1, double b2) return BTEq2((BASETYPE) b1, (BASETYPE) b2); } -const double dLn2 = log(2.0); - -// pow2(x)=2^x -double pow2(double x) - { - if (MINUS_INFINITY == x) - return 0; - return exp(x*dLn2); - } - -// lp2(x) = log2(1 + 2^-x), x >= 0 -double lp2(double x) - { - return log2(1 + pow2(-x)); - } - -// SumLog(x, y) = log2(2^x + 2^y) -SCORE SumLog(SCORE x, SCORE y) - { - return (SCORE) log2(pow2(x) + pow2(y)); - } - -// SumLog(x, y, z) = log2(2^x + 2^y + 2^z) -SCORE SumLog(SCORE x, SCORE y, SCORE z) - { - return (SCORE) log2(pow2(x) + pow2(y) + pow2(z)); - } +//const double dLn2 = log(2.0); -// SumLog(w, x, y, z) = log2(2^w + 2^x + 2^y + 2^z) -SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z) - { - return (SCORE) log2(pow2(w) + pow2(x) + pow2(y) + pow2(z)); - } +//// pow2(x)=2^x +//double pow2(double x) +// { +// if (MINUS_INFINITY == x) +// return 0; +// return exp(x*dLn2); +// } -SCORE lp2Fast(SCORE x) - { - assert(x >= 0); - const int iTableSize = 1000; - const double dRange = 20.0; - const double dScale = dRange/iTableSize; - static SCORE dValue[iTableSize]; - static bool bInit = false; - if (!bInit) - { - for (int i = 0; i < iTableSize; ++i) - dValue[i] = (SCORE) lp2(i*dScale); - bInit = true; - } - if (x >= dRange) - return 0.0; - int i = (int) (x/dScale); - assert(i >= 0 && i < iTableSize); - SCORE dResult = dValue[i]; - assert(BTEq(dResult, lp2(x))); - return dResult; - } +//// lp2(x) = log2(1 + 2^-x), x >= 0 +//double lp2(double x) +// { +// return log2(1 + pow2(-x)); +// } // SumLog(x, y) = log2(2^x + 2^y) -SCORE SumLogFast(SCORE x, SCORE y) - { - if (MINUS_INFINITY == x) - { - if (MINUS_INFINITY == y) - return MINUS_INFINITY; - return y; - } - else if (MINUS_INFINITY == y) - return x; - - SCORE dResult; - if (x > y) - dResult = x + lp2Fast(x-y); - else - dResult = y + lp2Fast(y-x); - assert(SumLog(x, y) == dResult); - return dResult; - } - -SCORE SumLogFast(SCORE x, SCORE y, SCORE z) - { - SCORE dResult = SumLogFast(x, SumLogFast(y, z)); - assert(SumLog(x, y, z) == dResult); - return dResult; - } - -SCORE SumLogFast(SCORE w, SCORE x, SCORE y, SCORE z) - { - SCORE dResult = SumLogFast(SumLogFast(w, x), SumLogFast(y, z)); - assert(SumLog(w, x, y, z) == dResult); - return dResult; - } +//SCORE SumLog(SCORE x, SCORE y) +// { +// return (SCORE) log2(pow2(x) + pow2(y)); +// } +// +//// SumLog(x, y, z) = log2(2^x + 2^y + 2^z) +//SCORE SumLog(SCORE x, SCORE y, SCORE z) +// { +// return (SCORE) log2(pow2(x) + pow2(y) + pow2(z)); +// } +// +//// SumLog(w, x, y, z) = log2(2^w + 2^x + 2^y + 2^z) +//SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z) +// { +// return (SCORE) log2(pow2(w) + pow2(x) + pow2(y) + pow2(z)); +// } + +//SCORE lp2Fast(SCORE x) +// { +// assert(x >= 0); +// const int iTableSize = 1000; +// const double dRange = 20.0; +// const double dScale = dRange/iTableSize; +// static SCORE dValue[iTableSize]; +// static bool bInit = false; +// if (!bInit) +// { +// for (int i = 0; i < iTableSize; ++i) +// dValue[i] = (SCORE) lp2(i*dScale); +// bInit = true; +// } +// if (x >= dRange) +// return 0.0; +// int i = (int) (x/dScale); +// assert(i >= 0 && i < iTableSize); +// SCORE dResult = dValue[i]; +// assert(BTEq(dResult, lp2(x))); +// return dResult; +// } +// +//// SumLog(x, y) = log2(2^x + 2^y) +//SCORE SumLogFast(SCORE x, SCORE y) +// { +// if (MINUS_INFINITY == x) +// { +// if (MINUS_INFINITY == y) +// return MINUS_INFINITY; +// return y; +// } +// else if (MINUS_INFINITY == y) +// return x; +// +// SCORE dResult; +// if (x > y) +// dResult = x + lp2Fast(x-y); +// else +// dResult = y + lp2Fast(y-x); +// assert(SumLog(x, y) == dResult); +// return dResult; +// } +// +//SCORE SumLogFast(SCORE x, SCORE y, SCORE z) +// { +// SCORE dResult = SumLogFast(x, SumLogFast(y, z)); +// assert(SumLog(x, y, z) == dResult); +// return dResult; +// } + +//SCORE SumLogFast(SCORE w, SCORE x, SCORE y, SCORE z) +// { +// SCORE dResult = SumLogFast(SumLogFast(w, x), SumLogFast(y, z)); +// assert(SumLog(w, x, y, z) == dResult); +// return dResult; +// } double VecSum(const double v[], unsigned n) { diff --git a/binaries/src/muscle/main.cpp b/binaries/src/muscle/main.cpp index 1bfb274..7993c7b 100644 --- a/binaries/src/muscle/main.cpp +++ b/binaries/src/muscle/main.cpp @@ -9,6 +9,10 @@ #include // for isatty() #endif +const char *MUSCLE_LONG_VERSION = "MUSCLE v" SHORT_VERSION "." +#include "svnversion.h" +" by Robert C. Edgar"; + int g_argc; char **g_argv; @@ -36,7 +40,7 @@ int main(int argc, char **argv) if (g_bVersion) { - printf(MUSCLE_LONG_VERSION "\n"); + printf("%s\n", MUSCLE_LONG_VERSION); exit(EXIT_SUCCESS); } diff --git a/binaries/src/muscle/make.err b/binaries/src/muscle/make.err new file mode 100644 index 0000000..e69de29 diff --git a/binaries/src/muscle/make.out b/binaries/src/muscle/make.out new file mode 100644 index 0000000..fc8af25 --- /dev/null +++ b/binaries/src/muscle/make.out @@ -0,0 +1,2 @@ +g++ -O3 -march=pentiumpro -mcpu=pentiumpro -funroll-loops -Winline -DNDEBUG=1 -o muscle aligngivenpath.o aligngivenpathsw.o aligntwomsas.o aligntwoprofs.o alpha.o anchors.o blosumla.o clust.o cluster.o clwwt.o cons.o diaglist.o difftrees.o difftreese.o distcalc.o distfunc.o domuscle.o dosp.o dpreglist.o edgelist.o enumopts.o enumtostr.o estring.o fasta.o fastclust.o fastdist.o fastdistjones.o fastdistkbit.o fastdistkmer.o fastdistmafft.o fastscorepath2.o finddiags.o glbalign.o glbaligndiag.o glbalignle.o glbalignsimple.o glbalignsp.o globals.o globalslinux.o globalswin32.o gonnet.o gotowt.o henikoffweight.o henikoffweightpb.o hydro.o intmath.o local.o main.o makerootmsa.o mpam200.o msa.o msa2.o msadistkimura.o msf.o objscore.o objscore2.o onexception.o options.o pam200mafft.o params.o phy.o phy2.o phy3.o phy4.o phyfromclust.o phyfromfile.o phytofile.o posgap.o profile.o profilefrommsa.o progalign.o progress.o progressivealign.o pwpath.o realigndiffs.o realigndiffse.o refine.o refinehoriz.o refinesubfams.o refinetree.o refinetreee.o refinevert.o savebest.o scorehistory.o scoremx.o seq.o seqvect.o setblosumweights.o setgscweights.o setnewhandler.o sw.o textfile.o threewaywt.o traceback.o tracebackopt.o tracebacksw.o treefrommsa.o typetostr.o upgma2.o usage.o validateids.o vtml2.o -lm -static +strip muscle diff --git a/binaries/src/muscle/makerootmsa.cpp b/binaries/src/muscle/makerootmsa.cpp index 9c3e0f4..e83c3c1 100644 --- a/binaries/src/muscle/makerootmsa.cpp +++ b/binaries/src/muscle/makerootmsa.cpp @@ -66,6 +66,7 @@ static void MakeRootSeq(const Seq &s, const Tree &GuideTree, unsigned uLeafNodeI const PWPath &Path = Nodes[uNodeIndex].m_Path; Seq sTmp; PathSeq(sRoot, Path, bRight, sTmp); + sTmp.SetId(0); sRoot.Copy(sTmp); } } @@ -197,12 +198,12 @@ void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], sRootE.LogMe(); Quit("Root seqs differ"); } -#endif - #if TRACE Log("MakeRootSeq=\n"); sRoot.LogMe(); #endif +#endif + if (uInsane == uColCount) { uColCount = sRootE.Length(); diff --git a/binaries/src/muscle/mk b/binaries/src/muscle/mk new file mode 100644 index 0000000..475d25a --- /dev/null +++ b/binaries/src/muscle/mk @@ -0,0 +1,21 @@ +#!/bin/bash +CPPNames='aligngivenpath aligngivenpathsw aligntwomsas aligntwoprofs aln alpha anchors bittraceback blosum62 blosumla clust cluster clwwt color cons diaglist diffobjscore diffpaths difftrees difftreese distcalc distfunc distpwkimura domuscle dosp dpreglist drawtree edgelist enumopts enumtostr estring fasta fasta2 fastclust fastdist fastdistjones fastdistkbit fastdistkmer fastdistmafft fastdistnuc fastscorepath2 finddiags finddiagsn glbalign glbalign352 glbaligndiag glbalignle glbalignsimple glbalignsp glbalignspn glbalignss glbalndimer globals globalslinux globalsosx globalsother globalswin32 gonnet henikoffweight henikoffweightpb html hydro intmath local main makerootmsa makerootmsab maketree mhack mpam200 msa msa2 msadistkimura msf muscle muscleout nucmx nwdasimple nwdasimple2 nwdasmall nwrec nwsmall objscore objscore2 objscoreda onexception options outweights pam200mafft params phy phy2 phy3 phy4 phyfromclust phyfromfile physeq phytofile posgap ppscore profdb profile profilefrommsa progalign progress progressivealign pwpath readmx realigndiffs realigndiffse refine refinehoriz refinesubfams refinetree refinetreee refinevert refinew savebest scoredist scoregaps scorehistory scorepp seq seqvect setblosumweights setgscweights setnewhandler spfast sptest stabilize subfam subfams sw termgaps textfile threewaywt tomhydro traceback tracebackopt tracebacksw treefrommsa typetostr upgma2 usage validateids vtml2 writescorefile' +ObjNames='aligngivenpath.o aligngivenpathsw.o aligntwomsas.o aligntwoprofs.o aln.o alpha.o anchors.o bittraceback.o blosum62.o blosumla.o clust.o cluster.o clwwt.o color.o cons.o diaglist.o diffobjscore.o diffpaths.o difftrees.o difftreese.o distcalc.o distfunc.o distpwkimura.o domuscle.o dosp.o dpreglist.o drawtree.o edgelist.o enumopts.o enumtostr.o estring.o fasta.o fasta2.o fastclust.o fastdist.o fastdistjones.o fastdistkbit.o fastdistkmer.o fastdistmafft.o fastdistnuc.o fastscorepath2.o finddiags.o finddiagsn.o glbalign.o glbalign352.o glbaligndiag.o glbalignle.o glbalignsimple.o glbalignsp.o glbalignspn.o glbalignss.o glbalndimer.o globals.o globalslinux.o globalsosx.o globalsother.o globalswin32.o gonnet.o henikoffweight.o henikoffweightpb.o html.o hydro.o intmath.o local.o main.o makerootmsa.o makerootmsab.o maketree.o mhack.o mpam200.o msa.o msa2.o msadistkimura.o msf.o muscle.o muscleout.o nucmx.o nwdasimple.o nwdasimple2.o nwdasmall.o nwrec.o nwsmall.o objscore.o objscore2.o objscoreda.o onexception.o options.o outweights.o pam200mafft.o params.o phy.o phy2.o phy3.o phy4.o phyfromclust.o phyfromfile.o physeq.o phytofile.o posgap.o ppscore.o profdb.o profile.o profilefrommsa.o progalign.o progress.o progressivealign.o pwpath.o readmx.o realigndiffs.o realigndiffse.o refine.o refinehoriz.o refinesubfams.o refinetree.o refinetreee.o refinevert.o refinew.o savebest.o scoredist.o scoregaps.o scorehistory.o scorepp.o seq.o seqvect.o setblosumweights.o setgscweights.o setnewhandler.o spfast.o sptest.o stabilize.o subfam.o subfams.o sw.o termgaps.o textfile.o threewaywt.o tomhydro.o traceback.o tracebackopt.o tracebacksw.o treefrommsa.o typetostr.o upgma2.o usage.o validateids.o vtml2.o writescorefile.o' + +rm -f *.o muscle.make.stdout.txt muscle.make.stderr.txt +for CPPName in $CPPNames +do + echo $CPPName >> /dev/tty + g++ $ENV_GCC_OPTS -c -O3 -msse2 -mfpmath=sse -D_FILE_OFFSET_BITS=64 -DNDEBUG=1 $CPPName.cpp -o $CPPName.o >> muscle.make.stdout.txt 2>> muscle.make.stderr.txt +done + +LINK_OPTS= +if [ `uname -s` == Linux ] ; then + LINK_OPTS=-static +fi +g++ $LINK_OPTS $ENV_LINK_OPTS -g -o muscle $ObjNames >> muscle.make.stdout.txt 2>> muscle.make.stderr.txt +tail muscle.make.stderr.txt + +strip muscle +ls -lh muscle +sum muscle diff --git a/binaries/src/muscle/msadistkimura.h b/binaries/src/muscle/msadistkimura.h new file mode 100644 index 0000000..3a386ce --- /dev/null +++ b/binaries/src/muscle/msadistkimura.h @@ -0,0 +1,13 @@ +#ifndef MSADistKimura_h +#define MSADistKimura_h + +#include "msadist.h" + +class MSADistKimura : public MSADist + { +public: + virtual double ComputeDist(const MSA &msa, unsigned uSeqIndex1, + unsigned uSeqIndex2); + }; + +#endif // MSADistKimura_h diff --git a/binaries/src/muscle/msadistmafft.h b/binaries/src/muscle/msadistmafft.h new file mode 100644 index 0000000..56772e1 --- /dev/null +++ b/binaries/src/muscle/msadistmafft.h @@ -0,0 +1,24 @@ +#ifndef MSADistMAFFT_h +#define MSADistMAFFT_h + +#include "msadist.h" +#include + +extern double PctIdToMAFFTDist(double dPctId); + +class MSADistMAFFT : public MSADist + { +public: + virtual double ComputeDist(const MSA &msa, unsigned uSeqIndex1, + unsigned uSeqIndex2) + { + double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); + //if (dPctId < 0.05) + // dPctId = 0.05; + //double dDist = -log(dPctId); + //return dDist; + return PctIdToMAFFTDist(dPctId); + } + }; + +#endif // MSADistMAFFT_h diff --git a/binaries/src/muscle/muscle b/binaries/src/muscle/muscle deleted file mode 100644 index 4a0f181..0000000 Binary files a/binaries/src/muscle/muscle and /dev/null differ diff --git a/binaries/src/muscle/muscle.h b/binaries/src/muscle/muscle.h index 1c944b2..30c7b6d 100644 --- a/binaries/src/muscle/muscle.h +++ b/binaries/src/muscle/muscle.h @@ -17,9 +17,8 @@ #pragma warning(disable : 4996) // deprecated names like strdup, isatty. #endif -#define MUSCLE_LONG_VERSION "MUSCLE v3.7 by Robert C. Edgar" -#define MUSCLE_MAJOR_VERSION "3" -#define MUSCLE_MINOR_VERSION "7" +extern const char *MUSCLE_LONG_VERSION; +#define SHORT_VERSION "3.8" #include #include diff --git a/binaries/src/muscle/muscle.vcproj b/binaries/src/muscle/muscle.vcproj new file mode 100644 index 0000000..226dac2 --- /dev/null +++ b/binaries/src/muscle/muscle.vcproj @@ -0,0 +1,922 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/binaries/src/muscle/muscle21 b/binaries/src/muscle/muscle21 new file mode 100644 index 0000000..45b995c Binary files /dev/null and b/binaries/src/muscle/muscle21 differ diff --git a/binaries/src/muscle/muscle21.vcproj b/binaries/src/muscle/muscle21.vcproj new file mode 100644 index 0000000..eadf4a8 --- /dev/null +++ b/binaries/src/muscle/muscle21.vcproj @@ -0,0 +1,423 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/binaries/src/muscle/onexception.cpp b/binaries/src/muscle/onexception.cpp index e74000f..f86b31f 100644 --- a/binaries/src/muscle/onexception.cpp +++ b/binaries/src/muscle/onexception.cpp @@ -8,8 +8,8 @@ static char szOnExceptionMessage[] = void OnException() { - fprintf(stderr, szOnExceptionMessage); - Log(szOnExceptionMessage); + fprintf(stderr, "%s", szOnExceptionMessage); + Log("%s", szOnExceptionMessage); Log("Finished %s\n", GetTimeAsStr()); exit(EXIT_Except); } diff --git a/binaries/src/muscle/params.cpp b/binaries/src/muscle/params.cpp index 70d39e3..ec4e221 100644 --- a/binaries/src/muscle/params.cpp +++ b/binaries/src/muscle/params.cpp @@ -566,6 +566,9 @@ void SetParams() FlagParam("PAS", &g_bPAS, true); FlagParam("MakeTree", &g_bMakeTree, true); + if (g_bStable) + Quit("-stable not supported in this version of muscle"); + bool b = false; FlagParam("clwstrict", &b, true); if (b) diff --git a/binaries/src/muscle/phy.cpp b/binaries/src/muscle/phy.cpp index a46a722..aaec915 100644 --- a/binaries/src/muscle/phy.cpp +++ b/binaries/src/muscle/phy.cpp @@ -442,7 +442,7 @@ void Tree::LogMe() const { Log("%5u ", n1); if (m_bHasEdgeLength1[uNodeIndex]) - Log("%7.3g ", m_dEdgeLength1[uNodeIndex]); + Log("%7.4f ", m_dEdgeLength1[uNodeIndex]); else Log(" * "); } @@ -453,7 +453,7 @@ void Tree::LogMe() const { Log("%5u ", n2); if (m_bHasEdgeLength2[uNodeIndex]) - Log("%7.3g ", m_dEdgeLength2[uNodeIndex]); + Log("%7.4f ", m_dEdgeLength2[uNodeIndex]); else Log(" * "); } @@ -464,7 +464,7 @@ void Tree::LogMe() const { Log("%5u ", n3); if (m_bHasEdgeLength3[uNodeIndex]) - Log("%7.3g ", m_dEdgeLength3[uNodeIndex]); + Log("%7.4f ", m_dEdgeLength3[uNodeIndex]); else Log(" * "); } diff --git a/binaries/src/muscle/progress.cpp b/binaries/src/muscle/progress.cpp index 104f306..a7fa01f 100644 --- a/binaries/src/muscle/progress.cpp +++ b/binaries/src/muscle/progress.cpp @@ -15,20 +15,6 @@ static bool g_bWipeDesc = false; static int g_nPrevDescLength; static unsigned g_uTotalSteps; -double GetCheckMemUseMB() - { - unsigned MB = (unsigned) GetMemUseMB(); - if (0 == g_uMaxMB || MB <= g_uMaxMB) - return MB; - fprintf(stderr, "\n\n*** MAX MEMORY %u MB EXCEEDED***\n", g_uMaxMB); - fprintf(stderr, "Memory allocated so far %u MB, physical RAM %u MB\n", - MB, (unsigned) GetRAMSizeMB()); - fprintf(stderr, "Use -maxmb option to increase limit, where is in MB.\n"); - SaveCurrentAlignment(); - exit(EXIT_FatalError); - return MB; - } - const char *ElapsedTimeAsStr() { time_t Now = time(0); @@ -41,7 +27,7 @@ const char *MemToStr(double MB) if (MB < 0) return ""; - static char Str[9]; + static char Str[16]; static double MaxMB = 0; static double RAMMB = 0; @@ -118,7 +104,7 @@ void Progress(const char *szFormat, ...) if (g_bQuiet) return; - double MB = GetCheckMemUseMB(); + double MB = GetMemUseMB(); char szStr[4096]; va_list ArgList; @@ -142,7 +128,7 @@ void Progress(unsigned uStep, unsigned uTotalSteps) return; double dPct = ((uStep + 1)*100.0)/uTotalSteps; - double MB = GetCheckMemUseMB(); + double MB = GetMemUseMB(); fprintf(g_fProgress, "%8.8s %12s Iter %3u %6.2f%% %s", ElapsedTimeAsStr(), MemToStr(MB), @@ -168,7 +154,7 @@ void ProgressStepsDone() if (g_bVerbose) { - double MB = GetCheckMemUseMB(); + double MB = GetMemUseMB(); Log("Elapsed time %8.8s Peak memory use %12s Iteration %3u %s\n", ElapsedTimeAsStr(), MemToStr(MB), diff --git a/binaries/src/muscle/redblack.cpp b/binaries/src/muscle/redblack.cpp new file mode 100644 index 0000000..7194653 --- /dev/null +++ b/binaries/src/muscle/redblack.cpp @@ -0,0 +1,471 @@ +#include "muscle.h" +#include "clust.h" + +void Clust::InsertMetric(unsigned uIndex1, unsigned uIndex2, float dMetric) + { + RBInsert(uIndex1, uIndex2, dMetric); + } + +void Clust::DeleteMetric(unsigned uIndex) + { + for (unsigned uNodeIndex = GetFirstCluster(); uNodeIndex != uInsane; + uNodeIndex = GetNextCluster(uNodeIndex)) + { + if (uIndex == uNodeIndex) + continue; + DeleteMetric(uIndex, uNodeIndex); + } + } + +void Clust::InitMetric(unsigned uMaxNodeIndex) + { + m_uRBNodeCount = m_uTriangularMatrixSize; + m_RBParent = new unsigned[m_uRBNodeCount]; + m_RBLeft = new unsigned[m_uRBNodeCount]; + m_RBRight = new unsigned[m_uRBNodeCount]; + m_RBi = new ushort[m_uRBNodeCount]; + m_RBj = new ushort[m_uRBNodeCount]; + m_RBMetric = new float[m_uRBNodeCount]; + m_RBColor = new bool[m_uRBNodeCount]; + m_RBRoot = RB_NIL; + +#if DEBUG + { +// Initialize fields to invalid values so we have a chance +// catch attempts to use them if they're not properly set. + unsigned InvalidNode = m_uRBNodeCount + 1; + for (unsigned Node = 0; Node < m_uRBNodeCount; ++Node) + { + m_RBParent[Node] = InvalidNode; + m_RBLeft[Node] = InvalidNode; + m_RBRight[Node] = InvalidNode; + m_RBi[Node] = InvalidNode; + m_RBj[Node] = InvalidNode; + } + } +#endif + } + +void Clust::ListMetric() const + { + Log("Red-black tree root=%u\n", m_RBRoot); + Log("\n"); + Log(" Node Parent Left Right Color i j Metric\n"); + Log("----- ------ ----- ----- ----- ----- ----- ------\n"); + + if (RB_NIL == m_RBRoot) + return; + + unsigned Count = 0; + unsigned Start = RBMin(m_RBRoot); + for (unsigned Node = Start; RB_NIL != Node; Node = RBNext(Node)) + { + Log("%5u", Node); + + if (RB_NIL != m_RBParent[Node]) + Log(" %6u", m_RBParent[Node]); + else + Log(" "); + + if (RB_NIL != m_RBLeft[Node]) + Log(" %5u", m_RBLeft[Node]); + else + Log(" "); + + if (RB_NIL != m_RBRight[Node]) + Log(" %5u", m_RBRight[Node]); + else + Log(" "); + + Log(" %s %5u %5u %g\n", + m_RBColor[Node] ? " Red" : "Black", + m_RBi[Node], + m_RBj[Node], + m_RBMetric[Node]); + + if (++Count > m_uRBNodeCount) + { + Log(" ** LOOP ** \n"); + break; + } + } + } + +// If there is a left subtree, predecessor is the +// largest key found under the left branch. Otherwise, +// is first node in path to root that is a right child. +unsigned Clust::RBPrev(unsigned Node) const + { + assert(Node < m_uRBNodeCount); + + unsigned Left = m_RBLeft[Node]; + if (RB_NIL != Left) + return RBMax(Left); + + for (;;) + { + unsigned Parent = m_RBParent[Node]; + if (RB_NIL == Parent) + return RB_NIL; + if (m_RBRight[Parent] == Node) + return Parent; + Node = Parent; + } + } + +// If there is a right subtree, sucessor is the +// smallest key found under the right branch. Otherwise, +// is first node in path to root that is a left child. +unsigned Clust::RBNext(unsigned Node) const + { + if (Node >= m_uRBNodeCount) + Quit("RBNext(%u)", Node); + assert(Node < m_uRBNodeCount); + + unsigned Right = m_RBRight[Node]; + if (RB_NIL != Right) + return RBMin(Right); + + for (;;) + { + unsigned Parent = m_RBParent[Node]; + if (RB_NIL == Parent) + return RB_NIL; + if (m_RBLeft[Parent] == Node) + return Parent; + Node = Parent; + } + } + +// Minimum is in leftmost leaf +unsigned Clust::RBMin(unsigned RBNode) const + { + assert(RB_NIL != RBNode); + for (;;) + { + unsigned Left = m_RBLeft[RBNode]; + if (RB_NIL == Left) + return RBNode; + RBNode = Left; + } + } + +// Maximum is in rightmost leaf +unsigned Clust::RBMax(unsigned RBNode) const + { + assert(RB_NIL != RBNode); + for (;;) + { + unsigned Right = m_RBRight[RBNode]; + if (RB_NIL == Right) + return RBNode; + RBNode = Right; + } + } + +void Clust::DeleteMetric(unsigned uIndex1, unsigned uIndex2) + { + unsigned RBNode = (unsigned) VectorIndex(uIndex1, uIndex2); + RBDelete(RBNode); + } + +void Clust::RBDelete(unsigned Node) + { +#if DEBUG + ValidateRB(); + //Log("@@ Before RBDelete(%u)\n", Node); + //ListMetric(); +#endif + + unsigned Left = m_RBLeft[Node]; + unsigned Right = m_RBRight[Node]; + unsigned Parent = m_RBParent[Node]; + +// If one or two nil children, splice out this node. + if (RB_NIL == Left || RB_NIL == Right) + { +// Log("@@ One child\n"); + // Child is non-NIL child, or NIL if none. + unsigned Child = (Left != RB_NIL ? Left : Right); + + // Special case if root + if (RB_NIL == Parent) + { + assert(Node == m_RBRoot); + m_RBRoot = Child; + if (RB_NIL != Child) + m_RBParent[Child] = RB_NIL; + return; + } + + // Typical case. + // Update parent->child link + if (m_RBLeft[Parent] == Node) + m_RBLeft[Parent] = Child; + else + { + assert(m_RBRight[Parent] == Node); + m_RBRight[Parent] = Child; + } + + // Update child->parent link + if (RB_NIL != Child) + m_RBParent[Child] = Parent; + +#if DEBUG + //Log("@@ After RBDelete(%u)\n", Node); + //ListMetric(); + ValidateRB(); +#endif + return; + } + + //Log("@@ RBDelete(%u) Tricky case\n", Node); + //ListMetric(); + +// Trickier case, node has two children. + assert(Left != RB_NIL && Right != RB_NIL); + +// We're going to splice out successor node from its +// current position and insert it in place of node +// to be deleted. + +// Successor cannot be nil because there is a right child. + unsigned Next = RBNext(Node); + assert(Next != RB_NIL); + +// The successor of a node with two children is +// guaranteed to have no more than one child. + unsigned NextLeft = m_RBLeft[Next]; + unsigned NextRight = m_RBRight[Next]; + assert(RB_NIL == NextLeft || RB_NIL == NextRight); + +// Successor of node with two children cannot be the root. + unsigned NextParent = m_RBParent[Next]; + assert(RB_NIL != NextParent); + +// Ugly special case if successor is right child + if (Next == Right) + { +#if DEBUG + //Log("@@ Before RBDelete(%u) (tricky next==right)\n", Node); + //ListMetric(); +#endif + m_RBParent[Next] = Parent; + + if (RB_NIL == Parent) + { + m_RBRoot = Next; + m_RBParent[Next] = RB_NIL; + } + else + { + if (m_RBLeft[Parent] == Node) + m_RBLeft[Parent] = Next; + else + { + assert(m_RBRight[Parent] == Node); + m_RBRight[Parent] = Next; + } + } + + m_RBLeft[Next] = Left; + + if (RB_NIL != Left) + m_RBParent[Left] = Next; + +#if DEBUG + //Log("@@ After RBDelete(%u) (tricky next==right)\n", Node); + //ListMetric(); + ValidateRB(); +#endif + return; + } + +// Set NextChild either to the one child of successor, or nil. + unsigned NextChild = (NextLeft != RB_NIL ? NextLeft : NextRight); + +// Splice successor from its current position + if (m_RBLeft[NextParent] == Next) + m_RBLeft[NextParent] = NextChild; + else + { + assert(m_RBRight[NextParent] == Next); + m_RBRight[NextParent] = NextChild; + } + + if (RB_NIL != NextChild) + m_RBParent[NextChild] = NextParent; + +// Insert successor into position currently held by node +// to be deleted. + if (RB_NIL == Parent) + { + m_RBRoot = Next; + m_RBParent[Next] = RB_NIL; + } + else + { + if (m_RBLeft[Parent] == Node) + m_RBLeft[Parent] = Next; + else + { + assert(m_RBRight[Parent] == Node); + m_RBRight[Parent] = Next; + } + } + + m_RBLeft[Next] = Left; + m_RBRight[Next] = Right; + m_RBParent[Next] = Parent; + + m_RBParent[Left] = Next; + m_RBParent[Right] = Next; + +#if DEBUG + //Log("@@ After RBDelete(%u)\n", Node); + //ListMetric(); + ValidateRB(); +#endif + } + +unsigned Clust::RBInsert(unsigned i, unsigned j, float fMetric) + { +#if DEBUG + ValidateRB(); +#endif + + unsigned NewNode = VectorIndex(i, j); + m_RBMetric[NewNode] = fMetric; + m_RBi[NewNode] = i; + m_RBj[NewNode] = j; + +// New node is always inserted as a leaf. +// Proof that this is possible is found in algorithm +// textbooks (I forget the argument). + m_RBLeft[NewNode] = RB_NIL; + m_RBRight[NewNode] = RB_NIL; + + unsigned NewParent = RB_NIL; + unsigned Node = m_RBRoot; + + unsigned uCount = 0; + while (RB_NIL != Node) + { + NewParent = Node; + if (fMetric < m_RBMetric[Node]) + Node = m_RBLeft[Node]; + else + Node = m_RBRight[Node]; + ++uCount; + if (uCount > m_uRBNodeCount) + Quit("Infinite loop in RBInsert"); + } + + m_RBParent[NewNode] = NewParent; + if (RB_NIL == NewParent) + m_RBRoot = NewNode; + else + { + if (fMetric < m_RBMetric[NewParent]) + m_RBLeft[NewParent] = NewNode; + else + m_RBRight[NewParent] = NewNode; + } + +#if DEBUG + { + unsigned Next = RBNext(NewNode); + if (Next != RB_NIL) + assert(NewNode == RBPrev(Next)); + unsigned Prev = RBPrev(NewNode); + if (Prev != RB_NIL) + assert(NewNode == RBNext(Prev)); + ValidateRB(); + } +#endif + return NewNode; + } + +void Clust::ValidateRBNode(unsigned Node, const char szMsg[]) const + { + if (RB_NIL == Node) + return; + + unsigned Parent = m_RBParent[Node]; + unsigned Left = m_RBLeft[Node]; + unsigned Right = m_RBRight[Node]; + + unsigned Next = RBNext(Node); + unsigned Prev = RBPrev(Node); + + if (RB_NIL != Next && RBPrev(Next) != Node) + { + ListMetric(); + Quit("ValidateRB(%s) Node=%u Next=%u Prev(Next)=%u", + szMsg, Node, Next, RBPrev(Next)); + } + + if (RB_NIL != Prev && RBNext(Prev) != Node) + { + ListMetric(); + Quit("ValidateRB(%s) Node=%u Prev=%u Next(Prev)=%u", + szMsg, Node, Prev, RBNext(Prev)); + } + + if (RB_NIL != Parent) + { + if (m_RBLeft[Parent] != Node && m_RBRight[Parent] != Node) + { + ListMetric(); + Quit("ValidateRB(%s): Parent %u not linked to child %u\n", + szMsg, Parent, Node); + } + } + + if (RB_NIL != Left) + { + if (m_RBParent[Left] != Node) + { + ListMetric(); + Quit("ValidateRB(%s): Left child %u not linked to parent %u\n", + szMsg, Left, Node); + } + } + + if (RB_NIL != Right) + { + if (m_RBParent[Right] != Node) + { + ListMetric(); + Quit("ValidateRB(%s): Right child %u not linked to parent %u\n", + szMsg, Right, Node); + } + } + + ValidateRBNode(Left, szMsg); + ValidateRBNode(Right, szMsg); + } + +void Clust::ValidateRB(const char szMsg[]) const + { + if (RB_NIL == m_RBRoot) + return; + + ValidateRBNode(m_RBRoot, szMsg); + + unsigned Node = RBMin(m_RBRoot); + for (;;) + { + unsigned Next = RBNext(Node); + if (RB_NIL == Next) + break; + if (m_RBMetric[Node] > m_RBMetric[Next]) + { + ListMetric(); + Quit("ValidateRBNode(%s): metric out of order %u=%g %u=%g", + szMsg, Node, m_RBMetric[Node], Next, m_RBMetric[Next]); + } + Node = Next; + } + } diff --git a/binaries/src/muscle/releases.txt b/binaries/src/muscle/releases.txt new file mode 100644 index 0000000..cd6e5cd --- /dev/null +++ b/binaries/src/muscle/releases.txt @@ -0,0 +1,13 @@ +ver=2.01 rev=1 +ver=2.10 rev=3 +ver=3.00 rev=5 +ver=3.20 rev=7 +ver=3.30 rev=9 +ver=3.41 rev=11 +ver=3.40 rev=12 +ver=3.51 rev=14 +ver=3.52 rev=16 +ver=3.50 rev=17 +ver=3.60 rev=19 +ver=3.70 rev=21 +ver=3.80 rev=22 diff --git a/binaries/src/muscle/scoremx.cpp b/binaries/src/muscle/scoremx.cpp new file mode 100644 index 0000000..ca36c4d --- /dev/null +++ b/binaries/src/muscle/scoremx.cpp @@ -0,0 +1,45 @@ +#include "muscle.h" +#include "profile.h" + +extern SCOREMATRIX VTML_LA; +extern SCOREMATRIX PAM200; +extern SCOREMATRIX PAM200NoCenter; +extern SCOREMATRIX VTML_SP; +extern SCOREMATRIX VTML_SPNoCenter; +extern SCOREMATRIX NUC_SP; + +PTR_SCOREMATRIX g_ptrScoreMatrix; + +void SetScoreMatrix() + { + switch (g_PPScore) + { + case PPSCORE_LE: + g_ptrScoreMatrix = &VTML_LA; + break; + + case PPSCORE_SP: + if (g_bPrecompiledCenter) + g_ptrScoreMatrix = &PAM200; + else + g_ptrScoreMatrix = &PAM200NoCenter; + break; + + case PPSCORE_SV: + if (g_bPrecompiledCenter) + g_ptrScoreMatrix = &VTML_SP; + else + g_ptrScoreMatrix = &VTML_SPNoCenter; + break; + + case PPSCORE_SPN: + if (g_bPrecompiledCenter) + g_ptrScoreMatrix = &NUC_SP; + else + Quit("SPN requires precompiled center"); + break; + + default: + Quit("Invalid g_PPScore"); + } + } diff --git a/binaries/src/muscle/spfast.cpp b/binaries/src/muscle/spfast.cpp index 62c0f1b..51e0dba 100644 --- a/binaries/src/muscle/spfast.cpp +++ b/binaries/src/muscle/spfast.cpp @@ -11,7 +11,7 @@ enum GG = 3, }; -static char *GapTypeToStr(int GapType) +static const char *GapTypeToStr(int GapType) { switch (GapType) { diff --git a/binaries/src/muscle/subfam.cpp b/binaries/src/muscle/subfam.cpp index 23f9473..e720cfa 100644 --- a/binaries/src/muscle/subfam.cpp +++ b/binaries/src/muscle/subfam.cpp @@ -170,7 +170,7 @@ void AlignSubFam(SeqVect &vAll, const Tree &GuideTree, unsigned uNodeIndex, char CmdLine[4096]; sprintf(CmdLine, "probcons %s > %s 2> /dev/null", InTmp, OutTmp); // sprintf(CmdLine, "muscle -in %s -out %s -maxiters 1", InTmp, OutTmp); - system(CmdLine); + int NotUsed = system(CmdLine); TextFile fOut(OutTmp); msaOut.FromFile(fOut); diff --git a/binaries/src/muscle/svnmods.h b/binaries/src/muscle/svnmods.h new file mode 100644 index 0000000..548d4e1 --- /dev/null +++ b/binaries/src/muscle/svnmods.h @@ -0,0 +1 @@ +"export" diff --git a/binaries/src/muscle/svnversion.h b/binaries/src/muscle/svnversion.h new file mode 100644 index 0000000..0caeafe --- /dev/null +++ b/binaries/src/muscle/svnversion.h @@ -0,0 +1 @@ +"31" diff --git a/binaries/src/muscle/textfile.cpp b/binaries/src/muscle/textfile.cpp index f58138b..dbe68e2 100644 --- a/binaries/src/muscle/textfile.cpp +++ b/binaries/src/muscle/textfile.cpp @@ -57,7 +57,9 @@ bool TextFile::GetLine(char szLine[], unsigned uBytes) if (0 == uBytes) Quit("TextFile::GetLine, buffer zero size"); - memset(szLine, 0, uBytes); + + int FillVal = 0; // suppress warning from gcc that I don't understand + memset(szLine, FillVal, (size_t) uBytes); unsigned uBytesCopied = 0; @@ -83,6 +85,8 @@ bool TextFile::GetLine(char szLine[], unsigned uBytes) // As GetLine, but trim leading and trailing blanks; skip empty lines bool TextFile::GetTrimLine(char szLine[], unsigned uBytes) { + if (uBytes == 0) + Quit("GetTrimLine"); for (;;) { bool bEOF = GetLine(szLine, uBytes); @@ -132,6 +136,8 @@ void TextFile::PutFormat(const char szFormat[], ...) void TextFile::GetLineX(char szLine[], unsigned uBytes) { + if (uBytes == 0) + Quit("GetLineX"); bool bEof = GetLine(szLine, uBytes); if (bEof) Quit("end-of-file in GetLineX"); diff --git a/binaries/src/muscle/typetostr.cpp b/binaries/src/muscle/typetostr.cpp index ca49361..2c0afdd 100644 --- a/binaries/src/muscle/typetostr.cpp +++ b/binaries/src/muscle/typetostr.cpp @@ -10,7 +10,7 @@ const char *SecsToStr(unsigned long Secs) mm = (Secs/60)%60; ss = Secs%60; - sprintf(Str, "%02d:%02d:%02d", hh, mm, ss); + sprintf(Str, "%02ld:%02ld:%02ld", hh, mm, ss); return Str; } diff --git a/binaries/src/muscle/usage.cpp b/binaries/src/muscle/usage.cpp index 79b9d18..39a49f6 100644 --- a/binaries/src/muscle/usage.cpp +++ b/binaries/src/muscle/usage.cpp @@ -7,7 +7,7 @@ void Credits() if (Displayed) return; - fprintf(stderr, "\n" MUSCLE_LONG_VERSION "\n\n"); + fprintf(stderr, "\n%s\n\n", MUSCLE_LONG_VERSION); fprintf(stderr, "http://www.drive5.com/muscle\n"); fprintf(stderr, "This software is donated to the public domain.\n"); fprintf(stderr, "Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.\n\n"); @@ -30,15 +30,12 @@ void Usage() " -diags Find diagonals (faster for similar sequences)\n" " -maxiters Maximum number of iterations (integer, default 16)\n" " -maxhours Maximum time to iterate in hours (default no limit)\n" -" -maxmb Maximum memory to allocate in Mb (default 80%% of RAM)\n" " -html Write output in HTML format (default FASTA)\n" " -msf Write output in GCG MSF format (default FASTA)\n" " -clw Write output in CLUSTALW format (default FASTA)\n" " -clwstrict As -clw, with 'CLUSTAL W (1.81)' header\n" " -log[a] Log to file (append if -loga, overwrite if -log)\n" " -quiet Do not write progress messages to stderr\n" -" -stable Output sequences in input order (default is -group)\n" -" -group Group sequences by similarity (this is the default)\n" " -version Display version information and exit\n" "\n" "Without refinement (very fast, avg accuracy similar to T-Coffee): -maxiters 2\n" diff --git a/binaries/src/muscle/version.txt b/binaries/src/muscle/version.txt new file mode 100644 index 0000000..bd28b9c --- /dev/null +++ b/binaries/src/muscle/version.txt @@ -0,0 +1 @@ +3.9 diff --git a/binaries/windows/muscle.exe b/binaries/windows/muscle.exe index 1ebe7f4..93bae22 100644 Binary files a/binaries/windows/muscle.exe and b/binaries/windows/muscle.exe differ diff --git a/conf/settings/MuscleParameters.xml b/conf/settings/MuscleParameters.xml index 89ad920..08ff452 100644 --- a/conf/settings/MuscleParameters.xml +++ b/conf/settings/MuscleParameters.xml @@ -1,6 +1,8 @@ compbio.runner.muscle.Muscle + Anchor optimisation @@ -17,6 +20,15 @@ prog_docs/muscle.html -anchors + Root alignment computation method Use Steven Brenner's method for computing the root alignment. @@ -69,7 +81,8 @@ auto auto protein - nucleo + dna + rna Maxiters @@ -83,6 +96,68 @@ 100 + + + Diagonal break + Maximum distance between two diagonals that allows them to merge into one diagonal + -diagbreak + prog_docs/muscle.html + 1 + + Integer + 1 + 100 + + + + Diagonal length + Minimum length of diagonal + -diaglength + prog_docs/muscle.html + 24 + + Integer + 2 + 100 + + + + Diagonal margin + Discard this many positions at ends of diagonal + -diagmargin + prog_docs/muscle.html + 5 + + Integer + 1 + 100 + + + + Anchor spacing + Minimum spacing between anchor columns + -anchorspacing + prog_docs/muscle.html + 32 + + Integer + 2 + 1000 + + Matrix Substitution Matrix to use @@ -222,6 +297,30 @@ 10 + + Minimum anchor score + Minimum score a column must have to be an anchor (default depends on the profile scoring function!) + -minbestcolscore + prog_docs/muscle.html + 1.2 + + Float + 0 + 10 + + + + Minimum smoothed anchor score + Minimum smoothed score a column must have to be an anchor (default depends on the profile scoring function!) + -minsmoothscore + prog_docs/muscle.html + 1.2 + + Float + 0 + 10 + + cluster1 Clustering method to use on the iteration 1 diff --git a/conf/settings/MusclePresets.xml b/conf/settings/MusclePresets.xml index 190adad..6b63ac6 100644 --- a/conf/settings/MusclePresets.xml +++ b/conf/settings/MusclePresets.xml @@ -20,12 +20,13 @@ - Huge alignments (speed-oriented) - Very large number of sequences (several thousand), or they are very long, + Large alignments (balanced) + A large number of sequences (a few thousand), or very long sequences, + then the default settings of may be too slow for practical use. + A good compromise between speed and accuracy is to run just the first two iterations of + the algorithm - - - + diff --git a/runner/compbio/runner/msa/Muscle.java b/runner/compbio/runner/msa/Muscle.java index f3ac1c7..1d3b827 100644 --- a/runner/compbio/runner/msa/Muscle.java +++ b/runner/compbio/runner/msa/Muscle.java @@ -60,7 +60,8 @@ public class Muscle extends SkeletalExecutable { * completes. So –quiet and –verbose are not contradictory."-quiet", * "-verbose" */ - addParameters(Arrays.asList("-clwstrict", "-quiet", "-verbose")); + addParameters(Arrays.asList("-clwstrict", "-quiet", "-verbose", + "-nocore")); cbuilder.setParam("-log", EXEC_STAT_FILE); } diff --git a/testsrc/testdata/MuscleLimits.xml b/testsrc/testdata/MuscleLimits.xml index 6c2a78a..2aceb46 100644 --- a/testsrc/testdata/MuscleLimits.xml +++ b/testsrc/testdata/MuscleLimits.xml @@ -2,12 +2,32 @@ compbio.runner.muscle.Muscle - 300 - 400 + 100000 + 100000 # LocalEngineExecutionLimit # - 20 - 500 + 100000 + 100000 + + + + diff --git a/testsrc/testdata/MuscleParameters.xml b/testsrc/testdata/MuscleParameters.xml index 952887f..c27d86a 100644 --- a/testsrc/testdata/MuscleParameters.xml +++ b/testsrc/testdata/MuscleParameters.xml @@ -5,47 +5,58 @@ Group sequences Group sequences by similarity (this is the default) or preserve the input order -group - -stable - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html - -stable + prog_docs/muscle.html + -group + Anchor optimisation Enable/disable anchor optimization in tree dependent refinement iterations -anchors -noanchors - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html -anchors + Root alignment computation method Use Steven Brenner's method for computing the root alignment. -brenner - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html - + --> dimer Use dimer approximation for the SP score (faster, slightly less accurate) -dimer - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html Diagonal Use diagonal optimizations. Faster, especially for closely related sequences, but may be less accurate. -diags + prog_docs/muscle.html Diagonal 1 Use diagonal optimizations in first iteration (faster for similar sequences) -diags1 + prog_docs/muscle.html Profile scoring method @@ -55,7 +66,7 @@ -le -sp -sv - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html -le @@ -63,17 +74,18 @@ Sequence type Sequence type - Amino acid/Nucleotide -seqtype - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html auto auto protein - nucleo + dna + rna Maxiters Maximum number of iterations (integer, default 16) -maxiters - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html 16 Integer @@ -81,23 +93,154 @@ 100 - + + Diagonal break + Maximum distance between two diagonals that allows them to merge into one diagonal + -diagbreak + prog_docs/muscle.html + 1 + + Integer + 1 + 100 + + + + Diagonal length + Minimum length of diagonal + -diaglength + prog_docs/muscle.html + 24 + + Integer + 2 + 100 + + + + Diagonal margin + Discard this many positions at ends of diagonal + -diagmargin + prog_docs/muscle.html + 5 + + Integer + 1 + 100 + + + + Anchor spacing + Minimum spacing between anchor columns + -anchorspacing + prog_docs/muscle.html + 32 + + Integer + 2 + 1000 + + + +--> Gap open penalty Gap opening penalty. Must be negative -gapopen - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html -12.0 Float @@ -109,7 +252,7 @@ Gap extension penalty Gap extension penalty. Must be negative -gapextend - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html -1.0 Float @@ -121,7 +264,7 @@ Center Center parameter. Should be negative. -center - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html 0.0 Float @@ -133,7 +276,7 @@ Hydro Window size for determining whether a region is hydrophobic. -hydro - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html 5 Integer @@ -145,7 +288,31 @@ Hydrofactor Multiplier for gap open/close penalties in hydrophobic regions. -hydrofactor - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html + 1.2 + + Float + 0 + 10 + + + + Minimum anchor score + Minimum score a column must have to be an anchor (default depends on the profile scoring function!) + -minbestcolscore + prog_docs/muscle.html + 1.2 + + Float + 0 + 10 + + + + Minimum smoothed anchor score + Minimum smoothed score a column must have to be an anchor (default depends on the profile scoring function!) + -minsmoothscore + prog_docs/muscle.html 1.2 Float @@ -157,7 +324,7 @@ cluster1 Clustering method to use on the iteration 1 -cluster1 - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html upgma upgma @@ -165,7 +332,7 @@ cluster2 Clustering method to use on the iteration 2 and all subsequent itarations -cluster2 - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html upgmb upgmb neighborjoining @@ -179,7 +346,7 @@ clustalw=CLUSTALW method. threeway=Gotoh three-way method -weight1 - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html clustalw none henikoff @@ -198,7 +365,7 @@ clustalw=CLUSTALW method. threeway=Gotoh three-way method -weight2 - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html clustalw none henikoff @@ -211,7 +378,7 @@ Distance1 Distance measure for iteration 1. Defaults Kmer6_6 (for amino ) or Kmer4_6 (for nucleo) -distance1 - http://www.compbio.dundee.ac.uk/users/pvtroshin/ws/Index.html + prog_docs/muscle.html kmer6_6 kmer6_6 kmer20_3 diff --git a/testsrc/testdata/MusclePresets.xml b/testsrc/testdata/MusclePresets.xml index 190adad..6b63ac6 100644 --- a/testsrc/testdata/MusclePresets.xml +++ b/testsrc/testdata/MusclePresets.xml @@ -20,12 +20,13 @@ - Huge alignments (speed-oriented) - Very large number of sequences (several thousand), or they are very long, + Large alignments (balanced) + A large number of sequences (a few thousand), or very long sequences, + then the default settings of may be too slow for practical use. + A good compromise between speed and accuracy is to run just the first two iterations of + the algorithm - - - + diff --git a/website/prog_docs/muscle.html b/website/prog_docs/muscle.html index 36055f8..be5e96a 100644 --- a/website/prog_docs/muscle.html +++ b/website/prog_docs/muscle.html @@ -1,230 +1,101 @@ - - - - - - - - + + + MUSCLE User Guide - - - - + -
+ -

 

+
-

 

+

 

-

 

+

 

-

 

+

 

-

MUSCLE User Guide

+

 

-

                                                                                                                                                                            

+

MUSCLE +User Guide

-

 

+

                                                                                                                                                                            

-

 

+

 

-

 

+

 

-

 

+

 

-

 

+

 

-

 

+

 

-

Multiple sequence comparison -by log-expectation

+

 

-

by Robert C. Edgar

+

Multiple +sequence comparison by log-expectation

-

 

+

by Robert C. +Edgar

-

Version 3.6

+

 

-

September 2005

+

Version 3.8

-

 

+

May 2010

-

 

+

 

-

http://www.drive5.com/muscle

+

 

-

email: muscle (at) drive5.com

+

http://www.drive5.com/muscle

-

 

+

 

-

MUSCLE is updated regularly. -Send me an e-mail if you would like to be notified of new releases.

+

robert@drive5.com

-

 

+

 

-

 

+

 

-

Citation:

+

Citation:

-

 

+

 

-

Edgar, +

Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high accuracy and -high throughput, Nucleic Acids Research 32(5), 1792-97.

- -

 

- -

For a complete -description of the algorithm, see also:

- -

 

- -

Edgar, Robert C (2004), MUSCLE: a multiple sequence alignment method -with reduced time and space complexity. BMC Bioinformatics, 5(1):113.
-
Table of Contents

- -

1 -Introduction. 3

- -

2 -Quick Start 3

- -

2.1 -Installation. 3

- -

2.2 -Making an alignment 3

- -

2.3 -Large alignments. 3

- -

2.4 -Faster speed. 4

- -

2.5 -Huge alignments. 4

- -

2.6 -Accuracy: caveat emptor 4

- -

2.7 -Pipelining. 4

- -

2.8 -Refining an existing alignment 4

- -

2.9 -Using a pre-computed guide tree. 4

- -

2.10 -Profile-profile alignment 5

- -

2.11 -Adding sequences to an existing alignment 5

- -

2.12 -Sequence clustering. 5

- -

2.13 -Specifying a substitution matrix. 6

- -

2.14 -Refining a long alignment 6

- -

3 -File Formats. 6

- -

3.1 -Input files. 6

- -

3.1.1 -Amino acid sequences. 6

- -

3.1.2 -Nucleotide sequences. 6

- -

3.1.3 -Determining sequence type. 7

- -

3.2 -Output files. 7

- -

3.2.1 -Sequence grouping. 7

- -

3.3 -CLUSTALW format 7

- -

3.4 -MSF format 7

- -

3.5 -HTML format 8

- -

3.6 -Phylip format 8

- -

4 -Using MUSCLE. 8

- -

4.1 -How the algorithm works. 8

- -

4.2 -Command-line options. 9

- -

4.3 -The maxiters option. 9

- -

4.4 -The maxtrees option. 10

- -

4.5 -The maxhours option. 10

- -

4.6 -The maxmb option. 10

- -

4.7 -The profile scoring function. 10

- -

4.8 -Diagonal optimization. 10

- -

4.9 -Anchor optimization. 11

- -

4.10 -Log file. 11

- -

4.11 -Progress messages. 11

- -

4.12 -Running out of memory. 12

- -

4.13 -Troubleshooting. 12

- -

4.14 -Technical support 12

- -

5 -Command Line Reference. 13

- -

 

- -
+high throughput, Nucleic Acids Research 32(5), 1792-97.

+ +

 

+ +

For a +complete description of the algorithm, see also:

+ +

 

+ +

Edgar, Robert C (2004), MUSCLE: +a multiple sequence alignment method with reduced time and space complexity. BMC +Bioinformatics, 5(1):113.
+
Table +of Contents

+ +

1 Introduction. 3

+ +

2 Quick +Start 3

+ +

2.1 +Installation. 3

+ +

2.2 Making +an alignment 3

+ +

2.3 Large +alignments. 3

+ +

2.4 Faster +speed. 4

+ +

2.5 Huge +alignments. 4

+ +

2.6 +Pipelining. 4

+ +

2.7 Refining +an existing alignment 4

+ +

2.8 Using a +pre-computed guide tree. 4

+ +

2.9 +Profile-profile alignment 5

+ +

2.10 Adding +sequences to an existing alignment 5

+ +

2.11 +Specifying a protein substitution matrix. 5

+ +

2.12 +Specifying a nucleotide substitution matrix. 5

+ +

2.13 +Refining a long alignment 6

+ +

3 File +Formats. 6

+ +

3.1 Input +files. 6

+ +

3.1.1 Amino +acid sequences. 6

+ +

3.1.2 +Nucleotide sequences. 6

+ +

3.1.3 +Determining sequence type. 6

+ +

3.2 Output +files. 7

+ +

3.2.1 +Sequence grouping. 7

+ +

3.2.2 Output +to multiple file formats. 7

+ +

3.3 CLUSTALW +format 7

+ +

3.4 MSF +format 7

+ +

3.5 HTML +format 8

+ +

3.6 Phylip +format 8

+ +

4 Using +MUSCLE.. 8

+ +

4.1 How the +algorithm works. 8

+ +

4.2 +Command-line options. 9

+ +

4.3 The +maxiters option. 9

+ +

4.4 The +maxtrees option. 10

+ +

4.5 The +maxhours option. 10

+ +

4.6 The +profile scoring function. 10

+ +

4.7 Diagonal +optimization. 10

+ +

4.8 Anchor +optimization. 10

+ +

4.9 Log file. 11

+ +

4.10 +Progress messages. 11

+ +

4.11 Running +out of memory. 11

+ +

4.12 +Troubleshooting. 12

+ +

4.13 +Technical support 12

+ +

5 Command +Line Reference. 12

+ +

 

+ +
-

1 Introduction

+

1 Introduction

-

MUSCLE is a program for creating multiple alignments of +

MUSCLE is a program for creating multiple alignments of amino acid or nucleotide sequences. A range of options is provided that give you the choice of optimizing accuracy, speed, or some compromise between the -two. Default parameters are those that give the best average accuracy in our -tests. Using versions current at the time of writing, my tests show that MUSCLE -can achieve both better average accuracy and better speed than CLUSTALW or T‑Coffee, -depending on the chosen options. Many command line options are provided to vary -the internals of the algorithm; some of these will primarily be of interest to -algorithm developers who wish to better understand which features of the algorithm -are important in different circumstances.

- -

2 Quick Start

- -

The MUSCLE algorithm is delivered as a command-line program -called muscle. If you are running under Linux or Unix -you will be working at a shell prompt. If you are running under Windows, you should -be in a command window (nostalgically known to us older people as a DOS prompt). -If you don't know how to use command-line programs, you should get help from a -local guru.

- -

2.1 Installation

- -

Copy the muscle binary file to a directory that is +two. Default parameters are those that gave the best average benchmark accuracy +in my tests. However, benchmark accuracy is a rather dubious measure; see:

+ +

 

+ +

Edgar, +R.C. (2010) Quality measures for protein alignment benchmarks, Nucleic Acids +Res., 2010, 1–9.

+ +

 

+ +

WARNING

+ +

THE -stable OPTION +HAD A SERIOUS BUG IN VERSIONS OF MUSCLE PRIOR TO v3.8 AND IS CURRENTLY NOT +SUPPORTED. Go to this link for latest news and work-arounds:

+ +

 

+ +

http://drive5.com/muscle/stable.html

+ +

 

+ +

2 Quick Start

+ +

The MUSCLE algorithm is delivered as a command-line program +called muscle. If you are running under Linux or Unix you will be +working at a shell prompt. If you are running under Windows, you should be in a +command window (nostalgically known to us older people as a DOS prompt). If you +don't know how to use command-line programs, you should get help from a local +guru.

+ +

2.1 Installation

+ +

Copy the muscle binary file to a directory that is accessible from your computer. That's it—there are no configuration files, libraries, environment variables or other settings to worry about. If you are -using Windows, then the binary file is named muscle.exe. From now on muscle -should be understood to mean "muscle if you are using Linux or Unix, muscle.exe if you are using Windows".

+using Windows, then the binary file is named something like muscle3.8.31_i86win32.exe. +From now on muscle should be understood to mean "the file or path +name of your executable file".

-

2.2 Making +

2.2 Making an alignment

-

Make a FASTA file containing some sequences. (If you are not +

Make a FASTA file containing some sequences. (If you are not familiar with FASTA format, it is described in detail later in this Guide.) For now, just to make things fast, limit the number of sequence in the file to no more than 50 and the sequence length to be no more than 500. Call the input -file seqs.fa. (An example file named seqs.fa is distributed with the standard MUSCLE -package). Make sure the directory containing the muscle binary is in -your path. (If it isn't, you can run it by typing the full path name, and the -following example command lines must be changed accordingly). Now type:

- -

 

- -

muscle -in seqs.fa -out seqs.afa

- -

 

- -

You should see some progress messages. If muscle completes -successfully, it will create a file seqs.afa -containing the alignment. By default, output is created in "aligned -FASTA" format (hence the .afa extension). -This is just like regular FASTA except that gaps are added in order to align -the sequences. This is a nice format for computers but not very readable for -people, so to look at the alignment you will want an alignment viewer such as Belvu, or a script that converts FASTA to a more readable format. -You can also use the –clw command-line option -to request output in CLUSTALW format, which is easier to understand for people. -If muscle gives an error message and you don't know how to fix it, -please read the Troubleshooting section.

- -

 

- -

The default settings are designed to give the best accuracy, +file seqs.fa. Make sure the directory containing the muscle +binary is in your path. (If it isn't, you can run it by typing the full path +name, and the following example command lines must be changed accordingly). Now +type:

+ +

 

+ +

muscle -in seqs.fa -out seqs.afa

+ +

 

+ +

You should see some progress messages. If muscle +completes successfully, it will create a file seqs.afa containing the +alignment. By default, output is created in "aligned FASTA" format (hence +the .afa extension). This is just like regular FASTA except that gaps +are added in order to align the sequences. This is a nice format for computers +but not very readable for people, so to look at the alignment you will want an +alignment viewer such as Belvu, or a script that converts FASTA to a more readable +format. You can also use the –clw command-line option to request output +in CLUSTALW format, which is easier to understand for people. If muscle +gives an error message and you don't know how to fix it, please read the +Troubleshooting section.

+ +

 

+ +

The default settings are designed to give the best accuracy, so this may be all you need to know.

-

2.3 Large +

2.3 Large alignments

-

If you have a large number of sequences (a few thousand), or +

If you have a large number of sequences (a few thousand), or they are very long, then the default settings of may be too slow for practical use. A good compromise between speed and accuracy is to run just the first two -iterations of the algorithm. On average, this gives accuracy comparable to -T-Coffee and speeds much faster than CLUSTALW. This is done by the option –maxiters -2, as in the following example.

+iterations of the algorithm. This is done by the option –maxiters 2, as +in the following example.

-

 

+

 

-

muscle -in seqs.fa -out seqs.afa -maxiters 2

+

muscle -in seqs.fa -out seqs.afa -maxiters 2

-

2.4 Faster -speed

+

2.4 Faster speed

-

The –diags option enables -an optimization for speed by finding common words (6-mers in a compressed amino -acid alphabet) between the two sequences as seeds for diagonals. This is -related to optimizations in programs such as BLAST and FASTA: you get faster speed, -but sometimes lower average accuracy. For large numbers of closely related -sequences, this option works very well.

+

The –diags option enables an optimization for speed +by finding common words (6-mers in a compressed amino acid alphabet) between +the two sequences as seeds for diagonals. This is related to optimizations in +programs such as BLAST and FASTA: you get faster speed, but sometimes lower +average accuracy. For large numbers of closely related sequences, this option +works very well.

-

 

+

 

-

If you want the fastest possible speed, then the following +

If you want the fastest possible speed, then the following example shows the applicable options for proteins.

-

 

+

 

-

muscle -in seqs.fa -out seqs.afa -maxiters 1 -diags -sv +

muscle -in seqs.fa -out seqs.afa -maxiters 1 -diags -sv -distance1 kbit20_3

-

 

+

 

-

For nucleotides, use:

+

For nucleotides, use:

-

 

+

 

-

muscle -in seqs.fa -out seqs.afa -maxiters 1 -diags

+

muscle -in seqs.fa -out seqs.afa -maxiters 1 -diags

-

 

+

 

-

At the time of writing, muscle with these options is faster -than any other multiple sequence alignment program -that I have tested. The alignments are not bad, especially when the sequences +

The alignments are not bad, especially when the sequences are closely related. However, as you might expect, this blazing speed comes at the cost of the lowest average accuracy of the options that muscle provides.

-

2.5 Huge +

2.5 Huge alignments

-

If you have a very large number of sequences (several -thousand), or they are very long, then the kbit20_3 option may cause -problems because it needs a relatively large amount of memory. Better is to use -the default distance measure, which is roughly 2× or 3× slower but needs less -memory, like this:

+

If you have thousands of sequences, then attempting to +create a multiple alignment is dubious for many technical reasons. It may be +better to cluster first, then align the reduced set of sequences. Clustering +can be done using UCLUST, which is an algorithm implemented in the USEARCH +program:

-

 

+

 

-

muscle -in seqs.fa -out seqs.afa -maxiters 1 -diags1 -sv

+

        http://www.drive5.com/usearch

-

2.6 Accuracy: -caveat emptor

+

 

-

Why do I keep using the clumsy phrase "average -accuracy" instead of just saying "accuracy"? That's because the -quality of alignments produced by MUSCLE varies, as do those produced other programs -such as CLUSTALW and T-Coffee. The state of the art leaves plenty of room for -improvement. Sometimes the fastest speed options to muscle give -alignments that are better than T-Coffee, though the reverse will more often be -the case. With challenging sets of sequences, it is a good idea to make several -different alignments using different muscle options and to try other programs -too. Regions where different alignments agree are more believable than regions -where they disagree.

+

At the time of writing (May 2010), I'm working on methods +for leveraging a combination of UCLUST and MUSCLE to create very large but +still reasonably accurate alignments. If you're interested, email me and let's discuss.

-

2.7 Pipelining

+

2.6 Pipelining

-

Input can be taken from standard input, and output can be +

Input can be taken from standard input, and output can be written to standard output. This is the default, so our first example would also work like this:

-

 

+

 

-

muscle < seqs.fa > seqs.afa

+

muscle < seqs.fa > seqs.afa

-

2.8 Refining +

2.7 Refining an existing alignment

-

You can ask muscle to try to improve an existing +

You can ask muscle to try to improve an existing alignment by using the –refine option. The input file must then be a FASTA file containing an alignment. All sequences must be of equal length, gaps -can be specified using dots "." or dashes "–". For example:

+can be specified using dots "." or dashes "–". For example:

-

 

+

 

-

muscle -in seqs.afa -out refined.afa -refine

+

muscle -in seqs.afa -out refined.afa -refine

-

2.9 Using -a pre-computed guide tree

+

2.8 Using a +pre-computed guide tree

-

The –usetree option allows -you to provide your own guide tree. For example,

+

The –usetree option allows you to provide your own +guide tree. For example,

-

 

+

 

-

muscle -in seqs.fa -out seqs.afa -usetree mytree.phy

+

muscle -in seqs.fa -out seqs.afa -usetree mytree.phy

-

 

+

 

-

The tree must by in Newick format, -as used by the Phylip package (hence the .phy extension). The Newick -format is described here:

+

The tree must by in Newick format, as used by the Phylip +package (hence the .phy extension). The Newick format is described here:

-

 

+

 

-

        http://evolution.genetics.washington.edu/phylip/newicktree.html

+

        http://evolution.genetics.washington.edu/phylip/newicktree.html

-

 

+

 

-

WARNING. Do not use this -option just because you believe that you have an accurate evolutionary tree for -your sequences. The best guide tree for multiple alignment -is not in general the correct evolutionary tree. This can be understood -by the following argument. Alignment accuracy decreases with lower sequence -identity. It follows that given a set of profiles, the -two that can be aligned most accurately will tend to be the pair with the -highest identity, i.e. at the shortest evolutionary distance. This is exactly -the pair selected by the nearest-neighbor criterion which MUSCLE uses by -default. When mutation rates are variable, the evolutionary neighbor may -not be the nearest neighbor. This explains why a nearest-neighbor tree -may be superior to the true evolutionary tree for guiding a progressive alignment.

+

WARNING. Do not use this option just because you +believe that you have an accurate evolutionary tree for your sequences. The +best guide tree for multiple alignment is not in general the correct +evolutionary tree. This can be understood by the following argument. Alignment +accuracy decreases with lower sequence identity. It follows that given a set of +profiles, the two that can be aligned most accurately will tend to be the pair +with the highest identity, i.e. at the shortest evolutionary distance. This is +exactly the pair selected by the nearest-neighbor criterion which MUSCLE uses +by default. When mutation rates are variable, the evolutionary neighbor +may not be the nearest neighbor. This explains why a nearest-neighbor +tree may be superior to the true evolutionary tree for guiding a progressive +alignment.

-

 

+

 

-

You will get a warning if you use the –usetree -option. To disable the warning, use ­–usetree_nowarn -instead,

+

You will get a warning if you use the –usetree +option. To disable the warning, use ­–usetree_nowarn instead,

-

e.g.:

+

e.g.:

-

 

+

 

-

muscle -in seqs.fa -out seqs.afa -usetree_nowarn mytree.phy

+

muscle -in seqs.fa -out seqs.afa -usetree_nowarn mytree.phy

-

2.10 Profile-profile +

2.9 Profile-profile alignment

-

A fundamental step in the MUSCLE algorithm is aligning two +

A fundamental step in the MUSCLE algorithm is aligning two multiple sequence alignments. This operation is sometimes called -"profile-profile alignment". If you have two existing alignments of +"profile-profile alignment". If you have two existing alignments of related sequences you can use the –profile option of MUSCLE to align those two sequences. Typical usage is:

-

 

+

 

-

muscle -profile -in1 one.afa -in2 two.afa -out both.afa

+

muscle -profile -in1 one.afa -in2 two.afa -out both.afa

-

 

+

 

-

The alignments in one.afa -and two.afa, which must be in aligned FASTA -format, are aligned to each other, keeping input columns intact and inserting -columns of gaps where needed. Output is stored in both.afa.

+

The alignments in one.afa and two.afa, which +must be in aligned FASTA format, are aligned to each other, keeping input +columns intact and inserting columns of gaps where needed. Output is stored in both.afa.

-

 

+

 

-

MUSCLE does not compute a similarity measure or measure of +

MUSCLE does not compute a similarity measure or measure of statistical significance (such as an E-value), so this option is not useful for -discriminating homologs from unrelated sequences. For this task, I recommend Sadreyev & Grishin's COMPASS -program.

+discriminating homologs from unrelated sequences. For this task, I recommend +Sadreyev & Grishin's COMPASS program.

-

2.11 Adding +

2.10 Adding sequences to an existing alignment

-

To add a sequence to an existing alignment that you wish to +

To add a sequence to an existing alignment that you wish to keep intact, use profile-profile alignment with the new sequence as a profile. -For example, if you have an existing alignment existing_aln.afa -and want to add a new sequence in new_seq.fa, -use the following commands:

+For example, if you have an existing alignment existing_aln.afa and want +to add a new sequence in new_seq.fa, use the following commands:

-

 

+

 

-

muscle -profile -in1 existing_aln.afa -in2 new_seq.fa -out -combined.afa

+

muscle -profile -in1 existing_aln.afa -in2 new_seq.fa -out +combined.afa

-

 

+

 

-

If you have more than one new sequences, -you can align them first then add them, for example:

+

If you have more than one new sequences, you can align them +first then add them, for example:

-

 

+

 

-

muscle -in new_seqs.fa -out new_seqs.afa

+

muscle -in new_seqs.fa -out new_seqs.afa

-

muscle -profile -in1 existing_aln.afa -in2 new_seqs.fa -out +

muscle -profile -in1 existing_aln.afa -in2 new_seqs.afa -out combined.afas

-

2.12 Sequence -clustering

- -

The first stage in MUSCLE is a fast clustering algorithm. -This may be of use in other applications. Typical usage is:

+

2.11 Specifying +a protein substitution matrix

-

 

+

You can specify your own substitution matrix by using the -matrix +option. This reads a protein substitution matrix in NCBI or WU-BLAST format. +The alphabet is assumed to be amino acid, and sum-of-pairs scoring is used. The +­-gapopen, -gapextend and -center parameters should be +specified; normally you will specify a zero value for the center. Note that gap +penalties MUST be negative. The environment variable MUSCLE_MXPATH can be used +to specify a path where the matrices are stored. For example,

-

muscle -cluster -in seqs.fa -tree1 tree.phy -maxiters 1

+

 

-

 

+

muscle -in seqs.fa -out seqs.afa -matrix /data/matrix/blosum62

-

The sequences will be clustered, and a tree written to tree.phy. Options –weight1, –distance1, -–cluster1 and –root1 can be applied if desired. Note that by -default, UPGMA clustering is used. You can use

+

    -gapopen -12.0 -gapextend -1.0 -center 0.0

-

 –neighborjoining if you prefer, but note that this is -substantially slower than UPGMA for large numbers of sequences, and is also -slightly less accurate. See discussion of –usetree -above.

+

 

-

2.13 Specifying -a substitution matrix

+

Example matrices can be downloaded from the NCBI FTP site. +At the time of writing they are found here:

-

You can specify your own substitution matrix by using the -matrix -option. This reads a protein substitution matrix in NCBI or WU-BLAST format. -The alphabet is assumed to be amino acid, and sum-of-pairs scoring is used. The -­-gapopen, -gapextend -and -center parameters should be specified; normally you will specify a -zero value for the center. Note that gap penalties MUST be negative. The -environment variable MUSCLE_MXPATH can be used to specify a path where the -matrices are stored. For example,

+

 

-

 

+

        ftp://ftp.ncbi.nih.gov/blast/matrices/

-

muscle -in seqs.fa -out seqs.afa -matrix blosum62 -gapopen -12.0

+

2.12 Specifying +a nucleotide substitution matrix

-

    -gapextend -1.0 -center 0.0

+

MUSCLE isn't really designed to support a nucleotide matrix, +but you can hack it by pretending that AGCT are amino acids and making a 20x20 +matrix out of the original 4x4 matrix. You MUST specify the -seqtype protein +option to fool MUSCLE into believing that it is aligning amino acid sequences. For +more information, see:

-

 

+

 

-

You can hack a nucleotide matrix by pretending that AGCT are -amino acids and making a 20x20 matrix out of the original 4x4 matrix. Let me -know if this isn't clear, I can help you through it.

+

http://drive5.com/muscle/nucmx.html

-

2.14 Refining +

2.13 Refining a long alignment

-

A long alignment can be refined using the –refinew option, which is primarily designed for -refining whole-genome nucleotide alignments. Usage is:

+

A long alignment can be refined using the –refinew +option, which is primarily designed for refining whole-genome nucleotide +alignments. Usage is:

-

 

+

 

-

muscle -in input.afa -out output.afa

+

muscle -refinew -in input.afa -out output.afa

-

 

+

 

-

MUSCLE divides the input alignment into non-overlapping +

MUSCLE divides the input alignment into non-overlapping windows and re-aligns each window from scratch, i.e. all gap characters are -discarded. The –refinewindow option may be -used to change the window length, which is 200 columns -by default.

- -

3 File Formats

- -

MUSCLE uses FASTA format for both input and output. For -output only, it also offers CLUSTALW, MSF, HTML, Phylip -sequential and Phylip interleaved formats. See the -following command-line options: ‑clw, ‑clwstrict, –msf, -–html, –phys, –phyi,clwout, ‑clwstrictout, -–msfout, –htmlout, -–physout and –phyiout.

- -

 

- -

3.1 Input +discarded. The –refinewindow option may be used to change the window +length, which is 200 columns by default.

+ +

3 File +Formats

+ +

MUSCLE uses FASTA format for both input and output. For +output only, it also offers CLUSTALW, MSF, HTML, Phylip sequential and Phylip +interleaved formats. See the following command-line options: ‑clw, +‑clwstrict, –msf, –html, –phys, –phyi, +‑clwout, ‑clwstrictout, –msfout, –htmlout, +–physout and –phyiout.

+ +

 

+ +

3.1 Input files

-

Input files must be in FASTA format. These are plain text -files (word processing files such as Word documents are not understood!). Unix, Windows and DOS text files are supported (end-of-line -may be NL or CR NL). There is no explicit limit on the length of a sequence, -however if you are running a 32-bit version of muscle then the maximum -will be very roughly 10,000 letters due to maximum addressable size of tables -required in memory. Each sequence starts with an annotation line, which is -recognized by having a greater-than symbol ">" as its first -character. There is no limit on the length of an annotation line (this is new -as of version 3.5), and there is no requirement that the annotation be unique. The -sequence itself follows on one or more subsequent lines, and is terminated -either by the next annotation line or by the end of the file.

- -

3.1.1 Amino acid sequences

- -

The standard single-letter amino acid alphabet is used. Upper +

Input files must be in FASTA format. These are plain text +files (word processing files such as Word documents are not understood!). Unix, +Windows and DOS text files are supported (end-of-line may be NL or CR NL). There +is no explicit limit on the length of a sequence, however if you are running a +32-bit version of muscle then the maximum will be very roughly 10,000 letters +due to maximum addressable size of tables required in memory. Each sequence +starts with an annotation line, which is recognized by having a greater-than +symbol ">" as its first character. There is no limit on the length +of an annotation line (this is new as of version 3.5), and there is no +requirement that the annotation be unique. The sequence itself follows on one +or more subsequent lines, and is terminated either by the next annotation line +or by the end of the file.

+ +

3.1.1 Amino +acid sequences

+ +

The standard single-letter amino acid alphabet is used. Upper and lower case is allowed, the case is not significant. The special characters -X, B, Z and U are understood. X means "unknown amino acid", B is D or -N, Z is E or Q. U is understood to be the 21st amino acid Selenocysteine. -White space (spaces, tabs and the end-of-line characters CR and NL) is allowed -inside sequence data. Dots "." and dashes "–" in sequences -are allowed and are discarded unless the input is expected to be aligned (e.g. -for the –refine option).

- -

3.1.2 Nucleotide sequences

- -

The usual letters A, G, C, T and U stand for nucleotides. +X, B, Z and U are understood. X means "unknown amino acid", B is D or +N, Z is E or Q. U is understood to be the 21st amino acid Selenocysteine. White +space (spaces, tabs and the end-of-line characters CR and NL) is allowed inside +sequence data. Dots "." and dashes "–" in sequences are +allowed and are discarded unless the input is expected to be aligned (e.g. for +the –refine option).

+ +

3.1.2 Nucleotide +sequences

+ +

The usual letters A, G, C, T and U stand for nucleotides. The letters T and U are equivalent as far as MUSCLE is concerned. N is the -wildcard meaning "unknown nucleotide". R means A or G, Y means C or +wildcard meaning "unknown nucleotide". R means A or G, Y means C or T/U. Other wildcards, such as those used by RFAM, are not understood in this version and will be replaced by Ns. If you would like support for other DNA / RNA alphabets, please let me know.

-

3.1.3 Determining sequence type

+

3.1.3 Determining +sequence type

-

By default, MUSCLE looks at the first 100 letters in the +

By default, MUSCLE looks at the first 100 letters in the input sequence data (excluding gaps). If 95% or more of those letters are valid nucleotides (AGCTUN), then the file is treated as nucleotides, otherwise as amino acids. This method almost always guesses correctly, but you can make sure -by specifying the sequence type on the command line. This is done using the –seqtype option, which can take the following values:

+by specifying the sequence type on the command line. This is done using the –seqtype +option, which can take the following values:

+ +

 

-

 

+

        –­seqtype protein                          + Amino +acid

-

        –­seqtype protein                          Amino acid

+

        –seqtype dna                +         +                + DNA +(ACGT alphabet)

-

        –seqtype nucleo                          Nucleotide

+

        –seqtype rna         +         +                + DNA +(ACGT alphabet)

-

        –seqtype auto                               Automatic -detection (default).

+

        –seqtype auto                               + Automatic +detection (default).

-

3.2 Output +

3.2 Output files

-

By default, output is also written in FASTA format. All -letters are upper-case and gaps are represented by dashes "–". Output +

By default, output is also written in FASTA format. All +letters are upper-case and gaps are represented by dashes "–". Output is written to the following destination(s):

-

 

+

 

-

        If no other -output option is given, then standard output.

+

        If no other output option is given, then standard +output.

-

        If -out <filename> -is given, to the specified file.

+

        If -out <filename> is given, to +the specified file.

-

        For all of the -xxxout options -(e.g. -fastaout, -clwout), -to the specified files.

+

        For all of the -xxxout options (e.g. -fastaout, +-clwout), to the specified files.

-

3.2.1 Sequence grouping

+

3.2.1 Sequence +grouping

-

By default, MUSCLE re-arranges sequences so that similar +

By default, MUSCLE re-arranges sequences so that similar sequences are adjacent in the output file. (This is done by ordering sequences according to a prefix traversal of the guide tree). This makes the alignment easier to evaluate by eye. If you want to the sequences to be output in the same order as the input file, you can use the –stable option.

-

3.2.2 Output to multiple file -formats

+

 

+ +

WARNING

+ +

THE -stable OPTION +HAD A SERIOUS BUG IN VERSIONS OF MUSCLE PRIOR TO v3.8 AND IS CURRENTLY NOT +SUPPORTED.

+ +

3.2.2 Output +to multiple file formats

-

You can request output to more than one file format by using -the -xxxout options. For example, to get both -FASTA and CLUSTALW formats:

+

You can request output to more than one file format by using +the -xxxout options. For example, to get both FASTA and CLUSTALW +formats:

-

 

+

 

-

muscle -in seqs.fa -fastaout seqs.afa -clwout seqs.aln

+

muscle -in seqs.fa -fastaout seqs.afa -clwout seqs.aln

-

3.3 CLUSTALW +

3.3 CLUSTALW format

-

You can request CLUSTALW output by using the –clw option. This should be compatible with CLUSTALW, -with the exception of the program name in the file header. You can ask MUSCLE -to impersonate CLUSTALW by writing "CLUSTAL W (1.81)" as the program -name by using –clwstrict or clwstrictout. Note that MUSCLE allows duplicate -sequence labels, while CLUSTALW forbids duplicates. If you use the –stable -option of muscle, then the order of the input sequences is preserved and +

You can request CLUSTALW output by using the –clw +option. This should be compatible with CLUSTALW, with the exception of the +program name in the file header. You can ask MUSCLE to impersonate CLUSTALW by +writing "CLUSTAL W (1.81)" as the program name by using –clwstrict +or ‑clwstrictout. Note that MUSCLE allows duplicate sequence +labels, while CLUSTALW forbids duplicates. If you use the –stable option +of muscle, then the order of the input sequences is preserved and sequences can be unambiguously identified even if the labels differ. If you have problems parsing MUSCLE output with scripts designed for CLUSTALW, please -let me know and I'll do my best to provide a fix.

+let me know and I'll do my best to provide a fix.

-

3.4 MSF +

3.4 MSF format

-

MSF format, as used in the GCG package, is requested by -using the –msf option. As with CLUSTALW -format, this is easier for people to read than FASTA. As of MUSCLE 3.52, the -MSF format has been tweaked to be more compatible with GCG. The following -differences remain.

+

MSF format, as used in the GCG package, is requested by +using the –msf option. As with CLUSTALW format, this is easier for +people to read than FASTA. As of MUSCLE 3.52, the MSF format has been tweaked +to be more compatible with GCG. The following differences remain.

-

 

+

 

-

(a) MUSCLE truncates at the first white space or after 63 +

(a) MUSCLE truncates at the first white space or after 63 characters, which ever comes first. The GCG package apparently truncates after 10 characters. If this is a problem for you, please let me know and I'll add an option to truncate after 10 in a future version.

-

 

+

 

-

(b) MUSCLE allows duplicate sequence labels, while GCG +

(b) MUSCLE allows duplicate sequence labels, while GCG forbids duplicates. If you use the –stable option of muscle, then the order of the input sequences is preserved and sequences can be unambiguously identified even if the labels differ.

-

 

+

 

-

Thanks to Eric Martel for help with improving GCG -compatibility.

+

Thanks to Eric Martel for help with improving GCG +compatibility.

-

3.5 HTML +

3.5 HTML format

-

I've added an experimental feature starting in version 3.4. To +

I added an experimental feature starting in version 3.4. To get a Web page as output, use the –html option. The alignment is colored -using a color scheme from Eric Sonnhammer's Belvu editor, which is my personal favorite. A drawback of -this option is that the Web page typically contains a very large number of HTML -tags, which can be slow to display in the Internet Explorer browser. The -Netscape browser works much better. If you have any ideas about good ways to -make Web pages, please let me know.

- -

3.6 Phylip format

- -

The Phylip package supports two -different multiple sequence alignment file formats, called sequential and -interleaved respectively.

- -

4 Using MUSCLE

- -

In this section we give more details of the MUSCLE algorithm -and the more important options offered by the muscle implementation.

- -

4.1 How -the algorithm works

- -

I won't give a complete description of the MUSCLE algorithm +using a color scheme from Eric Sonnhammer's Belvu editor, which is my personal +favorite. A drawback of this option is that the Web page typically contains a +very large number of HTML tags, which can be slow to display in the Internet +Explorer browser. The Netscape browser works much better. If you have any ideas +about good ways to make Web pages, please let me know.

+ +

3.6 Phylip +format

+ +

The Phylip package supports two different multiple sequence +alignment file formats, called sequential and interleaved respectively.

+ +

4 Using +MUSCLE

+ +

In this section we give more details of the MUSCLE algorithm +and the more important options offered by the muscle implementation.

+ +

4.1 How the +algorithm works

+ +

I won't give a complete description of the MUSCLE algorithm here—for that, you will have to read the papers. (See citations on title page above). But hopefully a summary will help explain what some of the command-line options do and how they might be useful in your work.

-

 

+

 

-

The first step is to calculate a tree. In CLUSTALW, this is +

The first step is to calculate a tree. In CLUSTALW, this is done as follows. Each pair of input sequences is aligned, and used to compute the pair-wise identity of the pair. Identities are converted to a measure of distance. Finally, the distance matrix is converted to a tree using a clustering method (CLUSTALW uses neighbor-joining). If you have 1,000 -sequences, there are (1,000 ´ 999)/2 = 499,500 pairs, so aligning every pair can take -a while. MUSCLE uses a much faster, but somewhat more approximate, method to -compute distances: it counts the number of short sub-sequences (known as k-mers, k-tuples or words) -that two sequences have in common, without constructing an alignment. This is -typically around 3,000 times faster that CLUSTALW's -method, but the trees will generally be less accurate. We call this step "k-mer clustering".

- -

 

- -

The second step is to use the tree to construct what is +sequences, there are (1,000 ´ 999)/2 = +499,500 pairs, so aligning every pair can take a while. MUSCLE uses a much +faster, but somewhat more approximate, method to compute distances: it counts +the number of short sub-sequences (known as k-mers, k-tuples or +words) that two sequences have in common, without constructing an alignment. +This is typically around 3,000 times faster that CLUSTALW's method, but the trees +will generally be less accurate. We call this step "k-mer +clustering".

+ +

 

+ +

The second step is to use the tree to construct what is known as a progressive alignment. At each node of the binary tree, a pair-wise alignment is constructed, progressing from the leaves towards the root. The first alignment will be made from two sequences. Later alignments will be one of the three following types: sequence-sequence, profile-sequence or -profile-profile, where "profile" means the multiple alignment of the sequences under a given internal node of -the tree. This is very similar to what CLUSTALW does once it has built a tree.

+profile-profile, where "profile" means the multiple alignment of the sequences +under a given internal node of the tree. This is very similar to what CLUSTALW +does once it has built a tree.

-

 

+

 

-

Now we have a multiple +

Now we have a multiple alignment, which has been built very quickly compared with conventional -methods, mainly because of the distance calculation using k-mers rather than alignments. The quality of this alignment -is typically pretty good—it will often tie or beat a T-Coffee alignment on our -tests. However, on average, we find that it can be improved by proceeding -through the following steps.

- -

 

- -

From the multiple alignment, we can now compute the pair-wise identities of -each pair of sequences. This gives us a new distance matrix, from which we -estimate a new tree. We compare the old and new trees, and re-align subgroups -where needed to produce a progressive multiple alignment from the new tree. If -the two trees are identical, there is nothing to do; if there are no subtrees -that agree (very unusual), then the whole progressive alignment procedure must -be repeated from scratch. Typically we find that the tree is pretty stable near -the leaves, but some re-alignments are needed closer the root. This procedure -(compute pair-wise identities, estimate new tree, compare trees, re-align) is -iterated until the tree stabilizes or until a specified maximum number of -iterations has been done. We call this process "tree refinement", -although it also tends to improve the alignment.

- -

 

- -

We now keep the tree fixed +methods, mainly because of the distance calculation using k-mers rather +than alignments. The quality of this alignment is typically pretty good—it will +often tie or beat a T-Coffee alignment on our tests. However, on average, we +find that it can be improved by proceeding through the following steps.

+ +

 

+ +

From the multiple alignment, +we can now compute the pair-wise identities of each pair of sequences. This +gives us a new distance matrix, from which we estimate a new tree. We compare +the old and new trees, and re-align subgroups where needed to produce a +progressive multiple alignment from the new tree. If the two trees are +identical, there is nothing to do; if there are no subtrees that agree (very unusual), +then the whole progressive alignment procedure must be repeated from scratch. +Typically we find that the tree is pretty stable near the leaves, but some +re-alignments are needed closer the root. This procedure (compute pair-wise +identities, estimate new tree, compare trees, re-align) is iterated until the +tree stabilizes or until a specified maximum number of iterations has been +done. We call this process "tree refinement", although it also tends +to improve the alignment.

+ +

 

+ +

We now keep the tree fixed and move to a new procedure which is designed to improve the multiple alignment. The set of sequences is divided into two subsets (i.e., we make a bipartition on the set of sequences). A profile is constructed for each of the -two subsets based on the current multiple alignment. -These two profiles are then re-aligned to each other using the same pair-wise -alignment algorithm as used in the progressive stage. If this improves an -"objective score" that measures the quality of the alignment, then -the new multiple alignment is kept, otherwise it is -discarded. By default, the objective score is the classic sum-of-pairs score -that takes the (sequence weighted) average of the pair-wise alignment score of -every pair of sequences in the alignment. Bipartitions are chosen by deleting -an edge in the guide tree, each of the two resulting subtrees defines a subset -of sequences. This procedure is called "tree dependent refinement". One iteration of tree dependent refinement tries +two subsets based on the current multiple alignment. These two profiles are +then re-aligned to each other using the same pair-wise alignment algorithm as +used in the progressive stage. If this improves an "objective score" +that measures the quality of the alignment, then the new multiple alignment is +kept, otherwise it is discarded. By default, the objective score is the classic +sum-of-pairs score that takes the (sequence weighted) average of the pair-wise +alignment score of every pair of sequences in the alignment. Bipartitions are +chosen by deleting an edge in the guide tree, each of the two resulting +subtrees defines a subset of sequences. This procedure is called "tree +dependent refinement". One iteration of tree dependent refinement tries bipartitions produced by deleting every edge of the tree in depth order moving from the leaves towards the center of the tree. Iterations continue until convergence or up to a specified maximum.

-

 

+

 

-

For convenience, the major -steps in MUSCLE are described as "iterations", though the first three +

For convenience, the major +steps in MUSCLE are described as "iterations", though the first three iterations all do quite different things and may take very different lengths of time to complete. The tree-dependent refinement iterations 3, 4 ... are true iterations and will take similar lengths of time.

-

 

+

 

- - -
-

Iteration

+ + + - - - - + + - - + - - - + - -
+

Iteration

-

Actions

+
+

Actions

-

1

-
-

Distance matrix by k-mer clustering, estimate tree, progressive alignment - according to this tree.

-

 

+
+

1

+
+

Distance matrix by k-mer + clustering, estimate tree, progressive alignment according to this tree.

+

 

-

2

+
+

2

-

Distance matrix by +

+

Distance matrix by pair-wise identities from current multiple alignment, estimate tree, progressive alignment according to new tree, repeat until convergence or specified maximum number of times.

-

 

+

 

-

3, 4 ...

+
+

3, 4 ...

-

Tree-dependent refinement. One iteration visits every edge in the tree one time.

+
+

Tree-dependent refinement. + One iteration visits every edge in the tree one time.

+
-

4.2 Command-line +

4.2 Command-line options

-

There are two types of command-line options: value options +

There are two types of command-line options: value options and flag options. Value options are followed by the value of the given parameter, for example –in <filename>; flag options just stand for -themselves, such as –msf. All options are a -dash (not two dashes!) followed by a long name; there are no single-letter -equivalents. Value options must be separated from their values by white space -in the command line. Thus, muscle does not follow Unix, -Linux or Posix standards, for which we apologize. The +themselves, such as –msf. All options are a dash (not two dashes!) +followed by a long name; there are no single-letter equivalents. Value options +must be separated from their values by white space in the command line. Thus, muscle +does not follow Unix, Linux or Posix standards, for which we apologize. The order in which options are given is irrelevant unless two options contradict, -in which case the right-most option silently wins.

+in which case the right-most option silently wins.

-

4.3 The +

4.3 The maxiters option

-

You can control the number of iterations that MUSCLE does by +

You can control the number of iterations that MUSCLE does by specifying the –maxiters option. If you specify 1, 2 or 3, then this is exactly the number of iterations that will be performed. If the value is greater than 3, then muscle will continue up to the maximum you specify or until convergence is reached, which ever happens sooner. The default is 16. If you have a large number of sequences, refinement may be rather slow.

-

4.4 The -maxtrees option

+

4.4 The +maxtrees option

-

This option controls the maximum number of new trees to +

This option controls the maximum number of new trees to create in iteration 2. Our experience suggests that a point of diminishing returns is typically reached after the first tree, so the default value is 1. If a larger value is given, the process will repeat until convergence or until this number of trees has been created, which ever comes first.

-

4.5 The -maxhours option

+

4.5 The +maxhours option

-

If you have a large alignment, muscle may take a long -time to complete. It is sometimes convenient to say "I want the best -alignment I can get in 24 hours" rather than specifying a set of options -that will take an unknown length of time. This is done by using –maxhours, which specifies a floating-point number of -hours. If this time is exceeded, muscle will write out current alignment -and stop. For example,

+

If you have a large alignment, muscle may take a long +time to complete. It is sometimes convenient to say "I want the best +alignment I can get in 24 hours" rather than specifying a set of options +that will take an unknown length of time. This is done by using –maxhours, +which specifies a floating-point number of hours. If this time is exceeded, muscle +will write out current alignment and stop. For example,

-

 

+

 

-

muscle -in huge.fa -out huge.afa -maxiters 9999 -maxhours 24.0

+

muscle -in huge.fa -out huge.afa -maxiters 9999 -maxhours 24.0

-

 

+

 

-

Note that the actual time may exceed the specified limit by +

Note that the actual time may exceed the specified limit by a few minutes while muscle finishes up on a step. It is also possible for no alignment to be produced if the time limit is too small.

-

4.6 The -maxmb option

- -

If the amount of memory needed by MUSCLE exceeds available -physical RAM, then the operating system will probably begin paging (i.e., -swapping memory to and from hard disk), causing MUSCLE to run very slowly. This -is especially problematic when MUSCLE is used for batch processing, where one or -two very large alignments can cause a batch to effectively hang. Starting in -version 3.52, MUSCLE attempts to limit the amount of memory used. If the limit -is exceeded, MUSCLE quits, saving the best alignment so far produced (if any). -MUSCLE attempts to determine the amount of physical RAM by making an -appropriate operating system call. Under Linux and Windows, this works well. On -other systems, particularly other flavors of Unix, -MUSCLE doesn't know how to query the system and assumes that there is 500 Mb of -RAM. To override this default, you can specify the maximum number of megabytes -to allocate by using the –maxmb option, for -example to set a limit of 1.5 Gb:

- -

 

- -

muscle -in huge.fa -out huge.afa -maxhours 1.0 -maxmb 1500

- -

 

- -

This feature has been hacked on top of code that wasn't -really designed for it. So it doesn't always work perfectly, but is better than -nothing. The ideal solution would be to implement linear space dynamic -programming code (e.g., the Myers-Miller algorithm) for situations where memory -is tight. One day I might do this if there is sufficient interest. If you are -interested in contributing the code, e.g. for a class project, please let me -know, I'll be glad to provide support.

- -

4.7 The +

4.6 The profile scoring function

-

Three different protein profile scoring functions are +

Three different protein profile scoring functions are supported, the log-expectation score (–le option) and a sum of pairs -score using either the PAM200 matrix (–sp) or the VTML240 matrix (–sv). The log-expectation score is the default as it -gives better results on our tests, but is typically somewhere between two or -three times slower than the sum-of-pairs score. For nucleotides, –spn is currently the only option (which is of course -the default for nucleotide data, so you don't need to specify this option).

- -

4.8 Diagonal +score using either the PAM200 matrix (–sp) or the VTML240 matrix (–sv). +The log-expectation score is the default as it gives better results on our +tests, but is typically somewhere between two or three times slower than the +sum-of-pairs score. For nucleotides, –spn is currently the only option +(which is of course the default for nucleotide data, so you don't need to +specify this option).

+ +

4.7 Diagonal optimization

-

Creating a pair-wise alignment by dynamic programming -requires computing an L1 ´ L2 -matrix, where L1 and L2 are the sequence -lengths. A trick used in algorithms such as BLAST is to reduce the size of this -matrix by using fast methods to find "diagonals", i.e. short regions -of high similarity between the two sequences. This speeds up the algorithm at -the expense of some reduction in accuracy. MUSCLE uses a technique called k-mer extension to find diagonals. It is disabled by default -because of the slight reduction in average accuracy and can be turned on by -specifying the –diags option. To enable -diagonal optimization in the first iteration, use –diags1, to enable -diagonal optimization in the second iteration, use –diags2. These are -provided separately because it would be a reasonable strategy to enable +

Creating a pair-wise alignment by dynamic programming +requires computing an L1 ´ +L2 matrix, where L1 and L2 +are the sequence lengths. A trick used in algorithms such as BLAST is to reduce +the size of this matrix by using fast methods to find "diagonals", also +called "gapless high-scoring segment pairs", i.e. short regions of +high similarity between the two sequences. This speeds up the algorithm at the +expense of some reduction in accuracy. MUSCLE uses a technique called k-mer +extension to find diagonals, which is described in this paper:

+ +

 

+ +

Edgar, R.C. +(2004) Local homology recognition and distance measures in linear time using +compressed amino acid alphabets, Nucleic Acids Res., 32, 1.

+ +

 

+ +

It is disabled by default because of the slight reduction in +average accuracy and can be turned on by specifying the –diags option. +To enable diagonal optimization in the first iteration, use –diags1, to +enable diagonal optimization in the second iteration, use –diags2. These +are provided separately because it would be a reasonable strategy to enable diagonals in the first iteration but not the second (because the main goal of the first iteration is to construct a multiple alignment quickly in order to improve the distance matrix, which is not very sensitive to alignment quality; whereas the goal of the second iteration is to make the best possible -progressive alignment).

+progressive alignment).

-

4.9 Anchor +

4.8 Anchor optimization

-

Tree-dependent refinement (iterations 3, 4 ... ) can be speeded up by dividing the alignment vertically -into blocks. Block boundaries are found by identifying high-scoring columns -(e.g., a perfectly conserved column of Cs or Ws would be a candidate). Each -vertical block is then refined independently before reassembling the complete -alignment, which is faster because of the L2 factor in -dynamic programming (e.g., suppose the alignment is split into two vertical -blocks, then 2 ´ -0.52 = 0.5, so the dynamic programming time is roughly halved). The -–noanchors option is used to disable this -feature. This option has no effect if –maxiters 1 or –maxiters 2 +

Tree-dependent refinement (iterations 3, 4 ... ) can be +speeded up by dividing the alignment vertically into blocks. Block boundaries +are found by identifying high-scoring columns (e.g., a perfectly conserved +column of Cs or Ws would be a candidate). Each vertical block is then refined +independently before reassembling the complete alignment, which is faster +because of the L2 factor in dynamic programming (e.g., +suppose the alignment is split into two vertical blocks, then 2 ´ 0.52 = 0.5, so the dynamic +programming time is roughly halved). The –noanchors option is used to disable +this feature. This option has no effect if –maxiters 1 or –maxiters 2 is specified. On benchmark tests, enabling anchors has little or no effect on accuracy, but if you want to be very conservative and are striving for the best -possible accuracy then –noanchors is a -reasonable choice.

+possible accuracy then –noanchors is a reasonable choice.

-

4.10 Log +

4.9 Log file

-

You can specify a log file by using –log <filename> -or –loga <filename>. Using –log -causes any existing file to be deleted, –loga -appends to any existing file. A message will be written to the log file when muscle -starts and stops. Error and warning messages will also be written to the log. -If –verbose is specified, then more information will be written, -including the command line used to invoke muscle, the resulting internal -parameter settings, and also progress messages. The content and format of -verbose log file output is subject to change in future versions.

- -

 

- -

The use of a log file may seem contrary to Unix conventions for using standard output and standard -error. I like these conventions, but never found a fully satisfactory way to -use them. I like progress messages (see below), but they mess up a file if you -re-direct standard error and there are errors or warning messages too. I could -try to detect whether a standard file handle is a tty device or a disk file and change behavior -accordingly, but I regard this as too complicated and too hard for the user to -understand. On Windows it can be hard to re-direct standard file handles, -especially when working in a GUI debugger. Maybe one day I will figure out a -better solution (suggestions welcomed).

- -

 

- -

I highly recommend using –verbose and ­–log[a], +

You can specify a log file by using –log <filename> +or –loga <filename>. Using –log causes any existing file to +be deleted, –loga appends to any existing file. A message will be +written to the log file when muscle starts and stops. Error and warning +messages will also be written to the log. If –verbose is specified, then +more information will be written, including the command line used to invoke muscle, +the resulting internal parameter settings, and also progress messages. The +content and format of verbose log file output is subject to change in future +versions.

+ +

 

+ +

The use of a log file may seem contrary to Unix conventions for +using standard output and standard error. I like these conventions, but never +found a fully satisfactory way to use them. I like progress messages (see +below), but they mess up a file if you re-direct standard error and there are +errors or warning messages too. I could try to detect whether a standard file +handle is a tty device or a disk file and change behavior accordingly, +but I regard this as too complicated and too hard for the user to understand. On +Windows it can be hard to re-direct standard file handles, especially when +working in a GUI debugger. Maybe one day I will figure out a better solution +(suggestions welcomed).

+ +

 

+ +

I highly recommend using –verbose and ­–log[a], especially when running muscle in a batch mode. This enables you to verify whether a particular alignment was completed and to review any errors or -warnings that occurred.

+warnings that occurred.

-

4.11 Progress +

4.10 Progress messages

-

By default, muscle writes progress messages to +

By default, muscle writes progress messages to standard error periodically so that you know it's doing something and get some feedback about the time and memory requirements for the alignment. Here is a typical progress message.

-

 

+

 

-

00:00:23     25 Mb (5%)  Iter   -2  87.20%  Build guide tree

+

00:00:23     25 Mb (5%)  Iter   2  87.20%  Build guide tree

-

 

+

 

-

The fields are as follows.

+

The fields are as follows.

-

 

+

 

- - -
-

00:00:23

+ + + - - - + - - - + - - - + - - - + - -
+

00:00:23

-

Elapsed time since muscle - started.

+
+

Elapsed time since muscle + started.

-

25 Mb (5%)

+
+

25 Mb (5%)

-

Peak memory use in megabytes +

+

Peak memory use in megabytes (i.e., not the current usage, but the maximum amount of memory used since muscle - started). The number in parentheses is the fraction of physical memory (see –maxmb option for more discussion).

+ started). The number in parentheses is the fraction of physical memory (see –maxmb + option for more discussion).

-

Iter 2

+
+

Iter 2

-

Iteration currently in +

+

Iteration currently in progress.

-

87.20%

+
+

87.20%

-

How much of the current step +

+

How much of the current step has been completed (percentage).

-

Build...

+
+

Build...

-

A brief description of the current step.

+
+

A brief description of the current step.

+
-

 

+

 

-

The –quiet command-line option disables writing +

The –quiet command-line option disables writing progress messages to standard error. If the –verbose command-line option -is specified, a progress message will be written to the log file when each iteration completes. So –quiet and –verbose -are not contradictory.

+is specified, a progress message will be written to the log file when each +iteration completes. So –quiet and –verbose are not +contradictory.

-

4.12 Running +

4.11 Running out of memory

-

The muscle code tries to deal gracefully with -low-memory conditions by using the following technique. A block of "emergency -reserve" memory is allocated when muscle starts. If a later request +

The muscle code tries to deal gracefully with +low-memory conditions by using the following technique. A block of "emergency +reserve" memory is allocated when muscle starts. If a later request to allocate memory fails, this reserve block is made available, and muscle attempts to save the current alignment. With luck, the reserved memory will be enough to allow muscle to save the alignment and exit gracefully with an -informative error message. See also the –maxmb -option.

+informative error message.

-

4.13 Troubleshooting

+

4.12 Troubleshooting

-

Here is some general advice on what to do if muscle +

Here is some general advice on what to do if muscle fails and you don't understand what happened. The code is designed to fail gracefully with an informative error message when something goes wrong, but there will no doubt be situations I haven't anticipated (not to mention bugs).

-

 

+

 

-

Check the MUSCLE web site for updates, bug reports and other +

Check the MUSCLE web site for updates, bug reports and other relevant information.

-

 

+

 

-

        http://www.drive5.com/muscle

+

        http://www.drive5.com/muscle

-

 

+

 

-

Check the input file to make sure it is in valid FASTA +

Check the input file to make sure it is in valid FASTA format. Try giving it to another sequence analysis program that can accept -large FASTA files (e.g., the NCBI formatdb -utility) to see if you get an informative error message. Try dividing the file -into two halves and using each half individually as input. If one half fails -and the other does not, repeat until the problem is -localized as far as possible.

+large FASTA files (e.g., the NCBI formatdb utility) to see if you get an +informative error message. Try dividing the file into two halves and using each +half individually as input. If one half fails and the other does not, repeat +until the problem is localized as far as possible.

-

 

+

 

-

Use –log or –loga -and –­verbose and check the log file to see if there are any messages -that give you a hint about the problem. Look at the peak memory requirements -(reported in progress messages) to see if you may be exceeding the physical or -virtual memory capacity of your computer.

+

Use –log or –loga and –­verbose and +check the log file to see if there are any messages that give you a hint about the +problem. Look at the peak memory requirements (reported in progress messages) +to see if you may be exceeding the physical or virtual memory capacity of your +computer.

-

 

+

 

-

If muscle crashes without giving an error message, or +

If muscle crashes without giving an error message, or hangs, then you may need to refer to the source code or use a debugger. A -"debug" version, muscled, may be provided. This is built from +"debug" version, muscled, may be provided. This is built from the same source code but with the DEBUG macro defined and without compiler optimizations. This version runs much more slowly (perhaps by a factor of three or more), but does a lot more internal checking and may be able to catch something that is going wrong in the code. The –­core option specifies that muscle should not catch exceptions. When –core is specified, an exception may result in a debugger trap or a core dump, depending on the -execution environment. The –nocore option has -the opposite effect. In muscle, –nocore -is the default, –­core is the default in muscled.

+execution environment. The –nocore option has the opposite effect. In muscle, +–nocore is the default, –­core is the default in muscled.

-

4.14 Technical +

4.13 Technical support

-

I am happy to provide support. But I am busy, and am +

I am happy to provide support. But I am busy, and am offering this program at no charge, so I ask you to make a reasonable effort to -figure things out for yourself before contacting me.

+figure things out for yourself before contacting +me.

-

5 Command Line Reference

+

5 Command Line +Reference

-

 

+

 

- +
- - + - - - - - + - - - - - + - - - - - - - - + + + + - - - - - + + + + - - + - - - - - + - - - - - + - - - - - + - - - - - - - - + - - - - - - + + - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - - - - + + + + - - + - - - - - + - - - - - + - - - - - + - - - - - - - - + + + + - - + - - - - - - - - + + + + - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - + - - - - - - - - + + + + -
-

Value option

+
+

Value option

-

Legal values

+
+

Legal values

-

Default

+
+

Default

-

Description

+
+

Description

-

anchorspacing

+
+

anchorspacing

-

Integer

+
+

Integer

-

32

+
+

32

-

Minimum spacing between +

+

Minimum spacing between anchor columns.

-

 

+

 

-

center

+
+

center

-

Floating point

+
+

Floating point

-

[1]

+
+

[1]

-

Center parameter. Should be +

+

Center parameter. Should be negative.

-

 

+

 

-

cluster1

-

cluster2

-
-

upgma

-

upgmb

-

neighborjoining

-
-

upgmb

-
-

Clustering method. cluster1 is used in iteration 1 and 2, cluster2 in later - iterations.

-

 

+
+

cluster1

+

cluster2

+
+

upgma

+

upgmb

+

neighborjoining

+
+

upgmb

+
+

Clustering method. cluster1 + is used in iteration 1 and 2, cluster2 in later iterations.

+

 

-

clwout

-
-

File name

-
-

None

-
-

Write output in CLUSTALW +

+

clwout

+
+

File name

+
+

None

+
+

Write output in CLUSTALW format to given file name.

-

clwout

+
+

clwout

-

File name

+
+

File name

-

None

+
+

None

-

As -clwout, - except that header is strictly compatible with CLUSTALW 1.81.

-

 

+
+

As -clwout, except + that header is strictly compatible with CLUSTALW 1.81.

+

 

-

diagbreak

+
+

diagbreak

-

Integer

+
+

Integer

-

1

+
+

1

-

Maximum distance between two diagonals that allows them to +

+

Maximum distance between two diagonals that allows them to merge into one diagonal.

-

 

+

 

-

diaglength

+
+

diaglength

-

Integer

+
+

Integer

-

24

+
+

24

-

Minimum length of diagonal.

-

 

+
+

Minimum length of diagonal.

+

 

-

diagmargin

+
+

diagmargin

-

Integer

+
+

Integer

-

5

+
+

5

-

Discard this many positions +

+

Discard this many positions at ends of diagonal.

-

 

+

 

-

distance1

-

 

-
-

kmer6_6

-

kmer20_3

-

kmer20_4

-

kbit20_3

-

kmer4_6

-

 

-
-

Kmer6_6 - (amino) or Kmer4_6 (nucleo)

-
-

Distance measure for iteration 1.

+
+

distance1

+

 

-

distance2

-

 

-
-

kmer6_6

-

kmer20_3

-

kmer20_4

-

kbit20_3

-

pctid_kimura

-

pctid_log

-

 

-
-

pctid_kimura

-
-

Distance measure for iterations 2, 3 ...

-

 

-

 

-

 

-

 

+
+

kmer6_6

+

kmer20_3

+

kmer20_4

+

kbit20_3

+

kmer4_6

+

 

+
+

Kmer6_6 + (amino) or Kmer4_6 (nucleo)

+
+

Distance measure for iteration 1.

-

fastaout

+
+

distance2

+

 

-

File - name

+
+

pctid_kimura

+

pctid_log

+

 

-

None

+
+

pctid_kimura

-

Write output in FASTA format to the given file.

-

 

+
+

Distance measure for iterations 2, 3 ...

+

 

+

 

+

 

+

 

-

gapopen

+
+

fastaout

-

Floating point

+
+

File + name

-

[1]

+
+

None

-

The gap open score. Must be - negative.

-

 

+
+

Write output in FASTA format to the given file.

+

 

-

hydro

+
+

gapopen

-

Integer

+
+

Floating point

-

5

+
+

[1]

-

Window size for determining whether a region is - hydrophobic.

-

 

+
+

The gap open score. Must be negative.

+

 

-

hydrofactor

+
+

hydro

-

Floating point

+
+

Integer

-

1.2

+
+

5

-

Multiplier for gap open/close penalties in hydrophobic - regions.

-

 

+
+

Window size for determining whether a region is hydrophobic.

+

 

-

in

+
+

hydrofactor

-

Any file name

+
+

Floating point

-

standard input

+
+

1.2

-

Where to find the input sequences.

-

 

+
+

Multiplier for gap open/close penalties in hydrophobic + regions.

+

 

-

in1

+
+

in

-

Any file name

+
+

Any file name

-

None

+
+

standard input

-

Where to find an input alignment.

-

 

+
+

Where to find the input sequences.

+

 

-

in2

+
+

in1

-

Any file name

+
+

Any file name

-

None

+
+

None

-

Where to find an input alignment.

-

 

+
+

Where to find an input alignment.

+

 

-

log

+
+

in2

-

File name

+
+

Any file name

-

None.

+
+

None

-

Log file name (delete existing file).

-

 

+
+

Where to find an input alignment.

+

 

-

loga

+
+

log

-

File name

+
+

File name

-

None.

+
+

None.

-

Log file name (append to existing file).

-

 

+
+

Log file name (delete existing file).

+

 

-

matrix

+
+

loga

-

File name

+
+

File name

-

None

+
+

None.

-

File name for substitution matrix in NCBI or WU-BLAST - format. If you specify your own matrix, you should also specify:

-

 

-

-gapopen <g>, -gapextend <e> -center 0.0

-

 

-

Note that <g> and <e> MUST be negative.

-

 

+
+

Log file name (append to existing file).

+

 

-

maxhours

+
+

matrix

-

Floating point

+
+

File name

-

None.

+
+

None

-

Maximum time to run in hours. The actual time may exceed - the requested limit by a few minutes. Decimals are allowed, so 1.5 means one hour - and 30 minutes.

-

 

+
+

File name for substitution matrix in NCBI or WU-BLAST + format. If you specify your own matrix, you should also specify:

+

 

+

-gapopen <g>, -gapextend <e> -center 0.0

+

 

+

Note that <g> and <e> MUST be negative.

+

 

-

maxiters

+
+

maxhours

-

Integer 1, 2 ...

+
+

Floating point

-

16

+
+

None.

-

Maximum number of iterations.

-

 

+
+

Maximum time to run in hours. The actual time may exceed + the requested limit by a few minutes. Decimals are allowed, so 1.5 means one + hour and 30 minutes.

+

 

-

maxmb

+
+

maxiters

-

Integer

+
+

Integer 1, 2 ...

-

80% - of Physical RAM, or 500 Mb if not known.

-

 

+
+

16

-

Maximum memory to allocate in Mb.

+
+

Maximum number of iterations.

+

 

-

maxtrees

+
+

maxtrees

-

Integer

+
+

Integer

-

1

+
+

1

-

Maximum number of new trees to build in iteration 2.

-

 

+
+

Maximum number of new trees to build in iteration 2.

+

 

-

minbestcolscore

+
+

minbestcolscore

-

Floating point

+
+

Floating point

-

[1]

+
+

[1]

-

Minimum score a column must +

+

Minimum score a column must have to be an anchor.

-

 

+

 

-

minsmoothscore

+
+

minsmoothscore

-

Floating point

+
+

Floating point

-

[1]

+
+

[1]

-

Minimum smoothed score a +

+

Minimum smoothed score a column must have to be an anchor.

-

 

+

 

-

msaout

+
+

msaout

-

File name

+
+

File name

-

None

+
+

None

-

Write output to given file +

+

Write output to given file name in MSF format.

-

 

+

 

-

objscore

-
-

sp

-

ps

-

dp

-

xp

-

spf

-

spm

-
-

spm

-
-

Objective score used by tree dependent refinement.

-

sp=sum-of-pairs score.

-

spf=sum-of-pairs score (dimer - approximation)

-

spm=sp for < 100 seqs, otherwise spf

-

dp=dynamic programming score.

-

ps=average profile-sequence score.

-

xp=cross profile score.

-

 

+
+

objscore

+
+

sp

+

ps

+

dp

+

xp

+

spf

+

spm

+
+

spm

+
+

Objective score used by tree dependent refinement.

+

sp=sum-of-pairs score.

+

spf=sum-of-pairs score (dimer approximation)

+

spm=sp for < 100 seqs, otherwise spf

+

dp=dynamic programming score.

+

ps=average profile-sequence score.

+

xp=cross profile score.

+

 

-

out

+
+

out

-

File name

+
+

File name

-

standard output

+
+

standard output

-

Where to write the alignment.

-

 

+
+

Where to write the alignment.

+

 

-

phyiout

+
+

phyiout

-

File name

+
+

File name

-

None

+
+

None

-

Write output in Phylip - interleaved format to given file name.

-

 

+
+

Write output in Phylip interleaved format to given file + name.

+

 

-

physout

+
+

physout

-

File name

+
+

File name

-

None

+
+

None

-

Write output in Phylip - sequential format to given file name.

-

 

+
+

Write output in Phylip sequential format to given file + name.

+

 

-

refinewindow

+
+

refinewindow

-

Integer

+
+

Integer

-

200

+
+

200

-

Length of window for -refinew.

-

 

+
+

Length of window for -refinew.

+

 

-

root1

-

root2

-
-

pseudo

-

midlongestspan

-

minavgleafdist

-
-

psuedo

-
-

Method used to root tree; root1 is used in iteration 1 and +

+

root1

+

root2

+
+

pseudo

+

midlongestspan

+

minavgleafdist

+
+

psuedo

+
+

Method used to root tree; root1 is used in iteration 1 and 2, root2 in later iterations.

-

 

-

 

+

 

+

 

-

scorefile

+
+

scorefile

-

File - name

+
+

File + name

-

None

+
+

None

-

File name where to write a score file. This contains one +

+

File name where to write a score file. This contains one line for each column in the alignment. The line contains the letters in the column followed by the average BLOSUM62 score over pairs of letters in the column.

-

 

+

 

-

seqtype

-
-

protein

-

nucleo

-

auto

-

 

-
-

auto

-
-

Sequence type.

+
+

seqtype

+
+

protein

+

dna

+

rna

+

auto

+

 

+
+

auto

+
+

Sequence type.

-

smoothscoreceil

+
+

smoothscoreceil

-

Floating point

+
+

Floating point

-

[1]

+
+

[1]

-

Maximum value of column score for - smoothing purposes.

-

 

+
+

Maximum value of column score for smoothing purposes.

+

 

-

smoothwindow

+
+

smoothwindow

-

Integer

+
+

Integer

-

7

+
+

7

-

Window used for anchor column smoothing.

-

 

+
+

Window used for anchor column smoothing.

+

 

-

spscore

+
+

spscore

-

File name

+
+

File name

-

 

+
+

 

-

Compute SP objective score of multiple alignment.

-

 

+
+

Compute SP objective score of multiple alignment.

+

 

-

SUEFF

+
+

SUEFF

-

Floating point value between 0 and 1.

-

 

+
+

Floating point value between 0 and 1.

+

 

-

0.1

+
+

0.1

-

Constant used in UPGMB clustering. Determines the relative - fraction of average linkage (SUEFF) vs. nearest-neighbor linkage (1 – SUEFF).
-
-

+
+

Constant used in UPGMB clustering. Determines the relative + fraction of average linkage (SUEFF) vs. nearest-neighbor linkage (1 – SUEFF).
+
+

-

tree1

-

tree2

+
+

tree1

+

tree2

-

File name

+
+

File name

-

None

+
+

None

-

Save tree produced in first or second iteration to given - file in Newick (Phylip-compatible) - format.

-

 

+
+

Save tree produced in first or second iteration to given + file in Newick (Phylip-compatible) format.

+

 

-

usetree

+
+

usetree

-

File name

+
+

File name

-

None

+
+

None

-

Use given tree as guide tree. Must by in Newick (Phyip-compatible) - format.

-

 

+
+

Use given tree as guide tree. Must by in Newick + (Phyip-compatible) format.

+

 

-

weight1

-

weight2

-
-

none

-

henikoff

-

henikoffpb

-

gsc

-

clustalw

-

threeway

-
-

clustalw

-

 

-
-

Sequence weighting scheme.

-

weight1 - is used in iterations 1 and 2.

-

weight2 - is used for tree-dependent refinement.

-

none=all - sequences have equal weight.

-

henikoff=Henikoff & +

+

weight1

+

weight2

+
+

none

+

henikoff

+

henikoffpb

+

gsc

+

clustalw

+

threeway

+
+

clustalw

+

 

+
+

Sequence weighting scheme.

+

weight1 is used in + iterations 1 and 2.

+

weight2 is used for + tree-dependent refinement.

+

none=all sequences have + equal weight.

+

henikoff=Henikoff & Henikoff weighting scheme.

-

henikoffpb=Modified +

henikoffpb=Modified Henikoff scheme as used in PSI-BLAST.

-

clustalw=CLUSTALW method.

-

threeway=Gotoh three-way +

clustalw=CLUSTALW method.

+

threeway=Gotoh three-way method.

-

 

+

 

+ -

 

+

 

- +
- - + - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - + - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - + - - - - - - + + + - - + - - - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - - - + + + - - + - - -
-

Flag option

+
+

Flag option

-

Set by default?

+
+

Set by default?

-

Description

+
+

Description

-

anchors

-
-

yes

-
-

Use anchor optimization in +

+

anchors

+
+

yes

+
+

Use anchor optimization in tree dependent refinement iterations.

-

 

+

 

-

brenner

-
-

no

-
-

Use Steven Brenner's method +

+

brenner

+
+

no

+
+

Use Steven Brenner's method for computing the root alignment.

-

 

+

 

-

cluster

-
-

no

-
-

Perform fast clustering of +

+

cluster

+
+

no

+
+

Perform fast clustering of input sequences. Use the –tree1 option to save the tree.

-

 

+

 

-

dimer

-
-

no

-
-

Use dimer approximation for +

+

dimer

+
+

no

+
+

Use dimer approximation for the SP score (faster, slightly less accurate).

-

 

+

 

-

clw

-
-

no

-
-

Write output in CLUSTALW +

+

clw

+
+

no

+
+

Write output in CLUSTALW format (default is FASTA).

-

 

+

 

-

clwstrict

-
-

no

-
-

Write output in CLUSTALW - format with the "CLUSTAL W (1.81)" header rather than the MUSCLE +

+

clwstrict

+
+

no

+
+

Write output in CLUSTALW + format with the "CLUSTAL W (1.81)" header rather than the MUSCLE version. This is useful when a post-processing step is picky about the file header.

-

 

+

 

-

core

-
-

yes in muscle,

-

no - in muscled.

-
-

Do not catch exceptions.

-

 

-

 

+
+

core

+
+

yes in muscle,

+

no in muscled.

+
+

Do not catch exceptions.

+

 

+

 

-

diags

-
-

no

-
-

Use diagonal optimizations. +

+

diags

+
+

no

+
+

Use diagonal optimizations. Faster, especially for closely related sequences, but may be less accurate.

-

 

+

 

-

diags1

-
-

no

-
-

Use diagonal optimizations +

+

diags1

+
+

no

+
+

Use diagonal optimizations in first iteration.

-

 

+

 

-

diags2

-
-

no

-
-

Use diagonal optimizations +

+

diags2

+
+

no

+
+

Use diagonal optimizations in second iteration.

-

 

+

 

-

fasta

-
-

yes

-
-

Write output in FASTA +

+

fasta

+
+

yes

+
+

Write output in FASTA format.

-

 

+

 

-

group

-
-

yes

-
-

Group similar sequences +

+

group

+
+

yes

+
+

Group similar sequences together in the output. This is the default. See also –stable.

-

 

+

 

-

html

-
-

no

-
-

Write output in HTML format +

+

html

+
+

no

+
+

Write output in HTML format (default is FASTA).

-

 

+

 

-

le

+
+

le

-

maybe

+
+

maybe

-

Use log-expectation profile score (VTML240). Alternatives - are to use –sp or –sv. This is the - default for amino acid sequences.

-

 

+
+

Use log-expectation profile score (VTML240). Alternatives + are to use –sp or –sv. This is the default for amino acid + sequences.

+

 

-

msf

-
-

no

-
-

Write output in MSF format (default is FASTA). Designed to +

+

msf

+
+

no

+
+

Write output in MSF format (default is FASTA). Designed to be compatible with the GCG package.

-

 

+

 

-

noanchors

-
-

no

-
-

Disable anchor optimization. Default is –anchors.

-

 

+
+

noanchors

+
+

no

+
+

Disable anchor optimization. Default is –anchors.

+

 

-

nocore

-
-

no in muscle,

-

yes in muscled.

-
-

Catch exceptions and give an error message if possible.

-

 

-

 

+
+

nocore

+
+

no in muscle,

+

yes in muscled.

+
+

Catch exceptions and give an error message if possible.

+

 

+

 

-

phyi

-
-

no

-
-

Write output in Phylip - interleaved format.

-

 

+
+

phyi

+
+

no

+
+

Write output in Phylip interleaved format.

+

 

-

phys

+
+

phys

-

no

+
+

no

-

Write output in Phylip - sequential format.

-

 

+
+

Write output in Phylip sequential format.

+

 

-

profile

-
-

no

-
-

Compute profile-profile alignment. Input alignments must +

+

profile

+
+

no

+
+

Compute profile-profile alignment. Input alignments must be given using –in1 and –in2 options.

-

 

+

 

-

quiet

+
+

quiet

-

no

+
+

no

-

Do not display progress messages.

-

 

+
+

Do not display progress messages.

+

 

-

refine

-
-

no

-
-

Input file is already aligned, skip first two iterations +

+

refine

+
+

no

+
+

Input file is already aligned, skip first two iterations and begin tree dependent refinement.

-

 

+

 

-

refinew

-
-

no

-
-

Refine an alignment by dividing it into non-overlapping +

+

refinew

+
+

no

+
+

Refine an alignment by dividing it into non-overlapping windows and re-aligning each window. Typically used for whole-genome nucleotide alignments.

-

 

+

 

-

sp

-
-

no

-
-

Use sum-of-pairs protein profile score (PAM200). Default - is –le.

-

 

+
+

sp

+
+

no

+
+

Use sum-of-pairs protein profile score (PAM200). Default + is –le.

+

 

-

spscore

-
-

no

-
-

Compute alignment score of profile-profile alignment. +

+

spscore

+
+

no

+
+

Compute alignment score of profile-profile alignment. Input alignments must be given using –in1 and –in2 options. These must be pre-aligned with gapped columns as needed, i.e. must be of the same length (have same number of columns).

-

 

+

 

-

spn

-
-

maybe

-

 

-
-

Use sum-of-pairs nucleotide profile score. This is the +

+

spn

+
+

maybe

+

 

+
+

Use sum-of-pairs nucleotide profile score. This is the only option for nucleotides, and is therefore the default. The substitution - scores and gap penalty scores are "borrowed" from BLASTZ.

-

 

+ scores and gap penalty scores are "borrowed" from BLASTZ.

+

 

-

stable

-
-

no

-
-

Preserve input order of sequences in output file. Default +

+

stable

+
+

no

+
+

Preserve input order of sequences in output file. Default is to group sequences by similarity (–group).

-

 

+

WARNING THIS OPTION WAS + BUGGY AND IS NOT SUPPORTED IN v3.8. Go to this link for more + information:  http://drive5.com/muscle/stable.html

+

 

-

sv

-
-

no

-
-

Use sum-of-pairs profile score (VTML240). Default is –le.

-

 

+
+

sv

+
+

no

+
+

Use sum-of-pairs profile score (VTML240). Default is –le.

+

 

-

termgaps4

-
-

yes

-
-

Use 4-way test for treatment of terminal gaps. (Cannot be +

+

termgaps4

+
+

yes

+
+

Use 4-way test for treatment of terminal gaps. (Cannot be disabled in this version).

-

 

+

 

-

termgapsfull

-
-

no

-
-

Terminal gaps penalized with full penalty.

-

[1] Not fully supported in this version.

-

 

+
+

termgapsfull

+
+

no

+
+

Terminal gaps penalized with full penalty.

+

[1] Not fully supported in this version.

+

 

-

termgapshalf

-
-

yes

-
-

Terminal gaps penalized with half penalty.

-

[1] Not fully supported in this version.

-

 

+
+

termgapshalf

+
+

yes

+
+

Terminal gaps penalized with half penalty.

+

[1] Not fully supported in this version.

+

 

-

termgapshalflonger

-
-

no

-
-

Terminal gaps penalized with half penalty if gap relative +

+

termgapshalflonger

+
+

no

+
+

Terminal gaps penalized with half penalty if gap relative to

-

longer sequence, otherwise with - full penalty.

-

[1] Not fully supported in this version.

-

 

+

longer sequence, otherwise with full penalty.

+

[1] Not fully supported in this version.

+

 

-

verbose

-
-

no

-
-

Write parameter settings and progress messages to log +

+

verbose

+
+

no

+
+

Write parameter settings and progress messages to log file.

-

 

+

 

-

version

+
+

version

-

no

+
+

no

-

Write version string to stdout - and exit.

+
+

Write version string to stdout and exit.

+ -

 

+

 

-

Notes

+

Notes

-

[1] Default depends on the profile scoring function. To +

[1] Default depends on the profile scoring function. To determine the default, use –verbose –log and check the log file.

-

 

+

 

- - + + + \ No newline at end of file