15 #ifdef _MSC_VER // Miscrosoft compiler
\r
16 #pragma warning(disable : 4800) // int-bool conversion
\r
17 #pragma warning(disable : 4996) // deprecated names like strdup, isatty.
\r
20 #define MUSCLE_LONG_VERSION "MUSCLE v3.7 by Robert C. Edgar"
\r
21 #define MUSCLE_MAJOR_VERSION "3"
\r
22 #define MUSCLE_MINOR_VERSION "7"
\r
30 #define DOUBLE_AFFINE 0
\r
31 #define SINGLE_AFFINE 1
\r
35 #include "intmath.h"
\r
40 #define stricmp strcasecmp
\r
41 #define strnicmp strncasecmp
\r
42 #define _snprintf snprintf
\r
43 #define _fsopen(name, mode, share) fopen((name), (mode))
\r
48 #define assert(b) Call_MY_ASSERT(__FILE__, __LINE__, b, #b)
\r
49 void Call_MY_ASSERT(const char *file, int line, bool b, const char *msg);
\r
51 #define assert(exp) ((void)0)
\r
55 extern char **g_argv;
\r
57 #define Rotate(a, b, c) { SCORE *tmp = a; a = b; b = c; c = tmp; }
\r
59 const double VERY_LARGE_DOUBLE = 1e20;
\r
61 extern unsigned g_uTreeSplitNode1;
\r
62 extern unsigned g_uTreeSplitNode2;
\r
64 // Number of elements in array a[]
\r
65 #define countof(a) (sizeof(a)/sizeof(a[0]))
\r
67 // Maximum of two of any type
\r
68 #define Max2(a, b) ((a) > (b) ? (a) : (b))
\r
70 // Maximum of three of any type
\r
71 #define Max3(a, b, c) Max2(Max2(a, b), c)
\r
73 // Minimum of two of any type
\r
74 #define Min2(a, b) ((a) < (b) ? (a) : (b))
\r
76 // Maximum of four of any type
\r
77 #define Max4(a, b, c, d) Max2(Max2(a, b), Max2(c, d))
\r
79 const double VERY_NEGATIVE_DOUBLE = -9e29;
\r
80 const float VERY_NEGATIVE_FLOAT = (float) -9e29;
\r
82 const double BLOSUM_DIST = 0.62; // todo settable
\r
84 // insane value for uninitialized variables
\r
85 const unsigned uInsane = 8888888;
\r
86 const int iInsane = 8888888;
\r
87 const SCORE scoreInsane = 8888888;
\r
88 const char cInsane = (char) 0xcd; // int 3 instruction, used e.g. for unint. memory
\r
89 const double dInsane = VERY_NEGATIVE_DOUBLE;
\r
90 const float fInsane = VERY_NEGATIVE_FLOAT;
\r
91 const char INVALID_STATE = '*';
\r
92 const BASETYPE BTInsane = (BASETYPE) dInsane;
\r
93 const WEIGHT wInsane = BTInsane;
\r
95 extern double g_dNAN;
\r
97 extern unsigned long g_tStart;
\r
99 void Quit(const char szFormat[], ...);
\r
100 void Warning(const char szFormat[], ...);
\r
101 void TrimBlanks(char szStr[]);
\r
102 void TrimLeadingBlanks(char szStr[]);
\r
103 void TrimTrailingBlanks(char szStr[]);
\r
104 void Log(const char szFormat[], ...);
\r
106 const char *ScoreToStr(SCORE Score);
\r
107 const char *ScoreToStrL(SCORE Score);
\r
108 SCORE StrToScore(const char *pszStr);
\r
111 double VecSum(const double v[], unsigned n);
\r
112 bool IsValidInteger(const char *Str);
\r
113 bool IsValidSignedInteger(const char *Str);
\r
114 bool IsValidIdentifier(const char *Str);
\r
115 bool IsValidFloatChar(char c);
\r
116 bool isident(char c);
\r
117 bool isidentf(char c);
\r
119 void TreeFromSeqVect(const SeqVect &c, Tree &tree, CLUSTER Cluster,
\r
120 DISTANCE Distance, ROOT Root, const char *SaveFileName = 0);
\r
121 void TreeFromMSA(const MSA &msa, Tree &tree, CLUSTER Cluster,
\r
122 DISTANCE Distance, ROOT Root, const char *SaveFileName = 0);
\r
124 void StripGaps(char szStr[]);
\r
125 void StripWhitespace(char szStr[]);
\r
126 const char *GetTimeAsStr();
\r
127 unsigned CalcBLOSUMWeights(MSA &Aln, ClusterTree &BlosumCluster);
\r
128 void CalcGSCWeights(MSA &Aln, const ClusterTree &BlosumCluster);
\r
129 void AssertNormalized(const PROB p[]);
\r
130 void AssertNormalizedOrZero(const PROB p[]);
\r
131 void AssertNormalized(const double p[]);
\r
132 bool VectorIsZero(const double dValues[], unsigned n);
\r
133 void VectorSet(double dValues[], unsigned n, double d);
\r
134 bool VectorIsZero(const float dValues[], unsigned n);
\r
135 void VectorSet(float dValues[], unsigned n, float d);
\r
137 // @@TODO should be "not linux"
\r
139 double log2(double x); // Defined in <math.h> on Linux
\r
142 double pow2(double x);
\r
143 double lnTolog2(double ln);
\r
145 double lp2(double x);
\r
146 SCORE SumLog(SCORE x, SCORE y);
\r
147 SCORE SumLog(SCORE x, SCORE y, SCORE z);
\r
148 SCORE SumLog(SCORE w, SCORE x, SCORE y, SCORE z);
\r
150 double lp2Fast(double x);
\r
151 double SumLogFast(double x, double y);
\r
152 double SumLogFast(double x, double y, double z);
\r
153 double SumLogFast(double w, double x, double y, double z);
\r
155 void chkmem(const char szMsg[] = "");
\r
157 void Normalize(PROB p[], unsigned n);
\r
158 void Normalize(PROB p[], unsigned n, double dRequiredTotal);
\r
159 void NormalizeUnlessZero(PROB p[], unsigned n);
\r
161 void DebugPrintf(const char szFormat[], ...);
\r
162 void SetListFileName(const char *ptrListFileName, bool bAppend);
\r
163 void ModelFromAlign(const char *strInputFileName, const char *strModelFileName,
\r
165 double GetMemUseMB();
\r
166 double GetRAMSizeMB();
\r
167 double GetPeakMemUseMB();
\r
168 void CheckMemUse();
\r
169 const char *ElapsedTimeAsString();
\r
170 char *SecsToHHMMSS(long lSecs, char szStr[]);
\r
171 double GetCPUGHz();
\r
172 SCORE GetBlosum62(unsigned uLetterA, unsigned uLetterB);
\r
173 SCORE GetBlosum62d(unsigned uLetterA, unsigned uLetterB);
\r
174 SCORE GetBlosum50(unsigned uLetterA, unsigned uLetterB);
\r
175 void AssertNormalizedDist(const PROB p[], unsigned N);
\r
176 void CmdLineError(const char *Format, ...);
\r
177 void Fatal(const char *Format, ...);
\r
179 void ExecCommandLine(int argc, char *argv[]);
\r
182 void NameFromPath(const char szPath[], char szName[], unsigned uBytes);
\r
183 char *strsave(const char *s);
\r
184 void DistKmer20_3(const SeqVect &v, DistFunc &DF);
\r
185 void DistKbit20_3(const SeqVect &v, DistFunc &DF);
\r
186 void DistKmer6_6(const SeqVect &v, DistFunc &DF);
\r
187 void DistKmer4_6(const SeqVect &v, DistFunc &DF);
\r
188 void DistPWKimura(const SeqVect &v, DistFunc &DF);
\r
189 void FastDistKmer(const SeqVect &v, DistFunc &DF);
\r
190 void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF);
\r
191 double PctIdToMAFFTDist(double dPctId);
\r
192 double KimuraDist(double dPctId);
\r
193 void SetFastParams();
\r
194 void AssertProfsEq(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
195 unsigned uLengthB);
\r
196 void ValidateMuscleIds(const MSA &msa);
\r
197 void ValidateMuscleIds(const Tree &tree);
\r
198 void TraceBackToPath(int **TraceBack, unsigned uLengthA,
\r
199 unsigned uLengthB, PWPath &Path);
\r
200 void BitTraceBack(char **TraceBack, unsigned uLengthA, unsigned uLengthB,
\r
201 char LastEdge, PWPath &Path);
\r
202 SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
\r
203 bool bLockLeft = false, bool bLockRight = false);
\r
204 SCORE AlignTwoProfs(
\r
205 const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
\r
206 const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
\r
207 PWPath &Path, ProfPos **ptrPout, unsigned *ptruLengthOut);
\r
208 void AlignTwoProfsGivenPath(const PWPath &Path,
\r
209 const ProfPos *PA, unsigned uLengthA, WEIGHT wA,
\r
210 const ProfPos *PB, unsigned uLengthB, WEIGHT wB,
\r
211 ProfPos **ptrPOut, unsigned *ptruLengthOut);
\r
212 void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB,
\r
214 void AlignTwoMSAsGivenPath(const PWPath &Path, const MSA &msaA, const MSA &msaB,
\r
216 SCORE FastScorePath2(const ProfPos *PA, unsigned uLengthA,
\r
217 const ProfPos *PB, unsigned uLengthB, const PWPath &Path);
\r
218 SCORE GlobalAlignDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
219 unsigned uLengthB, PWPath &Path);
\r
220 SCORE GlobalAlignSimple(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
221 unsigned uLengthB, PWPath &Path);
\r
222 SCORE GlobalAlignSP(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
223 unsigned uLengthB, PWPath &Path);
\r
224 SCORE GlobalAlignSPN(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
225 unsigned uLengthB, PWPath &Path);
\r
226 SCORE GlobalAlignLE(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
227 unsigned uLengthB, PWPath &Path);
\r
228 void CalcThreeWayWeights(const Tree &tree, unsigned uNode1, unsigned uNode2,
\r
230 SCORE GlobalAlignSS(const Seq &seqA, const Seq &seqB, PWPath &Path);
\r
231 bool RefineHoriz(MSA &msaIn, const Tree &tree, unsigned uIters, bool bLockLeft, bool bLockRight);
\r
232 bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters);
\r
233 SCORE GlobalAlignNoDiags(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
234 unsigned uLengthB, PWPath &Path);
\r
236 void SetInputFileName(const char *pstrFileName);
\r
237 void SetIter(unsigned uIter);
\r
239 void SetMaxIters(unsigned uMaxIters);
\r
240 void Progress(unsigned uStep, unsigned uTotalSteps);
\r
241 void Progress(const char *szFormat, ...);
\r
242 void SetStartTime();
\r
243 void ProgressStepsDone();
\r
244 void SetProgressDesc(const char szDesc[]);
\r
245 void SetSeqStats(unsigned uSeqCount, unsigned uMaxL, unsigned uAvgL);
\r
247 void SetNewHandler();
\r
248 void SaveCurrentAlignment();
\r
249 void SetCurrentAlignment(MSA &msa);
\r
250 void SetOutputFileName(const char *out);
\r
253 void SetMuscleSeqVect(SeqVect &v);
\r
254 void SetMuscleInputMSA(MSA &msa);
\r
255 void ValidateMuscleIds(const MSA &msa);
\r
256 void ValidateMuscleIds(const Tree &tree);
\r
258 #define SetMuscleSeqVect(x) /* empty */
\r
259 #define SetMuscleInputMSA(x) /* empty */
\r
260 #define ValidateMuscleIds(x) /* empty */
\r
263 void ProcessArgVect(int argc, char *argv[]);
\r
264 void ProcessArgStr(const char *Str);
\r
268 void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]);
\r
269 unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]);
\r
270 FCOUNT SumCounts(const FCOUNT Counts[]);
\r
272 bool FlagOpt(const char *Name);
\r
273 const char *ValueOpt(const char *Name);
\r
277 void ProgAlignSubFams();
\r
280 void OnException();
\r
281 void SetSeqWeightMethod(SEQWEIGHT Method);
\r
282 SEQWEIGHT GetSeqWeightMethod();
\r
283 WEIGHT GetMuscleSeqWeightById(unsigned uId);
\r
284 void ListDiagSavings();
\r
285 void CheckMaxTime();
\r
286 const char *MaxSecsToStr();
\r
287 unsigned long GetStartTime();
\r
289 void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a);
\r
290 ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a);
\r
292 void CalcDistRangeKmer6_6(const MSA &msa, unsigned uRow, float Dist[]);
\r
293 void CalcDistRangeKmer20_3(const MSA &msa, unsigned uRow, float Dist[]);
\r
294 void CalcDistRangeKmer20_4(const MSA &msa, unsigned uRow, float Dist[]);
\r
295 void CalcDistRangePctIdKimura(const MSA &msa, unsigned uRow, float Dist[]);
\r
296 void CalcDistRangePctIdLog(const MSA &msa, unsigned uRow, float Dist[]);
\r
298 void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a);
\r
299 void MakeRootMSABrenner(SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a);
\r
305 void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage);
\r
307 char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel,
\r
308 bool DeleteGaps = true);
\r
309 SCORE SW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
310 unsigned uLengthB, PWPath &Path);
\r
311 void TraceBackSW(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
312 unsigned uLengthB, const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_,
\r
313 unsigned uPrefixLengthAMax, unsigned uPrefixLengthBMax, PWPath &Path);
\r
314 void DiffPaths(const PWPath &p1, const PWPath &p2, unsigned Edges1[],
\r
315 unsigned *ptruDiffCount1, unsigned Edges2[], unsigned *ptruDiffCount2);
\r
316 void SetPPScore(bool bRespectFlagOpts = true);
\r
317 void SetPPScore(PPSCORE p);
\r
318 SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB,
\r
319 unsigned uLengthB, PWPath &Path);
\r
320 bool MissingCommand();
\r
322 void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut);
\r
323 void MHackStart(SeqVect &v);
\r
324 void MHackEnd(MSA &msa);
\r
325 void WriteScoreFile(const MSA &msa);
\r
326 char ConsensusChar(const ProfPos &PP);
\r
327 void Stabilize(const MSA &msa, MSA &msaStable);
\r
328 void MuscleOutput(MSA &msa);
\r
329 PTR_SCOREMATRIX ReadMx(TextFile &File);
\r
330 void MemPlus(size_t Bytes, char *Where);
\r
331 void MemMinus(size_t Bytes, char *Where);
\r