--- /dev/null
+/***\r
+Code for implementing HMMer's "BLOSUM weighting" algorithm.\r
+\r
+The algorithm was deduced by reverse-engineering the HMMer code.\r
+\r
+The HMMer documentation refers to BLOSUM weighting as "Henikoff\r
+simple filter weighting"\r
+\r
+The name BLOSUM implied to me that HMMer would be using a\r
+substitution probability matrix to compute distances, but this\r
+turned out not to be the case.\r
+\r
+It is notable, not to say puzzling, that the HMMer BLOSUM weighting\r
+algorithm is guaranteed to produce an integral NIC (number-of-independent-
+counts, also known as effective sequence count). Presumably Eddy must\r
+have known this, though he doesn't comment on it and he computes & stores\r
+the value in a float.\r
+\r
+Here's the algorithm:\r
+\r
+Distances between two sequences are based on the average of a simple \r
+binary equal (one) / not equal (zero) at each position. The only thing\r
+that has anything to do with BLOSUM in this calculation is an obscure\r
+(to me) constant value of 0.62. The sequences are clustered using this\r
+distance. If the pairwise identity (fraction of identical positions)\r
+is less than 0.62, they get assigned to disjoint clusters, the final\r
+number of disjoint clusters is the NIC. This makes some intuitive sense:\r
+I would interpret this by saying that if a set of sequences are close\r
+enough they count as one sequence. The weight for each sequence within a\r
+disjoint cluster is then determined to be 1 / (clustersize), from which it\r
+follows that the sum of all weights is equal to the number of disjoint\r
+clusters and is thus guaranteed to be an integer value. It is exactly this\r
+sum that HMMer uses for the NIC, by default.\r
+\r
+The individual BLOSUM sequence weights are not used for anything else in\r
+HMMer, unless you specify that BLOSUM weighting should override the default\r
+GSC weighting. GSC weighting uses a different clustering algorithm to\r
+determine relative weights. The BLOSUM NIC is then distributed over the\r
+GSC tree according to those relative weights.\r
+***/\r
+\r
+#include "muscle.h"\r
+#include "msa.h"\r
+#include "cluster.h"\r
+#include "distfunc.h"\r
+\r
+// Set weights of all sequences in the subtree under given node.\r
+void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const\r
+ {\r
+ if (0 == ptrNode)\r
+ return;\r
+\r
+ const ClusterNode *ptrRight = ptrNode->GetRight();\r
+ const ClusterNode *ptrLeft = ptrNode->GetLeft();\r
+\r
+// If leaf, set weight\r
+ if (0 == ptrRight && 0 == ptrLeft)\r
+ {\r
+ unsigned uIndex = ptrNode->GetIndex();\r
+ WEIGHT w = DoubleToWeight(dWeight);\r
+ m_Weights[uIndex] = w;\r
+ return;\r
+ }\r
+\r
+// Otherwise, recursively set subtrees\r
+ SetBLOSUMSubtreeWeight(ptrLeft, dWeight);\r
+ SetBLOSUMSubtreeWeight(ptrRight, dWeight);\r
+ }\r
+\r
+// Traverse a subtree looking for clusters where all\r
+// the leaves are sufficiently similar that they\r
+// should be weighted as a group, i.e. given a weight\r
+// of 1/N where N is the cluster size. The idea is\r
+// to avoid sample bias where we have closely related\r
+// sequences in the input alignment.\r
+// The weight at a node is the distance between\r
+// the two closest sequences in the left and right\r
+// subtrees under that node. "Sufficiently similar"\r
+// is defined as being where that minimum distance\r
+// is less than the dMinDist threshhold. I don't know\r
+// why the clustering is done using a minimum rather\r
+// than a maximum or average, either of which would\r
+// seem more natural to me.\r
+// Return value is number of groups under this node.\r
+// A "group" is the cluster found under a node with a\r
+// weight less than the minimum.\r
+unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const\r
+ {\r
+ if (0 == ptrNode)\r
+ return 0;\r
+\r
+ if (ptrNode->GetWeight() < dMinDist)\r
+ {\r
+ unsigned uClusterSize = ptrNode->GetClusterSize(); \r
+ assert(uClusterSize > 0);\r
+ double dWeight = 1.0 / uClusterSize;\r
+ SetBLOSUMSubtreeWeight(ptrNode, dWeight);\r
+ return 1;\r
+ }\r
+\r
+ const ClusterNode *ptrLeft = ptrNode->GetLeft();\r
+ const ClusterNode *ptrRight = ptrNode->GetRight();\r
+\r
+ unsigned uLeftGroupCount = SetBLOSUMNodeWeight(ptrLeft, dMinDist);\r
+ unsigned uRightGroupCount = SetBLOSUMNodeWeight(ptrRight, dMinDist);\r
+\r
+ return uLeftGroupCount + uRightGroupCount;\r
+ }\r
+\r
+// Return value is the group count, i.e. the effective number\r
+// of distinctly different sequences.\r
+unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const\r
+ {\r
+// Build distance matrix\r
+ DistFunc DF;\r
+ unsigned uSeqCount = GetSeqCount();\r
+ DF.SetCount(uSeqCount);\r
+ for (unsigned i = 0; i < uSeqCount; ++i)\r
+ for (unsigned j = i+1; j < uSeqCount; ++j)\r
+ {\r
+ double dDist = GetPctIdentityPair(i, j);\r
+ assert(dDist >= 0.0 && dDist <= 1.0);\r
+ DF.SetDist(i, j, (float) (1.0 - dDist));\r
+ }\r
+\r
+// Cluster based on the distance function\r
+ BlosumCluster.Create(DF);\r
+\r
+// Return value is HMMer's "effective sequence count".\r
+ return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST);\r
+ }\r