1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Sun Jul 18 09:49:47 1993
13 * moved to squid Thu Mar 3 08:42:57 1994
14 * RCS $Id: cluster.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $
16 * almost identical to bord.c, from fd
17 * also now contains routines for constructing difference matrices
20 * "branch ordering": Input a symmetric or upper-right-diagonal
21 * NxN difference matrix (usually constructed by pairwise alignment
22 * and similarity calculations for N sequences). Use the simple
23 * cluster analysis part of the Fitch/Margoliash tree-building algorithm
24 * (as described by Fitch and Margoliash 1967 as well as Feng
25 * and Doolittle 1987) to calculate the topology of an "evolutionary
26 * tree" consistent with the difference matrix. Returns an array
27 * which represents the tree.
29 * The input difference matrix is just an NxN matrix of floats.
30 * A good match is a small difference score (the algorithm is going
31 * to search for minima among the difference scores). The original difference
32 * matrix remains unchanged by the calculations.
34 * The output requires some explanation. A phylogenetic
35 * tree is a binary tree, with N "leaves" and N-1 "nodes". The
36 * topology of the tree may be completely described by N-1 structures
37 * containing two pointers; each pointer points to either a leaf
38 * or another node. Here, this is implemented with integer indices
39 * rather than pointers. An array of N-1 pairs of ints is returned.
40 * If the index is in the range (0..N-1), it is a "leaf" -- the
41 * number of one of the sequences. If the index is in the range
42 * (N..2N-2), it is another "node" -- (index-N) is the index
43 * of the node in the returned array.
45 * If both indices of a member of the returned array point to
46 * nodes, the tree is "compound": composed of more than one
47 * cluster of related sequences.
49 * The higher-numbered elements of the returned array were the
50 * first constructed, and hence represent the distal tips
51 * of the tree -- the most similar sequences. The root
53 ******************************************************************
58 * - copy the difference matrix (otherwise the caller's copy would
59 * get destroyed by the operations of this algorithm). If
60 * it's asymmetric, make it symmetric.
61 * - make a (0..N-1) array of ints to keep track of the indices in
62 * the difference matrix as they get swapped around. Initialize
63 * this matrix to 0..N-1.
64 * - make a (0..N-2) array of int[2] to store the results (the tree
65 * topology). Doesn't need to be initialized.
66 * - keep track of a "N'", the current size of the difference
67 * matrix being operated on.
69 * PROCESSING THE DIFFERENCE MATRIX:
70 * - for N' = N down to N' = 2 (N-1 steps):
71 * - in the half-diagonal N'xN' matrix, find the indices i,j at which
72 * there's the minimum difference score
75 * - at position N'-2 of the result array, store coords[i] and
78 * Move i,j rows, cols to the outside edges of the matrix:
79 * - swap row i and row N'-2
80 * - swap row j and row N'-1
81 * - swap column i and column N'-2
82 * - swap column j and column N'-1
83 * - swap indices i, N'-2 in the index array
84 * - swap indices j, N'-1 in the index array
86 * Build a average difference score for differences to i,j:
87 * - for all columns, find avg difference between rows i and j and store in row i:
88 * row[i][col] = (row[i][col] + row[j][col]) / 2.0
89 * - copy the contents of row i to column i (it's a symmetric
90 * matrix, no need to recalculate)
91 * - store an index N'+N-2 at position N'-2 of the index array: means
92 * that this row/column is now a node rather than a leaf, and
93 * contains minimum values
98 * GARBAGE COLLECTION & RETURN.
100 **********************************************************************
104 * Feng D-F and R.F. Doolittle. "Progressive sequence alignment as a
105 * prerequisite to correct phylogenetic trees." J. Mol. Evol.
108 * Fitch W.M. and Margoliash E. "Construction of phylogenetic trees."
109 * Science 155:279-284, 1967.
111 **********************************************************************
113 * SRE, 18 March 1992 (bord.c)
114 * SRE, Sun Jul 18 09:52:14 1993 (cluster.c)
115 * added to squid Thu Mar 3 09:13:56 1994
116 **********************************************************************
117 * Mon May 4 09:47:02 1992: keep track of difference scores at each node
129 #include "dbmalloc.h"
132 /* Function: Cluster()
134 * Purpose: Cluster analysis on a distance matrix. Constructs a
135 * phylogenetic tree which contains the topology
136 * and info for each node: branch lengths, how many
137 * sequences are included under the node, and which
138 * sequences are included under the node.
140 * Args: dmx - the NxN distance matrix ( >= 0.0, larger means more diverged)
141 * N - size of mx (number of sequences)
142 * mode - CLUSTER_MEAN, CLUSTER_MAX, or CLUSTER_MIN
143 * ret_tree- RETURN: the tree
145 * Return: 1 on success, 0 on failure.
146 * The caller is responsible for freeing the tree's memory,
147 * by calling FreePhylo(tree, N).
150 Cluster(float **dmx, int N, enum clust_strategy mode, struct phylo_s **ret_tree)
152 struct phylo_s *tree; /* (0..N-2) phylogenetic tree */
153 float **mx; /* copy of difference matrix */
154 int *coord; /* (0..N-1), indices for matrix coords */
155 int i, j; /* coords of minimum difference */
156 int idx; /* counter over seqs */
157 int Np; /* N', a working copy of N */
158 int row, col; /* loop variables */
159 float min; /* best minimum score found */
160 float *trow; /* tmp pointer for swapping rows */
161 float tcol; /* tmp storage for swapping cols */
162 float *diff; /* (0..N-2) difference scores at nodes */
163 int swapfoo; /* for SWAP() macro */
165 /**************************
167 **************************/
168 /* We destroy the matrix we work on, so make a copy of dmx.
170 mx = MallocOrDie (sizeof(float *) * N);
171 for (i = 0; i < N; i++)
173 mx[i] = MallocOrDie (sizeof(float) * N);
174 for (j = 0; j < N; j++)
175 mx[i][j] = dmx[i][j];
177 /* coord array alloc, (0..N-1) */
178 coord = MallocOrDie (N * sizeof(int));
179 diff = MallocOrDie ((N-1) * sizeof(float));
180 /* init the coord array to 0..N-1 */
181 for (col = 0; col < N; col++) coord[col] = col;
182 for (i = 0; i < N-1; i++) diff[i] = 0.0;
184 /* tree array alloc, (0..N-2) */
185 if ((tree = AllocPhylo(N)) == NULL) Die("AllocPhylo() failed");
187 /*********************************
188 * Process the difference matrix
189 *********************************/
191 /* N-prime, for an NxN down to a 2x2 diffmx */
192 j= 0; /* just to silence gcc uninit warnings */
193 for (Np = N; Np >= 2; Np--)
195 /* find a minimum on the N'xN' matrix*/
197 for (row = 0; row < Np; row++)
198 for (col = row+1; col < Np; col++)
199 if (mx[row][col] < min)
206 /* We're clustering row i with col j. write necessary
207 * data into a node on the tree
210 tree[Np-2].left = coord[i];
211 tree[Np-2].right = coord[j];
212 if (coord[i] >= N) tree[coord[i]-N].parent = N + Np - 2;
213 if (coord[j] >= N) tree[coord[j]-N].parent = N + Np - 2;
215 /* keep score info */
216 diff[Np-2] = tree[Np-2].diff = min;
218 /* way-simple branch length estimation */
219 tree[Np-2].lblen = tree[Np-2].rblen = min;
220 if (coord[i] >= N) tree[Np-2].lblen -= diff[coord[i]-N];
221 if (coord[j] >= N) tree[Np-2].rblen -= diff[coord[j]-N];
223 /* number seqs included at node */
226 tree[Np-2].incnum ++;
227 tree[Np-2].is_in[coord[i]] = 1;
231 tree[Np-2].incnum += tree[coord[i]-N].incnum;
232 for (idx = 0; idx < N; idx++)
233 tree[Np-2].is_in[idx] |= tree[coord[i]-N].is_in[idx];
238 tree[Np-2].incnum ++;
239 tree[Np-2].is_in[coord[j]] = 1;
243 tree[Np-2].incnum += tree[coord[j]-N].incnum;
244 for (idx = 0; idx < N; idx++)
245 tree[Np-2].is_in[idx] |= tree[coord[j]-N].is_in[idx];
249 /* Now build a new matrix, by merging row i with row j and
250 * column i with column j; see Fitch and Margoliash
252 /* Row and column swapping. */
253 /* watch out for swapping i, j away: */
254 if (i == Np-1 || j == Np-2)
259 /* swap row i, row N'-2 */
260 trow = mx[Np-2]; mx[Np-2] = mx[i]; mx[i] = trow;
261 /* swap col i, col N'-2 */
262 for (row = 0; row < Np; row++)
264 tcol = mx[row][Np-2];
265 mx[row][Np-2] = mx[row][i];
268 /* swap coord i, coord N'-2 */
269 SWAP(coord[i], coord[Np-2]);
274 /* swap row j, row N'-1 */
275 trow = mx[Np-1]; mx[Np-1] = mx[j]; mx[j] = trow;
276 /* swap col j, col N'-1 */
277 for (row = 0; row < Np; row++)
279 tcol = mx[row][Np-1];
280 mx[row][Np-1] = mx[row][j];
283 /* swap coord j, coord N'-1 */
284 SWAP(coord[j], coord[Np-1]);
287 /* average i and j together; they're now
288 at Np-2 and Np-1 though */
291 /* merge by saving avg of cols of row i and row j */
292 for (col = 0; col < Np; col++)
295 case CLUSTER_MEAN: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
296 case CLUSTER_MIN: mx[i][col] = MIN(mx[i][col], mx[j][col]); break;
297 case CLUSTER_MAX: mx[i][col] = MAX(mx[i][col], mx[j][col]); break;
298 default: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
301 /* copy those rows to columns */
302 for (col = 0; col < Np; col++)
303 mx[col][i] = mx[i][col];
304 /* store the node index in coords */
305 coord[Np-2] = Np+N-2;
308 /**************************
309 * Garbage collection and return
310 **************************/
311 Free2DArray((void **) mx, N);
318 /* Function: AllocPhylo()
320 * Purpose: Allocate space for a phylo_s array. N-1 structures
321 * are allocated, one for each node; in each node, a 0..N
322 * is_in flag array is also allocated and initialized to
325 * Args: N - size; number of sequences being clustered
327 * Return: pointer to the allocated array
333 struct phylo_s *tree;
336 if ((tree = (struct phylo_s *) malloc ((N-1) * sizeof(struct phylo_s))) == NULL)
339 for (i = 0; i < N-1; i++)
342 tree[i].lblen = tree[i].rblen = 0.0;
343 tree[i].left = tree[i].right = tree[i].parent = -1;
345 if ((tree[i].is_in = (char *) calloc (N, sizeof(char))) == NULL)
352 /* Function: FreePhylo()
354 * Purpose: Free a clustree array that was built to cluster N sequences.
356 * Args: tree - phylogenetic tree to free
357 * N - size of clustree; number of sequences it clustered
362 FreePhylo(struct phylo_s *tree, int N)
366 for (idx = 0; idx < N-1; idx++)
367 free(tree[idx].is_in);
372 /* Function: MakeDiffMx()
374 * Purpose: Given a set of aligned sequences, construct
375 * an NxN fractional difference matrix. (i.e. 1.0 is
376 * completely different, 0.0 is exactly identical).
378 * Args: aseqs - flushed, aligned sequences
379 * num - number of aseqs
380 * ret_dmx - RETURN: difference matrix
382 * Return: 1 on success, 0 on failure.
383 * Caller must free diff matrix with FMX2Free(dmx)
386 MakeDiffMx(char **aseqs, int num, float ***ret_dmx)
388 float **dmx; /* RETURN: distance matrix */
389 int i,j; /* counters over sequences */
391 /* Allocate 2D float matrix
393 dmx = FMX2Alloc(num, num);
395 /* Calculate distances; symmetric matrix
396 * record difference, not identity (1 - identity)
398 for (i = 0; i < num; i++)
399 for (j = i; j < num; j++)
400 dmx[i][j] = dmx[j][i] = 1.0 - PairwiseIdentity(aseqs[i], aseqs[j]);
406 /* Function: MakeIdentityMx()
408 * Purpose: Given a set of aligned sequences, construct
409 * an NxN fractional identity matrix. (i.e. 1.0 is
410 * completely identical, 0.0 is completely different).
411 * Virtually identical to MakeDiffMx(). It's
412 * less confusing to have two distinct functions, I find.
414 * Args: aseqs - flushed, aligned sequences
415 * num - number of aseqs
416 * ret_imx - RETURN: identity matrix (caller must free)
418 * Return: 1 on success, 0 on failure.
419 * Caller must free imx using FMX2Free(imx)
422 MakeIdentityMx(char **aseqs, int num, float ***ret_imx)
424 float **imx; /* RETURN: identity matrix */
425 int i,j; /* counters over sequences */
427 /* Allocate 2D float matrix
429 imx = FMX2Alloc(num, num);
431 /* Calculate distances, symmetric matrix
433 for (i = 0; i < num; i++)
434 for (j = i; j < num; j++)
435 imx[i][j] = imx[j][i] = PairwiseIdentity(aseqs[i], aseqs[j]);
443 /* Function: PrintNewHampshireTree()
445 * Purpose: Print out a tree in the "New Hampshire" standard
446 * format. See PHYLIP's draw.doc for a definition of
447 * the New Hampshire format.
449 * Like a CFG, we generate the format string left to
450 * right by a preorder tree traversal.
452 * Args: fp - file to print to
453 * ainfo- alignment info, including sequence names
454 * tree - tree to print
455 * N - number of leaves
459 PrintNewHampshireTree(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N)
461 struct intstack_s *stack;
466 blen = (float *) MallocOrDie (sizeof(float) * (2*N-1));
467 stack = InitIntStack();
468 PushIntStack(stack, N); /* push root on stack */
472 * 0..N-1 = leaves; indexes of sequences.
473 * N..2N-2 = interior nodes; node-N = index of node in tree structure.
474 * code N is the root.
475 * 2N..3N-2 = special flags for closing interior nodes; node-2N = index in tree
477 while (PopIntStack(stack, &code))
479 if (code < N) /* we're a leaf. */
481 /* 1) print name:branchlength */
482 if (docomma) fputs(",", fp);
483 fprintf(fp, "%s:%.5f", ainfo->sqinfo[code].name, blen[code]);
487 else if (code < 2*N) /* we're an interior node */
490 if (docomma) fputs(",\n", fp);
492 /* 2) push on stack: ), rchild, lchild */
493 PushIntStack(stack, code+N);
494 PushIntStack(stack, tree[code-N].right);
495 PushIntStack(stack, tree[code-N].left);
496 /* 3) record branch lengths */
497 blen[tree[code-N].right] = tree[code-N].rblen;
498 blen[tree[code-N].left] = tree[code-N].lblen;
502 else /* we're closing an interior node */
504 /* print a ):branchlength */
505 if (code == 2*N) fprintf(fp, ");\n");
506 else fprintf(fp, "):%.5f", blen[code-N]);
517 /* Function: PrintPhylo()
519 * Purpose: Debugging output of a phylogenetic tree structure.
522 PrintPhylo(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N)
526 for (idx = 0; idx < N-1; idx++)
528 fprintf(fp, "Interior node %d (code %d)\n", idx, idx+N);
529 fprintf(fp, "\tParent: %d (code %d)\n", tree[idx].parent-N, tree[idx].parent);
530 fprintf(fp, "\tLeft: %d (%s) %f\n",
531 tree[idx].left < N ? tree[idx].left-N : tree[idx].left,
532 tree[idx].left < N ? ainfo->sqinfo[tree[idx].left].name : "interior",
534 fprintf(fp, "\tRight: %d (%s) %f\n",
535 tree[idx].right < N ? tree[idx].right-N : tree[idx].right,
536 tree[idx].right < N ? ainfo->sqinfo[tree[idx].right].name : "interior",
538 fprintf(fp, "\tHeight: %f\n", tree[idx].diff);
539 fprintf(fp, "\tIncludes:%d seqs\n", tree[idx].incnum);