1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
13 * Routines for randomizing sequences.
15 * All routines are alphabet-independent (DNA, protein, RNA, whatever);
16 * they assume that input strings are purely alphabetical [a-zA-Z], and
17 * will return strings in all upper case [A-Z].
19 * All return 1 on success, and 0 on failure; 0 status invariably
20 * means the input string was not alphabetical.
22 * StrShuffle() - shuffled string, preserve mono-symbol composition.
23 * StrDPShuffle() - shuffled string, preserve mono- and di-symbol composition.
25 * StrMarkov0() - random string, same zeroth order Markov properties.
26 * StrMarkov1() - random string, same first order Markov properties.
28 * StrReverse() - simple reversal of string
29 * StrRegionalShuffle() - mono-symbol shuffled string in regional windows
31 * There are also similar routines for shuffling alignments:
33 * AlignmentShuffle() - alignment version of StrShuffle().
34 * AlignmentBootstrap() - sample with replacement; a bootstrap dataset.
36 * CVS $Id: shuffle.c,v 1.1.1.1 2005/03/22 08:34:24 cmzmasek Exp $
44 /* Function: StrShuffle()
46 * Purpose: Returns a shuffled version of s2, in s1.
47 * (s1 and s2 can be identical, to shuffle in place.)
49 * Args: s1 - allocated space for shuffled string.
50 * s2 - string to shuffle.
52 * Return: 1 on success.
55 StrShuffle(char *s1, char *s2)
61 if (s1 != s2) strcpy(s1, s2);
62 for (len = strlen(s1); len > 1; len--)
72 /* Function: StrDPShuffle()
73 * Date: SRE, Fri Oct 29 09:15:17 1999 [St. Louis]
75 * Purpose: Returns a shuffled version of s2, in s1.
76 * (s1 and s2 may be identical; i.e. a string
77 * may be shuffled in place.) The shuffle is a
78 * "doublet-preserving" (DP) shuffle. Both
79 * mono- and di-symbol composition are preserved.
81 * Done by searching for a random Eulerian
82 * walk on a directed multigraph.
83 * Reference: S.F. Altschul and B.W. Erickson, Mol. Biol.
84 * Evol. 2:526-538, 1985. Quoted bits in my comments
85 * are from Altschul's outline of the algorithm.
87 * Args: s1 - RETURN: the string after it's been shuffled
88 * (space for s1 allocated by caller)
89 * s2 - the string to be shuffled
91 * Returns: 0 if string can't be shuffled (it's not all [a-zA-z]
96 StrDPShuffle(char *s1, char *s2)
99 int pos; /* a position in s1 or s2 */
100 int x,y; /* indices of two characters */
101 char **E; /* edge lists: E[0] is the edge list from vertex A */
102 int *nE; /* lengths of edge lists */
103 int *iE; /* positions in edge lists */
104 int n; /* tmp: remaining length of an edge list to be shuffled */
105 char sf; /* last character in s2 */
106 char Z[26]; /* connectivity in last edge graph Z */
107 int keep_connecting; /* flag used in Z connectivity algorithm */
108 int is_eulerian; /* flag used for when we've got a good Z */
110 /* First, verify that the string is entirely alphabetic.
113 for (pos = 0; pos < len; pos++)
114 if (! isalpha(s2[pos])) return 0;
116 /* "(1) Construct the doublet graph G and edge ordering E
117 * corresponding to S."
119 * Note that these also imply the graph G; and note,
120 * for any list x with nE[x] = 0, vertex x is not part
123 E = MallocOrDie(sizeof(char *) * 26);
124 nE = MallocOrDie(sizeof(int) * 26);
125 for (x = 0; x < 26; x++)
127 E[x] = MallocOrDie(sizeof(char) * (len-1));
131 x = toupper(s2[0]) - 'A';
132 for (pos = 1; pos < len; pos++)
134 y = toupper(s2[pos]) - 'A';
140 /* Now we have to find a random Eulerian edge ordering.
142 sf = toupper(s2[len-1]) - 'A';
144 while (! is_eulerian)
146 /* "(2) For each vertex s in G except s_f, randomly select
147 * one edge from the s edge list of E(S) to be the
148 * last edge of the s list in a new edge ordering."
150 * select random edges and move them to the end of each
153 for (x = 0; x < 26; x++)
155 if (nE[x] == 0 || x == sf) continue;
159 E[x][pos] = E[x][nE[x]-1];
163 /* "(3) From this last set of edges, construct the last-edge
164 * graph Z and determine whether or not all of its
165 * vertices are connected to s_f."
167 * a probably stupid algorithm for looking at the
168 * connectivity in Z: iteratively sweep through the
169 * edges in Z, and build up an array (confusing called Z[x])
170 * whose elements are 1 if x is connected to sf, else 0.
172 for (x = 0; x < 26; x++) Z[x] = 0;
173 Z[(int) sf] = keep_connecting = 1;
175 while (keep_connecting) {
177 for (x = 0; x < 26; x++)
179 y = E[x][nE[x]-1]; /* xy is an edge in Z */
180 if (Z[x] == 0 && Z[y] == 1) /* x is connected to sf in Z */
188 /* if any vertex in Z is tagged with a 0, it's
189 * not connected to sf, and we won't have a Eulerian
193 for (x = 0; x < 26; x++)
195 if (nE[x] == 0 || x == sf) continue;
202 /* "(4) If any vertex is not connected in Z to s_f, the
203 * new edge ordering will not be Eulerian, so return to
204 * (2). If all vertices are connected in Z to s_f,
205 * the new edge ordering will be Eulerian, so
208 * e.g. note infinite loop while is_eulerian is FALSE.
212 /* "(5) For each vertex s in G, randomly permute the remaining
213 * edges of the s edge list of E(S) to generate the s
214 * edge list of the new edge ordering E(S')."
216 * Essentially a StrShuffle() on the remaining nE[x]-1 elements
217 * of each edge list; unfortunately our edge lists are arrays,
218 * not strings, so we can't just call out to StrShuffle().
220 for (x = 0; x < 26; x++)
221 for (n = nE[x] - 1; n > 1; n--)
225 E[x][pos] = E[x][n-1];
229 /* "(6) Construct sequence S', a random DP permutation of
230 * S, from E(S') as follows. Start at the s_1 edge list.
231 * At each s_i edge list, add s_i to S', delete the
232 * first edge s_i,s_j of the edge list, and move to
233 * the s_j edge list. Continue this process until
234 * all edge lists are exhausted."
236 iE = MallocOrDie(sizeof(int) * 26);
237 for (x = 0; x < 26; x++) iE[x] = 0;
240 x = toupper(s2[0]) - 'A';
243 s1[pos++] = 'A' + x; /* add s_i to S' */
246 iE[x]++; /* "delete" s_i,s_j from edge list */
248 x = y; /* move to s_j edge list. */
251 break; /* the edge list is exhausted. */
253 s1[pos++] = 'A' + sf;
258 if (x != sf) Die("hey, you didn't end on s_f.");
259 if (pos != len) Die("hey, pos (%d) != len (%d).", pos, len);
263 Free2DArray((void **) E, 26);
270 /* Function: StrMarkov0()
271 * Date: SRE, Fri Oct 29 11:08:31 1999 [St. Louis]
273 * Purpose: Returns a random string s1 with the same
274 * length and zero-th order Markov properties
277 * s1 and s2 may be identical, to randomize s2
280 * Args: s1 - allocated space for random string
281 * s2 - string to base s1's properties on.
283 * Returns: 1 on success; 0 if s2 doesn't look alphabetical.
286 StrMarkov0(char *s1, char *s2)
290 float p[26]; /* symbol probabilities */
292 /* First, verify that the string is entirely alphabetic.
295 for (pos = 0; pos < len; pos++)
296 if (! isalpha(s2[pos])) return 0;
298 /* Collect zeroth order counts and convert to frequencies.
301 for (pos = 0; pos < len; pos++)
302 p[(int)(toupper(s2[pos]) - 'A')] += 1.0;
305 /* Generate a random string using those p's.
307 for (pos = 0; pos < len; pos++)
308 s1[pos] = FChoose(p, 26) + 'A';
315 /* Function: StrMarkov1()
316 * Date: SRE, Fri Oct 29 11:22:20 1999 [St. Louis]
318 * Purpose: Returns a random string s1 with the same
319 * length and first order Markov properties
322 * s1 and s2 may be identical, to randomize s2
325 * Args: s1 - allocated space for random string
326 * s2 - string to base s1's properties on.
328 * Returns: 1 on success; 0 if s2 doesn't look alphabetical.
331 StrMarkov1(char *s1, char *s2)
336 int i; /* initial symbol */
337 float p[26][26]; /* symbol probabilities */
339 /* First, verify that the string is entirely alphabetic.
342 for (pos = 0; pos < len; pos++)
343 if (! isalpha(s2[pos])) return 0;
345 /* Collect first order counts and convert to frequencies.
347 for (x = 0; x < 26; x++) FSet(p[x], 26, 0.);
349 i = x = toupper(s2[0]) - 'A';
350 for (pos = 1; pos < len; pos++)
352 y = toupper(s2[pos]) - 'A';
356 for (x = 0; x < 26; x++)
359 /* Generate a random string using those p's.
363 for (pos = 1; pos < len; pos++)
365 y = FChoose(p[x], 26);
376 /* Function: StrReverse()
377 * Date: SRE, Thu Nov 20 10:54:52 1997 [St. Louis]
379 * Purpose: Returns a reversed version of s2, in s1.
380 * (s1 and s2 can be identical, to reverse in place)
382 * Args: s1 - allocated space for reversed string.
383 * s2 - string to reverse.
388 StrReverse(char *s1, char *s2)
394 if (s1 != s2) strcpy(s1, s2);
396 for (pos = 0; pos < len/2; pos++)
399 s1[len-pos-1] = s1[pos];
405 /* Function: StrRegionalShuffle()
406 * Date: SRE, Thu Nov 20 11:02:34 1997 [St. Louis]
408 * Purpose: Returns a regionally shuffled version of s2, in s1.
409 * (s1 and s2 can be identical to regionally
410 * shuffle in place.) See [Pearson88].
412 * Args: s1 - allocated space for regionally shuffled string.
413 * s2 - string to regionally shuffle
414 * w - window size (typically 10 or 20)
419 StrRegionalShuffle(char *s1, char *s2, int w)
426 if (s1 != s2) strcpy(s1, s2);
429 for (i = 0; i < len; i += w)
430 for (j = MIN(len-1, i+w-1); j > i; j--)
432 pos = i + CHOOSE(j-i);
441 /* Function: AlignmentShuffle()
442 * Date: SRE, Sun Apr 22 18:37:15 2001 [St. Louis]
444 * Purpose: Returns a shuffled version of ali2, in ali1.
445 * (ali1 and ali2 can be identical, to shuffle
446 * in place.) The alignment columns are shuffled,
447 * preserving % identity within the columns.
449 * Args: ali1 - allocated space for shuffled alignment
450 * [0..nseq-1][0..alen-1]
451 * ali2 - alignment to be shuffled
452 * nseq - number of sequences in the alignment
453 * alen - length of alignment, in columns.
458 AlignmentShuffle(char **ali1, char **ali2, int nseq, int alen)
466 for (i = 0; i < nseq; i++) strcpy(ali1[i], ali2[i]);
469 for (i = 0; i < nseq; i++)
470 ali1[i][alen] = '\0';
472 for (; alen > 1; alen--)
475 for (i = 0; i < nseq; i++)
478 ali1[i][pos] = ali1[i][alen-1];
486 /* Function: AlignmentBootstrap()
487 * Date: SRE, Sun Apr 22 18:49:14 2001 [St. Louis]
489 * Purpose: Returns a bootstrapped alignment sample in ali1,
490 * constructed from ali2 by sampling columns with
493 * Unlike the other shuffling routines, ali1 and
494 * ali2 cannot be the same. ali2 is left unchanged.
495 * ali1 must be a properly allocated space for an
496 * alignment the same size as ali2.
498 * Args: ali1 - allocated space for bootstrapped alignment
499 * [0..nseq-1][0..alen-1]
500 * ali2 - alignment to be bootstrapped
501 * nseq - number of sequences in the alignment
502 * alen - length of alignment, in columns.
504 * Returns: 1 on success.
507 AlignmentBootstrap(char **ali1, char **ali2, int nseq, int alen)
513 for (pos = 0; pos < alen; pos++)
516 for (i = 0; i < nseq; i++)
517 ali1[i][pos] = ali2[i][col];
519 for (i = 0; i < nseq; i++)
520 ali1[i][alen] = '\0';
530 * cc -g -o testdriver -DTESTDRIVER -L. shuffle.c -lsquid -lm
533 main(int argc, char **argv)
539 strcpy(s2, "GGGGGGGGGGCCCCCCCCCC");
540 /* strcpy(s2, "AGACATAAAGTTCCGTACTGCCGGGAT");
542 StrDPShuffle(s1, s2);
543 printf("DPshuffle: %s\n", s1);
545 printf("Markov 0 : %s\n", s1);
547 printf("Markov 1 : %s\n", s1);