/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* shuffle.c * * Routines for randomizing sequences. * * All routines are alphabet-independent (DNA, protein, RNA, whatever); * they assume that input strings are purely alphabetical [a-zA-Z], and * will return strings in all upper case [A-Z]. * * All return 1 on success, and 0 on failure; 0 status invariably * means the input string was not alphabetical. * * StrShuffle() - shuffled string, preserve mono-symbol composition. * StrDPShuffle() - shuffled string, preserve mono- and di-symbol composition. * * StrMarkov0() - random string, same zeroth order Markov properties. * StrMarkov1() - random string, same first order Markov properties. * * StrReverse() - simple reversal of string * StrRegionalShuffle() - mono-symbol shuffled string in regional windows * * There are also similar routines for shuffling alignments: * * AlignmentShuffle() - alignment version of StrShuffle(). * AlignmentBootstrap() - sample with replacement; a bootstrap dataset. * QRNAShuffle() - shuffle a pairwise alignment, preserving all gap positions. * * CVS $Id: shuffle.c,v 1.6 2002/10/09 14:26:09 eddy Exp) */ #include #include #include "squid.h" #include "sre_random.h" /* Function: StrShuffle() * * Purpose: Returns a shuffled version of s2, in s1. * (s1 and s2 can be identical, to shuffle in place.) * * Args: s1 - allocated space for shuffled string. * s2 - string to shuffle. * * Return: 1 on success. */ int StrShuffle(char *s1, char *s2) { int len; int pos; char c; if (s1 != s2) strcpy(s1, s2); for (len = strlen(s1); len > 1; len--) { pos = CHOOSE(len); c = s1[pos]; s1[pos] = s1[len-1]; s1[len-1] = c; } return 1; } /* Function: StrDPShuffle() * Date: SRE, Fri Oct 29 09:15:17 1999 [St. Louis] * * Purpose: Returns a shuffled version of s2, in s1. * (s1 and s2 may be identical; i.e. a string * may be shuffled in place.) The shuffle is a * "doublet-preserving" (DP) shuffle. Both * mono- and di-symbol composition are preserved. * * Done by searching for a random Eulerian * walk on a directed multigraph. * Reference: S.F. Altschul and B.W. Erickson, Mol. Biol. * Evol. 2:526-538, 1985. Quoted bits in my comments * are from Altschul's outline of the algorithm. * * Args: s1 - RETURN: the string after it's been shuffled * (space for s1 allocated by caller) * s2 - the string to be shuffled * * Returns: 0 if string can't be shuffled (it's not all [a-zA-z] * alphabetic. * 1 on success. */ int StrDPShuffle(char *s1, char *s2) { int len; int pos; /* a position in s1 or s2 */ int x,y; /* indices of two characters */ char **E; /* edge lists: E[0] is the edge list from vertex A */ int *nE; /* lengths of edge lists */ int *iE; /* positions in edge lists */ int n; /* tmp: remaining length of an edge list to be shuffled */ char sf; /* last character in s2 */ char Z[26]; /* connectivity in last edge graph Z */ int keep_connecting; /* flag used in Z connectivity algorithm */ int is_eulerian; /* flag used for when we've got a good Z */ /* First, verify that the string is entirely alphabetic. */ len = strlen(s2); for (pos = 0; pos < len; pos++) if (! isalpha(s2[pos])) return 0; /* "(1) Construct the doublet graph G and edge ordering E * corresponding to S." * * Note that these also imply the graph G; and note, * for any list x with nE[x] = 0, vertex x is not part * of G. */ E = MallocOrDie(sizeof(char *) * 26); nE = MallocOrDie(sizeof(int) * 26); for (x = 0; x < 26; x++) { E[x] = MallocOrDie(sizeof(char) * (len-1)); nE[x] = 0; } x = toupper(s2[0]) - 'A'; for (pos = 1; pos < len; pos++) { y = toupper(s2[pos]) - 'A'; E[x][nE[x]] = y; nE[x]++; x = y; } /* Now we have to find a random Eulerian edge ordering. */ sf = toupper(s2[len-1]) - 'A'; is_eulerian = 0; while (! is_eulerian) { /* "(2) For each vertex s in G except s_f, randomly select * one edge from the s edge list of E(S) to be the * last edge of the s list in a new edge ordering." * * select random edges and move them to the end of each * edge list. */ for (x = 0; x < 26; x++) { if (nE[x] == 0 || x == sf) continue; pos = CHOOSE(nE[x]); y = E[x][pos]; E[x][pos] = E[x][nE[x]-1]; E[x][nE[x]-1] = y; } /* "(3) From this last set of edges, construct the last-edge * graph Z and determine whether or not all of its * vertices are connected to s_f." * * a probably stupid algorithm for looking at the * connectivity in Z: iteratively sweep through the * edges in Z, and build up an array (confusing called Z[x]) * whose elements are 1 if x is connected to sf, else 0. */ for (x = 0; x < 26; x++) Z[x] = 0; Z[(int) sf] = keep_connecting = 1; while (keep_connecting) { keep_connecting = 0; for (x = 0; x < 26; x++) { y = E[x][nE[x]-1]; /* xy is an edge in Z */ if (Z[x] == 0 && Z[y] == 1) /* x is connected to sf in Z */ { Z[x] = 1; keep_connecting = 1; } } } /* if any vertex in Z is tagged with a 0, it's * not connected to sf, and we won't have a Eulerian * walk. */ is_eulerian = 1; for (x = 0; x < 26; x++) { if (nE[x] == 0 || x == sf) continue; if (Z[x] == 0) { is_eulerian = 0; break; } } /* "(4) If any vertex is not connected in Z to s_f, the * new edge ordering will not be Eulerian, so return to * (2). If all vertices are connected in Z to s_f, * the new edge ordering will be Eulerian, so * continue to (5)." * * e.g. note infinite loop while is_eulerian is FALSE. */ } /* "(5) For each vertex s in G, randomly permute the remaining * edges of the s edge list of E(S) to generate the s * edge list of the new edge ordering E(S')." * * Essentially a StrShuffle() on the remaining nE[x]-1 elements * of each edge list; unfortunately our edge lists are arrays, * not strings, so we can't just call out to StrShuffle(). */ for (x = 0; x < 26; x++) for (n = nE[x] - 1; n > 1; n--) { pos = CHOOSE(n); y = E[x][pos]; E[x][pos] = E[x][n-1]; E[x][n-1] = y; } /* "(6) Construct sequence S', a random DP permutation of * S, from E(S') as follows. Start at the s_1 edge list. * At each s_i edge list, add s_i to S', delete the * first edge s_i,s_j of the edge list, and move to * the s_j edge list. Continue this process until * all edge lists are exhausted." */ iE = MallocOrDie(sizeof(int) * 26); for (x = 0; x < 26; x++) iE[x] = 0; pos = 0; x = toupper(s2[0]) - 'A'; while (1) { s1[pos++] = 'A' + x; /* add s_i to S' */ y = E[x][iE[x]]; iE[x]++; /* "delete" s_i,s_j from edge list */ x = y; /* move to s_j edge list. */ if (iE[x] == nE[x]) break; /* the edge list is exhausted. */ } s1[pos++] = 'A' + sf; s1[pos] = '\0'; /* Reality checks. */ if (x != sf) Die("hey, you didn't end on s_f."); if (pos != len) Die("hey, pos (%d) != len (%d).", pos, len); /* Free and return. */ Free2DArray((void **) E, 26); free(nE); free(iE); return 1; } /* Function: StrMarkov0() * Date: SRE, Fri Oct 29 11:08:31 1999 [St. Louis] * * Purpose: Returns a random string s1 with the same * length and zero-th order Markov properties * as s2. * * s1 and s2 may be identical, to randomize s2 * in place. * * Args: s1 - allocated space for random string * s2 - string to base s1's properties on. * * Returns: 1 on success; 0 if s2 doesn't look alphabetical. */ int StrMarkov0(char *s1, char *s2) { int len; int pos; float p[26]; /* symbol probabilities */ /* First, verify that the string is entirely alphabetic. */ len = strlen(s2); for (pos = 0; pos < len; pos++) if (! isalpha(s2[pos])) return 0; /* Collect zeroth order counts and convert to frequencies. */ FSet(p, 26, 0.); for (pos = 0; pos < len; pos++) p[(int)(toupper(s2[pos]) - 'A')] += 1.0; FNorm(p, 26); /* Generate a random string using those p's. */ for (pos = 0; pos < len; pos++) s1[pos] = FChoose(p, 26) + 'A'; s1[pos] = '\0'; return 1; } /* Function: StrMarkov1() * Date: SRE, Fri Oct 29 11:22:20 1999 [St. Louis] * * Purpose: Returns a random string s1 with the same * length and first order Markov properties * as s2. * * s1 and s2 may be identical, to randomize s2 * in place. * * Args: s1 - allocated space for random string * s2 - string to base s1's properties on. * * Returns: 1 on success; 0 if s2 doesn't look alphabetical. */ int StrMarkov1(char *s1, char *s2) { int len; int pos; int x,y; int i; /* initial symbol */ float p[26][26]; /* symbol probabilities */ /* First, verify that the string is entirely alphabetic. */ len = strlen(s2); for (pos = 0; pos < len; pos++) if (! isalpha(s2[pos])) return 0; /* Collect first order counts and convert to frequencies. */ for (x = 0; x < 26; x++) FSet(p[x], 26, 0.); i = x = toupper(s2[0]) - 'A'; for (pos = 1; pos < len; pos++) { y = toupper(s2[pos]) - 'A'; p[x][y] += 1.0; x = y; } for (x = 0; x < 26; x++) FNorm(p[x], 26); /* Generate a random string using those p's. */ x = i; s1[0] = x + 'A'; for (pos = 1; pos < len; pos++) { y = FChoose(p[x], 26); s1[pos] = y + 'A'; x = y; } s1[pos] = '\0'; return 1; } /* Function: StrReverse() * Date: SRE, Thu Nov 20 10:54:52 1997 [St. Louis] * * Purpose: Returns a reversed version of s2, in s1. * (s1 and s2 can be identical, to reverse in place) * * Args: s1 - allocated space for reversed string. * s2 - string to reverse. * * Return: 1. */ int StrReverse(char *s1, char *s2) { int len; int pos; char c; len = strlen(s2); for (pos = 0; pos < len/2; pos++) { /* swap ends */ c = s2[len-pos-1]; s1[len-pos-1] = s2[pos]; s1[pos] = c; } if (len%2) { s1[pos] = s2[pos]; } /* copy middle residue in odd-len s2 */ s1[len] = '\0'; return 1; } /* Function: StrRegionalShuffle() * Date: SRE, Thu Nov 20 11:02:34 1997 [St. Louis] * * Purpose: Returns a regionally shuffled version of s2, in s1. * (s1 and s2 can be identical to regionally * shuffle in place.) See [Pearson88]. * * Args: s1 - allocated space for regionally shuffled string. * s2 - string to regionally shuffle * w - window size (typically 10 or 20) * * Return: 1. */ int StrRegionalShuffle(char *s1, char *s2, int w) { int len; char c; int pos; int i, j; if (s1 != s2) strcpy(s1, s2); len = strlen(s1); for (i = 0; i < len; i += w) for (j = MIN(len-1, i+w-1); j > i; j--) { pos = i + CHOOSE(j-i); c = s1[pos]; s1[pos] = s1[j]; s1[j] = c; } return 1; } /* Function: AlignmentShuffle() * Date: SRE, Sun Apr 22 18:37:15 2001 [St. Louis] * * Purpose: Returns a shuffled version of ali2, in ali1. * (ali1 and ali2 can be identical, to shuffle * in place.) The alignment columns are shuffled, * preserving % identity within the columns. * * Args: ali1 - allocated space for shuffled alignment * [0..nseq-1][0..alen-1] * ali2 - alignment to be shuffled * nseq - number of sequences in the alignment * alen - length of alignment, in columns. * * Returns: int */ int AlignmentShuffle(char **ali1, char **ali2, int nseq, int alen) { int i; int pos; char c; if (ali1 != ali2) { for (i = 0; i < nseq; i++) strcpy(ali1[i], ali2[i]); } for (i = 0; i < nseq; i++) ali1[i][alen] = '\0'; for (; alen > 1; alen--) { pos = CHOOSE(alen); for (i = 0; i < nseq; i++) { c = ali1[i][pos]; ali1[i][pos] = ali1[i][alen-1]; ali1[i][alen-1] = c; } } return 1; } /* Function: AlignmentBootstrap() * Date: SRE, Sun Apr 22 18:49:14 2001 [St. Louis] * * Purpose: Returns a bootstrapped alignment sample in ali1, * constructed from ali2 by sampling columns with * replacement. * * Unlike the other shuffling routines, ali1 and * ali2 cannot be the same. ali2 is left unchanged. * ali1 must be a properly allocated space for an * alignment the same size as ali2. * * Args: ali1 - allocated space for bootstrapped alignment * [0..nseq-1][0..alen-1] * ali2 - alignment to be bootstrapped * nseq - number of sequences in the alignment * alen - length of alignment, in columns. * * Returns: 1 on success. */ int AlignmentBootstrap(char **ali1, char **ali2, int nseq, int alen) { int pos; int col; int i; for (pos = 0; pos < alen; pos++) { col = CHOOSE(alen); for (i = 0; i < nseq; i++) ali1[i][pos] = ali2[i][col]; } for (i = 0; i < nseq; i++) ali1[i][alen] = '\0'; return 1; } /* Function: QRNAShuffle() * Date: SRE, Mon Dec 10 10:14:12 2001 [St. Louis] * * Purpose: Shuffle a pairwise alignment x,y while preserving the * position of gaps; return the shuffled alignment in xs, * ys. * * Works by doing three separate * shuffles, of (1) columns with residues in both * x and y, (2) columns with residue in x and gap in y, * and (3) columns with gap in x and residue in y. * * xs,x and ys,y may be identical: that is, to shuffle * an alignment "in place", destroying the original * alignment, just call: * QRNAShuffle(x,y,x,y); * * Args: xs, ys: allocated space for shuffled pairwise ali of x,y [L+1] * x, y: pairwise alignment to be shuffled [0..L-1] * * Returns: 1 on success, 0 on failure. * The shuffled alignment is returned in xs, ys. */ int QRNAShuffle(char *xs, char *ys, char *x, char *y) { int L; int *xycol, *xcol, *ycol; int nxy, nx, ny; int i; int pos, c; char xsym, ysym; if (xs != x) strcpy(xs, x); if (ys != y) strcpy(ys, y); /* First, construct three arrays containing lists of the column positions * of the three types of columns. (If a column contains gaps in both x and y, * we've already simply copied it to the shuffled sequence.) */ L = strlen(x); xycol = MallocOrDie(sizeof(int) * L); xcol = MallocOrDie(sizeof(int) * L); ycol = MallocOrDie(sizeof(int) * L); nxy = nx = ny = 0; for (i = 0; i < L; i++) { if (isgap(x[i]) && isgap(y[i])) { continue; } else if (! isgap(x[i]) && ! isgap(y[i])) { xycol[nxy] = i; nxy++; } else if (isgap(x[i])) { ycol[ny] = i; ny++; } else if (isgap(y[i])) { xcol[nx] = i; nx++; } } /* Second, shuffle the sequences indirectly, via shuffling these arrays. * Yow, careful with those indices, and with order of the statements... */ for (; nxy > 1; nxy--) { pos = CHOOSE(nxy); xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos]; xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1]; xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = xycol[nxy-1]; } for (; nx > 1; nx--) { pos = CHOOSE(nx); xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos]; xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1]; xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c; } for (; ny > 1; ny--) { pos = CHOOSE(ny); xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos]; xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1]; xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c; } free(xycol); free(xcol); free(ycol); return 1; } #ifdef TESTDRIVER /* * cc -g -o testdriver -DTESTDRIVER -L. shuffle.c -lsquid -lm */ int main(int argc, char **argv) { char s1[100]; char s2[100]; sre_srandom(42); strcpy(s2, "GGGGGGGGGGCCCCCCCCCC"); /* strcpy(s2, "AGACATAAAGTTCCGTACTGCCGGGAT"); */ StrDPShuffle(s1, s2); printf("DPshuffle: %s\n", s1); StrMarkov0(s1,s2); printf("Markov 0 : %s\n", s1); StrMarkov1(s1,s2); printf("Markov 1 : %s\n", s1); strcpy(s1, "ACGTACGT--------ACGTACGT----ACGTACGT"); strcpy(s2, "ACGTACGTACGTACGT------------ACGTACGT"); QRNAShuffle(s1,s2,s1,s2); printf("QRNA : %s\n", s1); printf(" : %s\n", s2); return 0; } #endif