1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
13 * shuffle - generate shuffled sequences
14 * Mon Feb 26 16:56:08 1996
16 * CVS $Id: shuffle_main.c,v 1.1.1.1 2005/03/22 08:34:16 cmzmasek Exp $
24 char banner[] = "shuffle - generated shuffled (or otherwise randomized) sequence";
27 Usage: shuffle [-options] <seqfile>\n\
29 -h : help; print version and usage info\n\
30 -n <n> : make <n> samples per input seq (default 1)\n\
31 -t <n> : truncate/delete inputs to fixed length <n>\n\
33 Default: shuffle each input randomly, preserving mono-symbol composition.\n\
34 Other choices (exclusive; can't use more than one) :\n\
35 -d : shuffle but preserve both mono- and di-symbol composition\n\
36 -0 : generate with same 0th order Markov properties as each input\n\
37 -1 : generate with same 1st order Markov properties as each input\n\
38 -l : make iid sequences of same number and length as inputs\n\
39 -r : reverse inputs\n\
40 -w <n> : regionally shuffle inputs in window size <n>\n\
41 -i : make [-n] iid seqs of length [-t] of type [--dna|--amino];\n\
42 when -i is set, no <seqfile> argument is used\n\
46 --alignment : <seqfile> is an alignment; shuffle the columns\n\
47 --amino : synthesize protein sequences [default] (see -i, -l)\n\
48 --dna : synthesize DNA sequences (see -i, -l))\n\
49 --informat <s> : specify sequence file format <s>\n\
50 --nodesc : remove sequence description lines\n\
51 --seed <s> : set random number seed to <s>\n\
54 struct opt_s OPTIONS[] = {
55 { "-0", TRUE, sqdARG_NONE }, /* 0th order Markov */
56 { "-1", TRUE, sqdARG_NONE }, /* 1st order Markov */
57 { "-d", TRUE, sqdARG_NONE }, /* digram shuffle */
58 { "-h", TRUE, sqdARG_NONE }, /* help */
59 { "-i", TRUE, sqdARG_NONE }, /* make iid seq of set length */
60 { "-l", TRUE, sqdARG_NONE }, /* make iid seq of same length */
61 { "-n", TRUE, sqdARG_INT }, /* number of shuffles per input seq */
62 { "-r", TRUE, sqdARG_NONE }, /* reverse seq rather than shuffle */
63 { "-t", TRUE, sqdARG_INT }, /* truncation of inputs to fixed len */
64 { "-w", TRUE, sqdARG_INT }, /* do regional shuffling */
65 { "--alignment",FALSE, sqdARG_NONE }, /* input is alignment; shuff cols */
66 { "--amino", FALSE, sqdARG_NONE }, /* make iid protein seqs [default]*/
67 { "--dna", FALSE, sqdARG_NONE }, /* make iid DNA seqs */
68 { "--informat", FALSE, sqdARG_STRING }, /* remove desc lines */
69 { "--nodesc", FALSE, sqdARG_NONE }, /* remove desc lines */
70 { "--seed", FALSE, sqdARG_INT }, /* set the random number seed */
72 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
74 static void shuffle_alignment_file(char *afile, int fmt);
77 main(int argc, char **argv)
79 char *seqfile; /* name of sequence file */
80 SQFILE *dbfp; /* open sequence file */
81 int fmt; /* format of seqfile */
82 char *seq; /* sequence */
83 char sqname[32]; /* name of an iid sequence */
84 SQINFO sqinfo; /* additional sequence info */
85 char *shuff; /* shuffled sequence */
86 int num; /* number to generate */
87 int seed; /* random number generator seed */
89 int w; /* window size for regional shuffle (or 0) */
90 int truncation; /* fixed length for truncation option (or 0) */
91 int no_desc; /* TRUE to remove description lines */
92 enum { /* shuffling strategy */
93 DO_SHUFFLE, DO_DPSHUFFLE, DO_MARKOV0, DO_MARKOV1, DO_REVERSE, DO_REGIONAL,
94 DO_IID_SAMELEN, DO_IID_FIXEDLEN} strategy;
95 int do_dna; /* TRUE to make DNA iid seqs, not protein */
96 int do_alignment; /* TRUE to shuffle alignment columns */
98 char *optname; /* option name */
99 char *optarg; /* option argument (or NULL) */
100 int optind; /* index of next argv[] */
103 /***********************************************
105 ***********************************************/
107 fmt = SQFILE_UNKNOWN; /* autodetect file format by default */
109 seed = (int) time ((time_t *) NULL);
112 strategy = DO_SHUFFLE;
115 do_alignment = FALSE;
117 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
118 &optind, &optname, &optarg))
120 if (strcmp(optname, "-0") == 0) strategy = DO_MARKOV0;
121 else if (strcmp(optname, "-1") == 0) strategy = DO_MARKOV1;
122 else if (strcmp(optname, "-d") == 0) strategy = DO_DPSHUFFLE;
123 else if (strcmp(optname, "-n") == 0) num = atoi(optarg);
124 else if (strcmp(optname, "-w") == 0) {strategy = DO_REGIONAL; w = atoi(optarg); }
125 else if (strcmp(optname, "-i") == 0) strategy = DO_IID_FIXEDLEN;
126 else if (strcmp(optname, "-l") == 0) strategy = DO_IID_SAMELEN;
127 else if (strcmp(optname, "-r") == 0) strategy = DO_REVERSE;
128 else if (strcmp(optname, "-t") == 0) truncation = atoi(optarg);
130 else if (strcmp(optname, "--alignment")== 0) do_alignment = TRUE;
131 else if (strcmp(optname, "--amino") == 0) do_dna = FALSE;
132 else if (strcmp(optname, "--dna") == 0) do_dna = TRUE;
133 else if (strcmp(optname, "--nodesc") == 0) no_desc = TRUE;
134 else if (strcmp(optname, "--seed") == 0) seed = atoi(optarg);
135 else if (strcmp(optname, "--informat") == 0) {
136 fmt = String2SeqfileFormat(optarg);
137 if (fmt == SQFILE_UNKNOWN)
138 Die("unrecognized sequence file format \"%s\"", optarg);
140 else if (strcmp(optname, "-h") == 0) {
141 Banner(stdout, banner);
148 /*****************************************************************
149 * Special case, 1: IID sequence generation.
150 * -i option is special, because it synthesizes, rather than
151 * shuffles. Doesn't take a seqfile argument;
152 * requires -n, -t; and doesn't use the same code logic as the
153 * other shuffling strategies. Note that we misuse/overload the
154 * -t "truncation length" option to set our fixed length for
155 * generating iid sequence.
156 *****************************************************************/
158 if (strategy == DO_IID_FIXEDLEN) {
159 if (num == 0 || truncation == 0)
160 Die("-i (i.i.d. sequence generation) requires -n,-t to be set\n%s\n",
162 if (argc-optind != 0)
163 Die("-i (i.i.d. sequence generation) takes no seqfile argument\n%s\n",
166 for (i = 0; i < num; i++)
169 shuff = RandomSequence(DNA_ALPHABET, dnafq, 4, truncation);
171 shuff = RandomSequence(AMINO_ALPHABET, aafq, 20, truncation);
173 /* pedantic note: sqname has room for 31 char + \0, so
174 * there's room for 24 digits - a 32-bit integer can only run up
175 * to 10 digits, and a 64-bit integer to 20, so we don't worry
176 * about the following sprintf() overrunning its bounds.
178 sprintf(sqname, "randseq%d", i);
179 WriteSimpleFASTA(stdout, shuff, sqname, NULL);
185 /*****************************************************************
187 *****************************************************************/
189 if (argc - optind != 1)
190 Die("Incorrect number of command line arguments\n%s\n", usage);
191 seqfile = argv[optind];
192 if (num == 0) num = 1; /* set default shuffle number per sequence */
195 /*****************************************************************
196 * Special case, 2: Alignment shuffling
197 *****************************************************************/
200 shuffle_alignment_file(seqfile, fmt);
204 /*****************************************************************
205 * Main logic of the shuffling program:
206 * expect one seqfile argument
207 *****************************************************************/
209 if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
210 Die("Failed to open sequence file %s for reading", seqfile);
212 while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
214 shuff = (char *) MallocOrDie ((sqinfo.len + 1) * sizeof(char));
216 if (no_desc) strcpy(sqinfo.desc, "");
218 /* If we're truncating seq, do it now.
223 if (sqinfo.len < truncation) {
225 FreeSequence(seq, &sqinfo);
229 start = CHOOSE(sqinfo.len - truncation + 1);
230 strncpy(shuff, seq+start, truncation);
231 shuff[truncation] = '\0';
233 sqinfo.len = truncation;
236 for (i = 0; i < num; i++)
239 case DO_SHUFFLE: StrShuffle(shuff, seq); break;
240 case DO_DPSHUFFLE: StrDPShuffle(shuff, seq); break;
241 case DO_MARKOV0: StrMarkov0(shuff, seq); break;
242 case DO_MARKOV1: StrMarkov1(shuff, seq); break;
243 case DO_REVERSE: StrReverse(shuff, seq); break;
244 case DO_REGIONAL: StrRegionalShuffle(shuff, seq, w); break;
247 shuff = RandomSequence(AMINO_ALPHABET, aafq, 20, sqinfo.len);
249 default: Die("choked on a bad enum; tragic.");
252 WriteSeq(stdout, SQFILE_FASTA, shuff, &sqinfo);
255 if (shuff != NULL) free(shuff);
256 FreeSequence(seq, &sqinfo);
265 shuffle_alignment_file(char *afile, int fmt)
270 if ((afp = MSAFileOpen(afile, fmt, NULL)) == NULL)
271 Die("Alignment file %s could not be opened for reading", afile);
272 while ((msa = MSAFileRead(afp)) != NULL)
274 /* shuffle in place */
275 AlignmentShuffle(msa->aseq, msa->aseq, msa->nseq, msa->alen);
276 /* write in same format we read in */
277 MSAFileWrite(stdout, msa, afp->format, FALSE);