2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import jalview.datamodel.Alignment;
24 import jalview.datamodel.AlignmentI;
25 import jalview.datamodel.Sequence;
26 import jalview.datamodel.SequenceI;
27 import jalview.io.FastaFile;
29 import java.util.Arrays;
30 import java.util.Random;
33 * Generates, and outputs in Fasta format, a random DNA alignment for given
34 * sequence length and count. Will regenerate the same alignment each time if
35 * the same random seed is used (so may be used for reproducible unit tests).
36 * Not guaranteed to reproduce the same results between versions, as the rules
37 * may get tweaked to produce more 'realistic' results.
41 * <li>length (number of bases in each sequence)</li>
42 * <li>height (number of sequences)</li>
43 * <li>a whole number random seed</li>
44 * <li>percentage of gaps to include (0-100)</li>
45 * <li>percentage chance of variation of each position (0-100)</li>
51 public class DnaAlignmentGenerator
53 private static final char GAP = '-';
55 private static final char ZERO = '0';
57 private static final char[] BASES = new char[] { 'G', 'T', 'C', 'A' };
59 private Random random;
62 * Outputs a DNA 'alignment' where each position is a random choice from
67 public static void main(String[] args)
74 int width = Integer.parseInt(args[0]);
75 int height = Integer.parseInt(args[1]);
76 long randomSeed = Long.valueOf(args[2]);
77 int gapPercentage = Integer.valueOf(args[3]);
78 int changePercentage = Integer.valueOf(args[4]);
79 AlignmentI al = new DnaAlignmentGenerator().generate(width, height,
80 randomSeed, gapPercentage, changePercentage);
82 System.out.println("; " + height + " sequences of " + width
83 + " bases with " + gapPercentage + "% gaps and "
84 + changePercentage + "% mutations (random seed = " + randomSeed
86 System.out.println(new FastaFile().print(al.getSequencesArray()));
90 * Print parameter help.
92 private static void usage()
94 System.out.println("Usage:");
95 System.out.println("arg0: number of (non-gap) bases per sequence");
96 System.out.println("arg1: number sequences");
98 .println("arg2: an integer as random seed (same seed = same results)");
99 System.out.println("arg3: percentage of gaps to (randomly) generate");
101 .println("arg4: percentage of 'mutations' to (randomly) generate");
102 System.out.println("Example: DnaAlignmentGenerator 12 15 387 10 5");
104 .println("- 15 sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387");
109 * Default constructor
111 public DnaAlignmentGenerator()
117 * Outputs a DNA 'alignment' of given width and height, where each position is
118 * a random choice from 'GTCA-'.
123 * @param changePercentage
124 * @param gapPercentage
126 public AlignmentI generate(int width, int height, long randomSeed,
127 int gapPercentage, int changePercentage)
129 SequenceI[] seqs = new SequenceI[height];
130 random = new Random(randomSeed);
131 seqs[0] = generateSequence(1, width, gapPercentage);
132 for (int seqno = 1; seqno < height; seqno++)
134 seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(),
135 seqno + 1, width, changePercentage);
137 AlignmentI al = new Alignment(seqs);
142 * Outputs a DNA 'sequence' of given length, with some random gaps included.
146 * @param gapPercentage
148 private SequenceI generateSequence(int seqno, int length,
151 StringBuilder seq = new StringBuilder(length);
154 * Loop till we've added 'length' bases (excluding gaps)
156 for (int count = 0; count < length;)
158 boolean addGap = random.nextInt(100) < gapPercentage;
159 char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE) % 4];
166 final String seqName = "SEQ" + seqno;
167 final String seqString = seq.toString();
168 SequenceI sq = new Sequence(seqName, seqString);
169 sq.createDatasetSequence();
174 * Generate a sequence approximately aligned to the first one.
180 * @param changePercentage
183 private SequenceI generateAnotherSequence(char[] ds, int seqno,
184 int width, int changePercentage)
186 int length = ds.length;
187 char[] seq = new char[length];
188 Arrays.fill(seq, ZERO);
189 int gapsWanted = length - width;
193 * First 'randomly' mimic gaps in model sequence.
195 for (int pos = 0; pos < length; pos++)
200 * Add a gap at the same position with changePercentage likelihood
202 seq[pos] = randomCharacter(GAP, changePercentage);
211 * Next scatter any remaining gaps (if any) at random. This gives an even
214 while (gapsAdded < gapsWanted)
216 boolean added = false;
219 int pos = random.nextInt(length);
230 * Finally fill in the rest with randomly mutated bases.
232 for (int pos = 0; pos < length; pos++)
234 if (seq[pos] == ZERO)
236 char c = randomCharacter(ds[pos], changePercentage);
240 final String seqName = "SEQ" + seqno;
241 final String seqString = new String(seq);
242 SequenceI sq = new Sequence(seqName, seqString);
243 sq.createDatasetSequence();
248 * Returns a random character that is changePercentage% likely to match the
249 * given type (as base or gap).
251 * @param changePercentage
256 private char randomCharacter(char c, int changePercentage)
258 final boolean mutation = random.nextInt(100) < changePercentage;
268 newchar = BASES[random.nextInt(Integer.MAX_VALUE) % 4];