1 package jalview.analysis;
3 import jalview.datamodel.Alignment;
4 import jalview.datamodel.AlignmentI;
5 import jalview.datamodel.Sequence;
6 import jalview.datamodel.SequenceI;
7 import jalview.io.FastaFile;
9 import java.util.Arrays;
10 import java.util.Random;
13 * Generates, and outputs in Fasta format, a random DNA alignment for given
14 * sequence length and count. Will regenerate the same alignment each time if
15 * the same random seed is used (so may be used for reproducible unit tests).
16 * Not guaranteed to reproduce the same results between versions, as the rules
17 * may get tweaked to produce more 'realistic' results.
21 * <li>length (number of bases in each sequence)</li>
22 * <li>height (number of sequences)</li>
23 * <li>a whole number random seed</li>
24 * <li>percentage of gaps to include (0-100)</li>
25 * <li>percentage chance of variation of each position (0-100)</li>
31 public class DnaAlignmentGenerator
33 private static final char GAP = '-';
35 private static final char ZERO = '0';
37 private static final char[] BASES = new char[]
38 { 'G', 'T', 'C', 'A' };
40 private Random random;
43 * Outputs a DNA 'alignment' where each position is a random choice from
48 public static void main(String[] args)
55 int width = Integer.parseInt(args[0]);
56 int height = Integer.parseInt(args[1]);
57 long randomSeed = Long.valueOf(args[2]);
58 int gapPercentage = Integer.valueOf(args[3]);
59 int changePercentage = Integer.valueOf(args[4]);
60 AlignmentI al = new DnaAlignmentGenerator().generate(width, height,
61 randomSeed, gapPercentage, changePercentage);
63 System.out.println("; " + height + " sequences of " + width
64 + " bases with " + gapPercentage + "% gaps and "
65 + changePercentage + "% mutations (random seed = " + randomSeed
67 System.out.println(new FastaFile().print(al.getSequencesArray()));
71 * Print parameter help.
73 private static void usage()
75 System.out.println("Usage:");
76 System.out.println("arg0: number of (non-gap) bases per sequence");
77 System.out.println("arg1: number sequences");
79 .println("arg2: an integer as random seed (same seed = same results)");
80 System.out.println("arg3: percentage of gaps to (randomly) generate");
82 .println("arg4: percentage of 'mutations' to (randomly) generate");
83 System.out.println("Example: DnaAlignmentGenerator 12 15 387 10 5");
85 .println("- 15 sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387");
92 public DnaAlignmentGenerator()
98 * Returns a DNA 'alignment' of given width and height, where each position is
99 * a random choice from 'GTCA-'.
104 * @param changePercentage
105 * @param gapPercentage
107 public AlignmentI generate(int width, int height, long randomSeed,
108 int gapPercentage, int changePercentage)
110 SequenceI[] seqs = new SequenceI[height];
111 random = new Random(randomSeed);
112 seqs[0] = generateSequence(1, width, gapPercentage);
113 for (int seqno = 1; seqno < height; seqno++)
115 seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(),
116 seqno + 1, width, changePercentage);
118 AlignmentI al = new Alignment(seqs);
123 * Returns a DNA 'sequence' of given length, with some random gaps included.
127 * @param gapPercentage
129 private SequenceI generateSequence(int seqno, int length,
132 StringBuilder seq = new StringBuilder(length);
135 * Loop till we've added 'length' bases (excluding gaps)
137 for (int count = 0; count < length;)
139 boolean addGap = random.nextInt(100) < gapPercentage;
140 char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE) % 4];
147 final String seqName = "SEQ" + seqno;
148 final String seqString = seq.toString();
149 SequenceI sq = new Sequence(seqName, seqString);
150 sq.createDatasetSequence();
155 * Generate a sequence approximately aligned to the first one.
161 * @param changePercentage
164 private SequenceI generateAnotherSequence(char[] ds, int seqno,
165 int width, int changePercentage)
167 int length = ds.length;
168 char[] seq = new char[length];
169 Arrays.fill(seq, ZERO);
170 int gapsWanted = length - width;
174 * First 'randomly' mimic gaps in model sequence.
176 for (int pos = 0; pos < length; pos++)
181 * Keep the gap at the same position unless it is mutated (which happens
181 * with changePercentage likelihood)
183 seq[pos] = randomCharacter(GAP, changePercentage);
192 * Next scatter any remaining gaps (if any) at random. This gives an even
195 while (gapsAdded < gapsWanted)
197 boolean added = false;
200 int pos = random.nextInt(length);
211 * Finally fill in the rest with randomly mutated bases.
213 for (int pos = 0; pos < length; pos++)
215 if (seq[pos] == ZERO)
217 char c = randomCharacter(ds[pos], changePercentage);
221 final String seqName = "SEQ" + seqno;
222 final String seqString = new String(seq);
223 SequenceI sq = new Sequence(seqName, seqString);
224 sq.createDatasetSequence();
229 * Returns a character that, with changePercentage% likelihood, is a randomly
230 * chosen base (a 'mutation') rather than the given character (base or gap).
232 * @param changePercentage
237 private char randomCharacter(char c, int changePercentage)
239 final boolean mutation = random.nextInt(100) < changePercentage;
249 newchar = BASES[random.nextInt(Integer.MAX_VALUE) % 4];