2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
21 package jalview.analysis;
23 import java.util.Locale;
25 import jalview.datamodel.Alignment;
26 import jalview.datamodel.AlignmentI;
27 import jalview.datamodel.Sequence;
28 import jalview.datamodel.SequenceI;
29 import jalview.gui.JvOptionPane;
30 import jalview.io.FastaFile;
33 import java.io.FileNotFoundException;
34 import java.io.PrintStream;
35 import java.util.Arrays;
36 import java.util.Random;
38 import org.testng.annotations.BeforeClass;
41 * Generates, and outputs in Fasta format, a random peptide or nucleotide
42 * alignment for given sequence length and count. Will regenerate the same
43 * alignment each time if the same random seed is used (so may be used for
44 * reproducible unit tests). Not guaranteed to reproduce the same results
45 * between versions, as the rules may get tweaked to produce more 'realistic'
50 public class AlignmentGenerator
52 private static final char GAP = '-';
54 private static final char ZERO = '0';
56 private static final char[] NUCS = "GTCA".toCharArray();
58 private static final char[] PEPS = "MILVFYWHKRDEQNTCGASNP".toCharArray();
60 private static char[] BASES;
62 private Random random;
64 private PrintStream ps;
67 * Outputs a pseudo-randomly generated nucleotide or peptide alignment
70 * <li>n (for nucleotide) or p (for peptide)</li>
71 * <li>length (number of bases in each sequence)</li>
72 * <li>height (number of sequences)</li>
73 * <li>a whole number random seed</li>
74 * <li>percentage of gaps to include (0-100)</li>
75 * <li>percentage chance of variation of each position (0-100)</li>
76 * <li>(optional) path to a file to write the alignment to</li>
81 * @throws FileNotFoundException
83 public static void main(String[] args) throws FileNotFoundException
85 if (args.length != 6 && args.length != 7)
91 PrintStream ps = System.out;
94 ps = new PrintStream(new File(args[6]));
97 boolean nucleotide = args[0].toLowerCase(Locale.ROOT).startsWith("n");
98 int width = Integer.parseInt(args[1]);
99 int height = Integer.parseInt(args[2]);
100 long randomSeed = Long.valueOf(args[3]);
101 int gapPercentage = Integer.valueOf(args[4]);
102 int changePercentage = Integer.valueOf(args[5]);
104 ps.println("; " + height + " sequences of " + width + " bases with "
105 + gapPercentage + "% gaps and " + changePercentage
106 + "% mutations (random seed = " + randomSeed + ")");
108 new AlignmentGenerator(nucleotide, ps).generate(width, height,
109 randomSeed, gapPercentage, changePercentage);
111 if (ps != System.out)
118 * Prints parameter help
120 private static void usage()
122 System.out.println("Usage:");
123 System.out.println("arg0: n (for nucleotide) or p (for peptide)");
124 System.out.println("arg1: number of (non-gap) bases per sequence");
125 System.out.println("arg2: number of sequences");
127 "arg3: an integer as random seed (same seed = same results)");
128 System.out.println("arg4: percentage of gaps to (randomly) generate");
130 "arg5: percentage of 'mutations' to (randomly) generate");
132 "arg6: (optional) path to output file (default is sysout)");
133 System.out.println("Example: AlignmentGenerator n 12 15 387 10 5");
135 "- 15 nucleotide sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387");
140 * Constructor that sets nucleotide or peptide symbol set, and also writes the
141 * generated alignment to sysout
143 public AlignmentGenerator(boolean nuc)
145 this(nuc, System.out);
149 * Constructor that sets nucleotide or peptide symbol set, and also writes the
150 * generated alignment to the specified output stream (if not null). This can
151 * be used to write the alignment to a file or sysout.
153 public AlignmentGenerator(boolean nucleotide, PrintStream printStream)
155 BASES = nucleotide ? NUCS : PEPS;
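160 * Outputs an 'alignment' of given width and height. In the first sequence each position is a
161 * random choice from the symbol alphabet, or - for gap; each other sequence is generated from the first one, with changes governed by changePercentage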
166 * @param changePercentage
167 * @param gapPercentage
169 public AlignmentI generate(int width, int height, long randomSeed,
170 int gapPercentage, int changePercentage)
172 SequenceI[] seqs = new SequenceI[height];
173 random = new Random(randomSeed);
174 seqs[0] = generateSequence(1, width, gapPercentage);
175 for (int seqno = 1; seqno < height; seqno++)
177 seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(),
178 seqno + 1, width, changePercentage);
180 AlignmentI al = new Alignment(seqs);
184 ps.println(new FastaFile().print(al.getSequencesArray(), true));
191 * Generates a nucleotide or peptide 'sequence' (per the configured symbol set) with the given number of bases, with some random gaps included.
195 * @param gapPercentage
197 private SequenceI generateSequence(int seqno, int length,
200 StringBuilder seq = new StringBuilder(length);
203 * Loop till we've added 'length' bases (excluding gaps)
205 for (int count = 0; count < length;)
207 boolean addGap = random.nextInt(100) < gapPercentage;
208 char c = addGap ? GAP
209 : BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length];
216 final String seqName = "SEQ" + seqno;
217 final String seqString = seq.toString();
218 SequenceI sq = new Sequence(seqName, seqString);
219 sq.createDatasetSequence();
224 * Generate a sequence approximately aligned to the first one.
230 * @param changePercentage
233 private SequenceI generateAnotherSequence(char[] ds, int seqno, int width,
234 int changePercentage)
236 int length = ds.length;
237 char[] seq = new char[length];
238 Arrays.fill(seq, ZERO);
239 int gapsWanted = length - width;
243 * First 'randomly' mimic gaps in model sequence.
245 for (int pos = 0; pos < length; pos++)
250 * Add a gap at the same position with changePercentage likelihood
252 seq[pos] = randomCharacter(GAP, changePercentage);
261 * Next scatter any remaining gaps (if any) at random. This gives an even
264 while (gapsAdded < gapsWanted)
266 boolean added = false;
269 int pos = random.nextInt(length);
280 * Finally fill in the rest with randomly mutated bases.
282 for (int pos = 0; pos < length; pos++)
284 if (seq[pos] == ZERO)
286 char c = randomCharacter(ds[pos], changePercentage);
290 final String seqName = "SEQ" + seqno;
291 final String seqString = new String(seq);
292 SequenceI sq = new Sequence(seqName, seqString);
293 sq.createDatasetSequence();
298 * Returns a random character that is changePercentage% likely to match the
299 * given type (as base or gap).
301 * @param changePercentage
306 private char randomCharacter(char c, int changePercentage)
308 final boolean mutation = random.nextInt(100) < changePercentage;
318 newchar = BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length];