--- /dev/null
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
+package jalview.analysis;
+
+import jalview.datamodel.Alignment;
+import jalview.datamodel.AlignmentI;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceI;
+import jalview.gui.JvOptionPane;
+import jalview.io.FastaFile;
+
+import java.util.Arrays;
+import java.util.Random;
+
+import org.testng.annotations.BeforeClass;
+
+/**
+ * Generates, and outputs in Fasta format, a random DNA alignment for given
+ * sequence length and count. Will regenerate the same alignment each time if
+ * the same random seed is used (so may be used for reproducible unit tests).
+ * Not guaranteed to reproduce the same results between versions, as the rules
+ * may get tweaked to produce more 'realistic' results.
+ *
+ * Arguments:
+ * <ul>
+ * <li>length (number of bases in each sequence)</li>
+ * <li>height (number of sequences)</li>
+ * <li>a whole number random seed</li>
+ * <li>percentage of gaps to include (0-100)</li>
+ * <li>percentage chance of variation of each position (0-100)</li>
+ * </ul>
+ *
+ * @author gmcarstairs
+ *
+ */
+public class AlignmentGenerator
+{
+
+ @BeforeClass(alwaysRun = true)
+ public void setUpJvOptionPane()
+ {
+ JvOptionPane.setInteractiveMode(false);
+ JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION);
+ }
+
+ private static final char GAP = '-';
+
+ private static final char ZERO = '0';
+
+ private static final char[] NUCS = "GTCA".toCharArray();
+
+ private static final char[] PEPS = "MILVFYWHKRDEQNTCGASNP".toCharArray();
+
+ private static char[] BASES;
+
+ private Random random;
+
+
+ /**
+ * Outputs a DNA 'alignment' where each position is a random choice from
+ * 'GTCA-'.
+ *
+ * @param args
+ */
+ public static void main(String[] args)
+ {
+ if (args.length != 6)
+ {
+ usage();
+ return;
+ }
+ BASES = args[0].toLowerCase().startsWith("n") ? NUCS : PEPS;
+ int width = Integer.parseInt(args[1]);
+ int height = Integer.parseInt(args[2]);
+ long randomSeed = Long.valueOf(args[3]);
+ int gapPercentage = Integer.valueOf(args[4]);
+ int changePercentage = Integer.valueOf(args[5]);
+ AlignmentI al = new AlignmentGenerator().generate(width, height,
+ randomSeed, gapPercentage, changePercentage);
+
+ System.out.println("; " + height + " sequences of " + width
+ + " bases with " + gapPercentage + "% gaps and "
+ + changePercentage + "% mutations (random seed = " + randomSeed
+ + ")");
+ System.out.println(new FastaFile().print(al.getSequencesArray(), true));
+ }
+
+ /**
+ * Print parameter help.
+ */
+ private static void usage()
+ {
+ System.out.println("Usage:");
+ System.out.println("arg0: n (for nucleotide) or p (for peptide)");
+ System.out.println("arg1: number of (non-gap) bases per sequence");
+ System.out.println("arg2: number sequences");
+ System.out
+ .println("arg3: an integer as random seed (same seed = same results)");
+ System.out.println("arg4: percentage of gaps to (randomly) generate");
+ System.out
+ .println("arg5: percentage of 'mutations' to (randomly) generate");
+ System.out.println("Example: AlignmentGenerator n 12 15 387 10 5");
+ System.out
+ .println("- 15 nucleotide sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387");
+
+ }
+
+ /**
+ * Default constructor
+ */
+ public AlignmentGenerator()
+ {
+
+ }
+
+ /**
+ * Outputs a DNA 'alignment' of given width and height, where each position is
+ * a random choice from 'GTCA-'.
+ *
+ * @param width
+ * @param height
+ * @param randomSeed
+ * @param changePercentage
+ * @param gapPercentage
+ */
+ public AlignmentI generate(int width, int height, long randomSeed,
+ int gapPercentage, int changePercentage)
+ {
+ SequenceI[] seqs = new SequenceI[height];
+ random = new Random(randomSeed);
+ seqs[0] = generateSequence(1, width, gapPercentage);
+ for (int seqno = 1; seqno < height; seqno++)
+ {
+ seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(),
+ seqno + 1, width, changePercentage);
+ }
+ AlignmentI al = new Alignment(seqs);
+ return al;
+ }
+
+ /**
+ * Outputs a DNA 'sequence' of given length, with some random gaps included.
+ *
+ * @param seqno
+ * @param length
+ * @param gapPercentage
+ */
+ private SequenceI generateSequence(int seqno, int length,
+ int gapPercentage)
+ {
+ StringBuilder seq = new StringBuilder(length);
+
+ /*
+ * Loop till we've added 'length' bases (excluding gaps)
+ */
+ for (int count = 0; count < length;)
+ {
+ boolean addGap = random.nextInt(100) < gapPercentage;
+ char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE)
+ % BASES.length];
+ seq.append(c);
+ if (!addGap)
+ {
+ count++;
+ }
+ }
+ final String seqName = "SEQ" + seqno;
+ final String seqString = seq.toString();
+ SequenceI sq = new Sequence(seqName, seqString);
+ sq.createDatasetSequence();
+ return sq;
+ }
+
+ /**
+ * Generate a sequence approximately aligned to the first one.
+ *
+ * @param ds
+ * @param seqno
+ * @param width
+ * number of bases
+ * @param changePercentage
+ * @return
+ */
+ private SequenceI generateAnotherSequence(char[] ds, int seqno,
+ int width, int changePercentage)
+ {
+ int length = ds.length;
+ char[] seq = new char[length];
+ Arrays.fill(seq, ZERO);
+ int gapsWanted = length - width;
+ int gapsAdded = 0;
+
+ /*
+ * First 'randomly' mimic gaps in model sequence.
+ */
+ for (int pos = 0; pos < length; pos++)
+ {
+ if (ds[pos] == GAP)
+ {
+ /*
+ * Add a gap at the same position with changePercentage likelihood
+ */
+ seq[pos] = randomCharacter(GAP, changePercentage);
+ if (seq[pos] == GAP)
+ {
+ gapsAdded++;
+ }
+ }
+ }
+
+ /*
+ * Next scatter any remaining gaps (if any) at random. This gives an even
+ * distribution.
+ */
+ while (gapsAdded < gapsWanted)
+ {
+ boolean added = false;
+ while (!added)
+ {
+ int pos = random.nextInt(length);
+ if (seq[pos] != GAP)
+ {
+ seq[pos] = GAP;
+ added = true;
+ gapsAdded++;
+ }
+ }
+ }
+
+ /*
+ * Finally fill in the rest with randomly mutated bases.
+ */
+ for (int pos = 0; pos < length; pos++)
+ {
+ if (seq[pos] == ZERO)
+ {
+ char c = randomCharacter(ds[pos], changePercentage);
+ seq[pos] = c;
+ }
+ }
+ final String seqName = "SEQ" + seqno;
+ final String seqString = new String(seq);
+ SequenceI sq = new Sequence(seqName, seqString);
+ sq.createDatasetSequence();
+ return sq;
+ }
+
+ /**
+ * Returns a random character that is changePercentage% likely to match the
+ * given type (as base or gap).
+ *
+ * @param changePercentage
+ *
+ * @param c
+ * @return
+ */
+ private char randomCharacter(char c, int changePercentage)
+ {
+ final boolean mutation = random.nextInt(100) < changePercentage;
+
+ if (!mutation)
+ {
+ return c;
+ }
+
+ char newchar = c;
+ while (newchar == c)
+ {
+ newchar = BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length];
+ }
+ return newchar;
+ }
+}