package jalview.analysis; import jalview.datamodel.Alignment; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import jalview.io.FastaFile; import java.util.Arrays; import java.util.Random; /** * Generates, and outputs in Fasta format, a random DNA alignment for given * sequence length and count. Will regenerate the same alignment each time if * the same random seed is used (so may be used for reproducible unit tests). * Not guaranteed to reproduce the same results between versions, as the rules * may get tweaked to produce more 'realistic' results. * * Arguments: * * * @author gmcarstairs * */ public class DnaAlignmentGenerator { private static final char GAP = '-'; private static final char ZERO = '0'; private static final char[] BASES = new char[] { 'G', 'T', 'C', 'A' }; private Random random; /** * Outputs a DNA 'alignment' where each position is a random choice from * 'GTCA-'. * * @param args */ public static void main(String[] args) { if (args.length != 5) { usage(); return; } int width = Integer.parseInt(args[0]); int height = Integer.parseInt(args[1]); long randomSeed = Long.valueOf(args[2]); int gapPercentage = Integer.valueOf(args[3]); int changePercentage = Integer.valueOf(args[4]); AlignmentI al = new DnaAlignmentGenerator().generate(width, height, randomSeed, gapPercentage, changePercentage); System.out.println("; " + height + " sequences of " + width + " bases with " + gapPercentage + "% gaps and " + changePercentage + "% mutations (random seed = " + randomSeed + ")"); System.out.println(new FastaFile().print(al.getSequencesArray())); } /** * Print parameter help. */ private static void usage() { System.out.println("Usage:"); System.out.println("arg0: number of (non-gap) bases per sequence"); System.out.println("arg1: number sequences"); System.out .println("arg2: an integer as random seed (same seed = same results)"); System.out.println("arg3: percentage of gaps to (randomly) generate"); System.out .println("arg4: percentage of 'mutations' to (randomly) generate"); System.out.println("Example: DnaAlignmentGenerator 12 15 387 10 5"); System.out .println("- 15 sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387"); } /** * Default constructor */ public DnaAlignmentGenerator() { } /** * Outputs a DNA 'alignment' of given width and height, where each position is * a random choice from 'GTCA-'. * * @param width * @param height * @param randomSeed * @param changePercentage * @param gapPercentage */ public AlignmentI generate(int width, int height, long randomSeed, int gapPercentage, int changePercentage) { SequenceI[] seqs = new SequenceI[height]; random = new Random(randomSeed); seqs[0] = generateSequence(1, width, gapPercentage); for (int seqno = 1; seqno < height; seqno++) { seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(), seqno + 1, width, changePercentage); } AlignmentI al = new Alignment(seqs); return al; } /** * Outputs a DNA 'sequence' of given length, with some random gaps included. * * @param seqno * @param length * @param gapPercentage */ private SequenceI generateSequence(int seqno, int length, int gapPercentage) { StringBuilder seq = new StringBuilder(length); /* * Loop till we've added 'length' bases (excluding gaps) */ for (int count = 0; count < length;) { boolean addGap = random.nextInt(100) < gapPercentage; char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE) % 4]; seq.append(c); if (!addGap) { count++; } } final String seqName = "SEQ" + seqno; final String seqString = seq.toString(); SequenceI sq = new Sequence(seqName, seqString); sq.createDatasetSequence(); return sq; } /** * Generate a sequence approximately aligned to the first one. * * @param ds * @param seqno * @param width * number of bases * @param changePercentage * @return */ private SequenceI generateAnotherSequence(char[] ds, int seqno, int width, int changePercentage) { int length = ds.length; char[] seq = new char[length]; Arrays.fill(seq, ZERO); int gapsWanted = length - width; int gapsAdded = 0; /* * First 'randomly' mimic gaps in model sequence. */ for (int pos = 0; pos < length; pos++) { if (ds[pos] == GAP) { /* * Add a gap at the same position with changePercentage likelihood */ seq[pos] = randomCharacter(GAP, changePercentage); if (seq[pos] == GAP) { gapsAdded++; } } } /* * Next scatter any remaining gaps (if any) at random. This gives an even * distribution. */ while (gapsAdded < gapsWanted) { boolean added = false; while (!added) { int pos = random.nextInt(length); if (seq[pos] != GAP) { seq[pos] = GAP; added = true; gapsAdded++; } } } /* * Finally fill in the rest with randomly mutated bases. */ for (int pos = 0; pos < length; pos++) { if (seq[pos] == ZERO) { char c = randomCharacter(ds[pos], changePercentage); seq[pos] = c; } } final String seqName = "SEQ" + seqno; final String seqString = new String(seq); SequenceI sq = new Sequence(seqName, seqString); sq.createDatasetSequence(); return sq; } /** * Returns a random character that is changePercentage% likely to match the * given type (as base or gap). * * @param changePercentage * * @param c * @return */ private char randomCharacter(char c, int changePercentage) { final boolean mutation = random.nextInt(100) < changePercentage; if (!mutation) { return c; } char newchar = c; while (newchar == c) { newchar = BASES[random.nextInt(Integer.MAX_VALUE) % 4]; } return newchar; } }