From: gmungoc Date: Fri, 23 Jan 2015 11:27:47 +0000 (+0000) Subject: Random DNA generator tarted up X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=6ca7520368a1e7fc774ec77d4f1dfc3ea940ead2;p=jalview.git Random DNA generator tarted up --- diff --git a/test/jalview/analysis/DnaAlignmentGenerator.java b/test/jalview/analysis/DnaAlignmentGenerator.java index 4215df7..1b8964f 100644 --- a/test/jalview/analysis/DnaAlignmentGenerator.java +++ b/test/jalview/analysis/DnaAlignmentGenerator.java @@ -6,19 +6,23 @@ import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import jalview.io.FastaFile; +import java.util.Arrays; import java.util.Random; /** * Generates, and outputs in Fasta format, a random DNA alignment for given * sequence length and count. Will regenerate the same alignment each time if * the same random seed is used (so may be used for reproducible unit tests). - * The alignment is not padded with gaps. + * Not guaranteed to reproduce the same results between versions, as the rules + * may get tweaked to produce more 'realistic' results. * * Arguments: * * * @author gmcarstairs @@ -27,31 +31,62 @@ import java.util.Random; public class DnaAlignmentGenerator { private static final char GAP = '-'; - + + private static final char ZERO = '0'; + private static final char[] BASES = new char[] - { 'G', 'T', 'C', 'A', GAP }; + { 'G', 'T', 'C', 'A' }; private Random random; /** - * Given args for sequence length and count, output a DNA 'alignment' where - * each position is a random choice from 'GTCA-'. + * Outputs a DNA 'alignment' where each position is a random choice from + * 'GTCA-'. * * @param args - * the width (base count) and height (sequence count) to generate - * plus an integer random seed value */ public static void main(String[] args) { + if (args.length != 5) + { + usage(); + return; + } int width = Integer.parseInt(args[0]); int height = Integer.parseInt(args[1]); long randomSeed = Long.valueOf(args[2]); + int gapPercentage = Integer.valueOf(args[3]); + int changePercentage = Integer.valueOf(args[4]); AlignmentI al = new DnaAlignmentGenerator().generate(width, height, - randomSeed); + randomSeed, gapPercentage, changePercentage); + + System.out.println("; " + height + " sequences of " + width + + " bases with " + gapPercentage + "% gaps and " + + changePercentage + "% mutations (random seed = " + randomSeed + + ")"); System.out.println(new FastaFile().print(al.getSequencesArray())); } /** + * Print parameter help. + */ + private static void usage() + { + System.out.println("Usage:"); + System.out.println("arg0: number of (non-gap) bases per sequence"); + System.out.println("arg1: number sequences"); + System.out + .println("arg2: an integer as random seed (same seed = same results)"); + System.out.println("arg3: percentage of gaps to (randomly) generate"); + System.out + .println("arg4: percentage of 'mutations' to (randomly) generate"); + System.out.println("Example: DnaAlignmentGenerator 12 15 387 10 5"); + System.out + .println("- 15 sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387"); + + } + + /** * Default constructor */ public DnaAlignmentGenerator() @@ -66,14 +101,19 @@ public class DnaAlignmentGenerator * @param width * @param height * @param randomSeed + * @param changePercentage + * @param gapPercentage */ - public AlignmentI generate(int width, int height, long randomSeed) + public AlignmentI generate(int width, int height, long randomSeed, + int gapPercentage, int changePercentage) { SequenceI[] seqs = new SequenceI[height]; random = new Random(randomSeed); - for (int seqno = 0; seqno < height; seqno++) + seqs[0] = generateSequence(1, width, gapPercentage); + for (int seqno = 1; seqno < height; seqno++) { - seqs[seqno] = generateSequence(seqno + 1, width); + seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(), + seqno + 1, width, changePercentage); } AlignmentI al = new Alignment(seqs); return al; @@ -84,26 +124,130 @@ public class DnaAlignmentGenerator * * @param seqno * @param length + * @param gapPercentage */ - private SequenceI generateSequence(int seqno, int length) + private SequenceI generateSequence(int seqno, int length, + int gapPercentage) { StringBuilder seq = new StringBuilder(length); /* * Loop till we've added 'length' bases (excluding gaps) */ - for (int count = 0 ; count < length ; ) { - char c = BASES[random.nextInt(Integer.MAX_VALUE) % 5]; + for (int count = 0; count < length;) + { + boolean addGap = random.nextInt(100) < gapPercentage; + char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE) % 4]; seq.append(c); - if (c != GAP) + if (!addGap) { count++; } } - final String seqName = ">SEQ" + seqno; + final String seqName = "SEQ" + seqno; final String seqString = seq.toString(); SequenceI sq = new Sequence(seqName, seqString); sq.createDatasetSequence(); return sq; } + + /** + * Generate a sequence approximately aligned to the first one. + * + * @param ds + * @param seqno + * @param width + * number of bases + * @param changePercentage + * @return + */ + private SequenceI generateAnotherSequence(char[] ds, int seqno, + int width, int changePercentage) + { + int length = ds.length; + char[] seq = new char[length]; + Arrays.fill(seq, ZERO); + int gapsWanted = length - width; + int gapsAdded = 0; + + /* + * First 'randomly' mimic gaps in model sequence. + */ + for (int pos = 0; pos < length; pos++) + { + if (ds[pos] == GAP) + { + /* + * Add a gap at the same position with changePercentage likelihood + */ + seq[pos] = randomCharacter(GAP, changePercentage); + if (seq[pos] == GAP) + { + gapsAdded++; + } + } + } + + /* + * Next scatter any remaining gaps (if any) at random. This gives an even + * distribution. + */ + while (gapsAdded < gapsWanted) + { + boolean added = false; + while (!added) + { + int pos = random.nextInt(length); + if (seq[pos] != GAP) + { + seq[pos] = GAP; + added = true; + gapsAdded++; + } + } + } + + /* + * Finally fill in the rest with randomly mutated bases. + */ + for (int pos = 0; pos < length; pos++) + { + if (seq[pos] == ZERO) + { + char c = randomCharacter(ds[pos], changePercentage); + seq[pos] = c; + } + } + final String seqName = "SEQ" + seqno; + final String seqString = new String(seq); + SequenceI sq = new Sequence(seqName, seqString); + sq.createDatasetSequence(); + return sq; + } + + /** + * Returns a random character that is changePercentage% likely to match the + * given type (as base or gap). + * + * @param changePercentage + * + * @param c + * @return + */ + private char randomCharacter(char c, int changePercentage) + { + final boolean mutation = random.nextInt(100) < changePercentage; + + if (!mutation) + { + return c; + } + + char newchar = c; + while (newchar == c) + { + newchar = BASES[random.nextInt(Integer.MAX_VALUE) % 4]; + } + return newchar; + } } diff --git a/test/jalview/analysis/DnaTest.java b/test/jalview/analysis/DnaTest.java index 59938ae..01ed183 100644 --- a/test/jalview/analysis/DnaTest.java +++ b/test/jalview/analysis/DnaTest.java @@ -264,7 +264,7 @@ public class DnaTest /* * Generate cDNA - 8 sequences of 12 bases each. */ - AlignmentI cdna = new DnaAlignmentGenerator().generate(12, 8, 97); + AlignmentI cdna = new DnaAlignmentGenerator().generate(12, 8, 97, 5, 5); ColumnSelection cs = new ColumnSelection(); AlignViewportI av = new AlignViewport(cdna, cs); Dna dna = new Dna(av, new int[]