From cc795f8e69446b0ba752ed84c0aab340538c0fab Mon Sep 17 00:00:00 2001 From: gmungoc Date: Thu, 12 Jan 2017 17:10:00 +0000 Subject: [PATCH] JAL-1270 renamed AlignmentGenerator, and added peptide option --- test/jalview/analysis/AlignmentGenerator.java | 291 +++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 test/jalview/analysis/AlignmentGenerator.java diff --git a/test/jalview/analysis/AlignmentGenerator.java b/test/jalview/analysis/AlignmentGenerator.java new file mode 100644 index 0000000..0e3bbbb --- /dev/null +++ b/test/jalview/analysis/AlignmentGenerator.java @@ -0,0 +1,291 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.analysis; + +import jalview.datamodel.Alignment; +import jalview.datamodel.AlignmentI; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceI; +import jalview.gui.JvOptionPane; +import jalview.io.FastaFile; + +import java.util.Arrays; +import java.util.Random; + +import org.testng.annotations.BeforeClass; + +/** + * Generates, and outputs in Fasta format, a random DNA alignment for given + * sequence length and count. Will regenerate the same alignment each time if + * the same random seed is used (so may be used for reproducible unit tests). + * Not guaranteed to reproduce the same results between versions, as the rules + * may get tweaked to produce more 'realistic' results. + * + * Arguments: + * + * + * @author gmcarstairs + * + */ +public class AlignmentGenerator +{ + + @BeforeClass(alwaysRun = true) + public void setUpJvOptionPane() + { + JvOptionPane.setInteractiveMode(false); + JvOptionPane.setMockResponse(JvOptionPane.CANCEL_OPTION); + } + + private static final char GAP = '-'; + + private static final char ZERO = '0'; + + private static final char[] NUCS = "GTCA".toCharArray(); + + private static final char[] PEPS = "MILVFYWHKRDEQNTCGASNP".toCharArray(); + + private static char[] BASES; + + private Random random; + + + /** + * Outputs a DNA 'alignment' where each position is a random choice from + * 'GTCA-'. + * + * @param args + */ + public static void main(String[] args) + { + if (args.length != 6) + { + usage(); + return; + } + BASES = args[0].toLowerCase().startsWith("n") ? NUCS : PEPS; + int width = Integer.parseInt(args[1]); + int height = Integer.parseInt(args[2]); + long randomSeed = Long.valueOf(args[3]); + int gapPercentage = Integer.valueOf(args[4]); + int changePercentage = Integer.valueOf(args[5]); + AlignmentI al = new AlignmentGenerator().generate(width, height, + randomSeed, gapPercentage, changePercentage); + + System.out.println("; " + height + " sequences of " + width + + " bases with " + gapPercentage + "% gaps and " + + changePercentage + "% mutations (random seed = " + randomSeed + + ")"); + System.out.println(new FastaFile().print(al.getSequencesArray(), true)); + } + + /** + * Print parameter help. + */ + private static void usage() + { + System.out.println("Usage:"); + System.out.println("arg0: n (for nucleotide) or p (for peptide)"); + System.out.println("arg1: number of (non-gap) bases per sequence"); + System.out.println("arg2: number sequences"); + System.out + .println("arg3: an integer as random seed (same seed = same results)"); + System.out.println("arg4: percentage of gaps to (randomly) generate"); + System.out + .println("arg5: percentage of 'mutations' to (randomly) generate"); + System.out.println("Example: AlignmentGenerator n 12 15 387 10 5"); + System.out + .println("- 15 nucleotide sequences of 12 bases each, approx 10% gaps and 5% mutations, random seed = 387"); + + } + + /** + * Default constructor + */ + public AlignmentGenerator() + { + + } + + /** + * Outputs a DNA 'alignment' of given width and height, where each position is + * a random choice from 'GTCA-'. + * + * @param width + * @param height + * @param randomSeed + * @param changePercentage + * @param gapPercentage + */ + public AlignmentI generate(int width, int height, long randomSeed, + int gapPercentage, int changePercentage) + { + SequenceI[] seqs = new SequenceI[height]; + random = new Random(randomSeed); + seqs[0] = generateSequence(1, width, gapPercentage); + for (int seqno = 1; seqno < height; seqno++) + { + seqs[seqno] = generateAnotherSequence(seqs[0].getSequence(), + seqno + 1, width, changePercentage); + } + AlignmentI al = new Alignment(seqs); + return al; + } + + /** + * Outputs a DNA 'sequence' of given length, with some random gaps included. + * + * @param seqno + * @param length + * @param gapPercentage + */ + private SequenceI generateSequence(int seqno, int length, + int gapPercentage) + { + StringBuilder seq = new StringBuilder(length); + + /* + * Loop till we've added 'length' bases (excluding gaps) + */ + for (int count = 0; count < length;) + { + boolean addGap = random.nextInt(100) < gapPercentage; + char c = addGap ? GAP : BASES[random.nextInt(Integer.MAX_VALUE) + % BASES.length]; + seq.append(c); + if (!addGap) + { + count++; + } + } + final String seqName = "SEQ" + seqno; + final String seqString = seq.toString(); + SequenceI sq = new Sequence(seqName, seqString); + sq.createDatasetSequence(); + return sq; + } + + /** + * Generate a sequence approximately aligned to the first one. + * + * @param ds + * @param seqno + * @param width + * number of bases + * @param changePercentage + * @return + */ + private SequenceI generateAnotherSequence(char[] ds, int seqno, + int width, int changePercentage) + { + int length = ds.length; + char[] seq = new char[length]; + Arrays.fill(seq, ZERO); + int gapsWanted = length - width; + int gapsAdded = 0; + + /* + * First 'randomly' mimic gaps in model sequence. + */ + for (int pos = 0; pos < length; pos++) + { + if (ds[pos] == GAP) + { + /* + * Add a gap at the same position with changePercentage likelihood + */ + seq[pos] = randomCharacter(GAP, changePercentage); + if (seq[pos] == GAP) + { + gapsAdded++; + } + } + } + + /* + * Next scatter any remaining gaps (if any) at random. This gives an even + * distribution. + */ + while (gapsAdded < gapsWanted) + { + boolean added = false; + while (!added) + { + int pos = random.nextInt(length); + if (seq[pos] != GAP) + { + seq[pos] = GAP; + added = true; + gapsAdded++; + } + } + } + + /* + * Finally fill in the rest with randomly mutated bases. + */ + for (int pos = 0; pos < length; pos++) + { + if (seq[pos] == ZERO) + { + char c = randomCharacter(ds[pos], changePercentage); + seq[pos] = c; + } + } + final String seqName = "SEQ" + seqno; + final String seqString = new String(seq); + SequenceI sq = new Sequence(seqName, seqString); + sq.createDatasetSequence(); + return sq; + } + + /** + * Returns a random character that is changePercentage% likely to match the + * given type (as base or gap). + * + * @param changePercentage + * + * @param c + * @return + */ + private char randomCharacter(char c, int changePercentage) + { + final boolean mutation = random.nextInt(100) < changePercentage; + + if (!mutation) + { + return c; + } + + char newchar = c; + while (newchar == c) + { + newchar = BASES[random.nextInt(Integer.MAX_VALUE) % BASES.length]; + } + return newchar; + } +} -- 1.7.10.2