From: gmungoc Date: Wed, 30 Sep 2015 15:56:55 +0000 (+0100) Subject: JAL-1499 initial tests working, can export to / import from textbox ok X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=2f72b0e35076b189cde387bbe66643c898f7f015 JAL-1499 initial tests working, can export to / import from textbox ok --- diff --git a/src/jalview/io/AlignFile.java b/src/jalview/io/AlignFile.java index 2c42de0..0d611fb 100755 --- a/src/jalview/io/AlignFile.java +++ b/src/jalview/io/AlignFile.java @@ -345,6 +345,17 @@ public abstract class AlignFile extends FileParse */ public abstract String print(); + /** + * Print out the given alignment in the file format represented by this class. + * Default action is just to print the formatted sequences, but this can be + * overridden to use additional properties of the alignment. + */ + public String print(AlignmentI al) + { + setSeqs(al.getSequencesArray()); + return print(); + } + public void addJVSuffix(boolean b) { jvSuffix = b; diff --git a/src/jalview/io/AppletFormatAdapter.java b/src/jalview/io/AppletFormatAdapter.java index d2d607b..4b7e26e 100755 --- a/src/jalview/io/AppletFormatAdapter.java +++ b/src/jalview/io/AppletFormatAdapter.java @@ -636,10 +636,12 @@ public class AppletFormatAdapter } else { + // MC this is pointless? Only performed if viewpanel.getAlignment() == + // alignment i.e. the same case as the if condition afile.setSeqs(viewpanel.getAlignment().getSequencesArray()); } - String afileresp = afile.print(); + String afileresp = afile.print(alignment); if (afile.hasWarningMessage()) { System.err.println("Warning raised when writing as " + format diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index 90693f1..238061a 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -48,6 +48,28 @@ import java.util.Set; */ public class MegaFile extends AlignFile { + private static final int DEFAULT_LINE_LENGTH = 60; + + private static final String INDENT = " "; + + private static final String N_SITES = "NSites"; + + private static final String N_SEQS = "NSeqs"; + + private static final String MISSING = "Missing"; + + private static final String IDENTICAL = "Identical"; + + private static final String INDEL = "Indel"; + + private static final String CODETABLE = "CodeTable"; + + private static final String PROTEIN = "Protein"; + + private static final String NUCLEOTIDE = "Nucleotide"; + + private static final String DATATYPE = "DataType"; + private static final char COMMENT_START = '['; private static final char COMMENT_END = ']'; @@ -62,7 +84,7 @@ public class MegaFile extends AlignFile private static final String MEGA_ID = HASHSIGN + "MEGA"; - private static final String TITLE = "TITLE"; + private static final String TITLE = "Title"; private static final String FORMAT = "Format"; @@ -72,8 +94,6 @@ public class MegaFile extends AlignFile private static final String DOMAIN = "Domain"; - private static final String INTERLEAVED = "Interleaved"; - /* * names of properties to save to the alignment (may affect eventual output * format) @@ -90,6 +110,11 @@ public class MegaFile extends AlignFile static final String PROP_MISSING = "MEGA_MISSING"; + static final String PROP_DATATYPE = "MEGA_DATATYPE"; + + // number of bases per line of file (value is inferred) + static final String PROP_LINELENGTH = "MEGA_LINELENGTH"; + // TODO: need a controlled name for Gene as a feature if we want to be able to // output the MEGA file with !Gene headers // WTF do we do if the sequences get realigned? @@ -99,7 +124,10 @@ public class MegaFile extends AlignFile private static final String SPACE = " "; - private static final int POSITIONS_PER_LINE = 50; + /* + * number of sequence positions output per line + */ + private int positionsPerLine; private String title; @@ -181,6 +209,9 @@ public class MegaFile extends AlignFile dataLine = nextNonCommentLine(); } + // remember the (longest) line length read in, so we can output the same + setAlignmentProperty(PROP_LINELENGTH, String.valueOf(positionsPerLine)); + setSequences(seqData); } @@ -435,6 +466,8 @@ public class MegaFile extends AlignFile * Add the current line of data to the sequence. */ sb.append(dataLine); + + setPositionsPerLine(Math.max(positionsPerLine, dataLine.length())); } /** @@ -489,7 +522,7 @@ public class MegaFile extends AlignFile /* * Do nothing if this line is _only_ a sequence id with no data following. * - * Remove any internal spaces (present in the 'fancy' file format) + * Remove any internal spaces */ if (data != null && data.length() > 0) { @@ -498,6 +531,7 @@ public class MegaFile extends AlignFile data = data.replace(SPACE, ""); } sb.append(data); + setPositionsPerLine(Math.max(positionsPerLine, data.length())); assertInterleaved(true, dataLine); } } @@ -554,7 +588,8 @@ public class MegaFile extends AlignFile if (isTitle(inputLine)) { - setAlignmentProperty(PROP_TITLE, getValue(inputLine)); + this.title = getValue(inputLine); + setAlignmentProperty(PROP_TITLE, title); } else if (inputLine.startsWith(BANG + DESCRIPTION)) { @@ -616,6 +651,10 @@ public class MegaFile extends AlignFile { inputLine = inputLine.substring(0, inputLine.length() - 1); } + if (inputLine.length() == 0) + { + return; + } String[] tokens = inputLine.trim().split("\\s"); // any whitespace for (String token : tokens) { @@ -656,7 +695,7 @@ public class MegaFile extends AlignFile /* * Jalview will work out whether nucleotide or not anyway */ - if (keyword.equalsIgnoreCase("DataType")) + if (keyword.equalsIgnoreCase(DATATYPE)) { if (value.equalsIgnoreCase("DNA") || value.equalsIgnoreCase("RNA") || value.equalsIgnoreCase("Nucleotide")) @@ -664,7 +703,7 @@ public class MegaFile extends AlignFile this.nucleotide = true; // alignment computes whether or not it is nucleotide when created } - else if (value.equalsIgnoreCase("Protein")) + else if (value.equalsIgnoreCase(PROTEIN)) { this.nucleotide = false; } @@ -672,13 +711,14 @@ public class MegaFile extends AlignFile { throw new FileFormatException(msg); } + setAlignmentProperty(PROP_DATATYPE, value); } /* * accept non-Standard code table but save in case we want to disable * 'translate as cDNA' */ - else if (keyword.equalsIgnoreCase("CodeTable")) + else if (keyword.equalsIgnoreCase(CODETABLE)) { setAlignmentProperty(PROP_CODETABLE, value); } @@ -686,23 +726,23 @@ public class MegaFile extends AlignFile /* * save gap char to set later on alignment once created */ - else if (keyword.equalsIgnoreCase("Indel")) + else if (keyword.equalsIgnoreCase(INDEL)) { this.gapCharacter = value.charAt(0); } - else if (keyword.equalsIgnoreCase("Identical") + else if (keyword.equalsIgnoreCase(IDENTICAL) || keyword.equalsIgnoreCase("MatchChar")) { + setAlignmentProperty(PROP_IDENTITY, value); if (!".".equals(value)) { - setAlignmentProperty(PROP_IDENTITY, value); System.err.println("Warning: " + token + " not supported, Jalview uses '.' for identity"); } } - else if (keyword.equalsIgnoreCase("Missing")) + else if (keyword.equalsIgnoreCase(MISSING)) { setAlignmentProperty(PROP_MISSING, value); System.err.println("Warning: " + token + " not supported"); @@ -715,8 +755,8 @@ public class MegaFile extends AlignFile setAlignmentProperty(PROP_MISSING, value); } - else if (!keyword.equalsIgnoreCase("NSeqs") - && !keyword.equalsIgnoreCase("NSites")) + else if (!keyword.equalsIgnoreCase(N_SEQS) + && !keyword.equalsIgnoreCase(N_SITES)) { System.err.println("Warning: " + msg); } @@ -781,7 +821,8 @@ public class MegaFile extends AlignFile return false; } String upper = inputLine.toUpperCase(); - return (upper.startsWith(TITLE) || upper.startsWith(BANG + TITLE)); + return (upper.startsWith(TITLE.toUpperCase()) || upper.startsWith(BANG + + TITLE.toUpperCase())); } /** @@ -799,7 +840,7 @@ public class MegaFile extends AlignFile { if (line.endsWith(SEMICOLON)) { - desc.append(line.substring(0, line.length() - 1)).append(newline); + desc.append(line.substring(0, line.length() - 1)); break; } else if (line.length() > 0) @@ -812,24 +853,21 @@ public class MegaFile extends AlignFile } /** - * Write out the alignment sequences in Mega format. + * Returns the alignment sequences in Mega format. */ @Override public String print() { - return print(getSeqsAsArray()); + return MEGA_ID + newline + print(getSeqsAsArray()); } /** * Write out the alignment sequences in Mega format - interleaved unless * explicitly noninterleaved. */ - public String print(SequenceI[] s) + protected String print(SequenceI[] s) { - // TODO: is there a way to preserve the 'interleaved' property so it can - // affect output? - - String result = null; + String result; if (this.interleaved != null && !this.interleaved) { result = printNonInterleaved(s); @@ -842,21 +880,8 @@ public class MegaFile extends AlignFile } /** - * Print the sequences in interleaved format, each row 15 space-separated - * triplets. - * - * @param s - * @return - */ - protected String printInterleavedCodons(SequenceI[] s) - { - // TODO not coded yet - defaulting to the 'simple' format output - return printInterleaved(s); - } - - /** - * Print to string in Interleaved format - blocks of next 50 characters of - * each sequence in turn. + * Print to string in Interleaved format - blocks of next N characters of each + * sequence in turn. * * @param s */ @@ -864,58 +889,119 @@ public class MegaFile extends AlignFile { int maxIdLength = getMaxIdLength(s); int maxSequenceLength = getMaxSequenceLength(s); - int numLines = maxSequenceLength / POSITIONS_PER_LINE + 3; // approx + int numLines = maxSequenceLength / positionsPerLine + 3; // approx /* * Size a buffer to hold the whole output */ StringBuilder sb = new StringBuilder(numLines - * (maxIdLength + 2 + POSITIONS_PER_LINE)); - printHeaders(sb); + * (maxIdLength + 2 + positionsPerLine)); + + int numDataBlocks = (maxSequenceLength - 1) / positionsPerLine + 1; + int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10; + int chunksPerLine = (positionsPerLine + spaceEvery - 1) / spaceEvery; - int numDataBlocks = (maxSequenceLength - 1) / POSITIONS_PER_LINE + 1; + /* + * Output as: #Seqid CGT AGC ACT ... or blocks of 10 for peptide + */ + int from = 0; for (int i = 0; i < numDataBlocks; i++) { sb.append(newline); + boolean first = true; + int advancedBy = 0; for (SequenceI seq : s) { - - String seqId = String.format("#%-" + maxIdLength + "s ", + int seqFrom = from; + String seqId = String.format("#%-" + maxIdLength + "s", seq.getName()); - char[] subSequence = seq.getSequence(i * POSITIONS_PER_LINE, - (i + 1) * POSITIONS_PER_LINE); + + /* + * output next line for this sequence + */ sb.append(seqId); - sb.append(subSequence); + int lastPos = seqFrom + positionsPerLine; // exclusive + for (int j = 0; j < chunksPerLine; j++) + { + char[] subSequence = seq.getSequence(seqFrom, + Math.min(lastPos, seqFrom + spaceEvery)); + if (subSequence.length > 0) + { + sb.append(SPACE).append(subSequence); + } + seqFrom += subSequence.length; + if (first) + { + // all sequences should be the same length in MEGA + advancedBy += subSequence.length; + } + } sb.append(newline); + first = false; } + from += advancedBy; } return new String(sb); } /** - * Append the MEGA header and any other known properties + * Outputs to string the MEGA header and any other known and relevant + * alignment properties * - * @param sb + * @param al */ - private void printHeaders(StringBuilder sb) + protected String printHeaders(AlignmentI al) { - sb.append(MEGA_ID); - sb.append(newline); + StringBuilder sb = new StringBuilder(128); + sb.append(MEGA_ID).append(newline); + printProperty(al, sb, PROP_TITLE, TITLE); + printProperty(al, sb, PROP_DESCRIPTION, DESCRIPTION); - String ttle = getAlignmentProperty(PROP_TITLE); - if (ttle != null) + /* + * !Format DataType CodeTable + */ + sb.append(BANG).append(FORMAT).append(newline); + String dataType = (String) al.getProperty(PROP_DATATYPE); + if (dataType == null) { - sb.append(BANG).append(TITLE).append(SPACE).append(ttle) - .append(SEMICOLON).append(newline); + dataType = al.isNucleotide() ? NUCLEOTIDE : PROTEIN; } + sb.append(INDENT).append(DATATYPE).append(EQUALS).append(dataType); + String codeTable = (String) al.getProperty(PROP_CODETABLE); + sb.append(SPACE).append(CODETABLE).append(EQUALS) + .append(codeTable == null ? "Standard" : codeTable) + .append(newline); + + /* + * !Format NSeqs NSites + * NSites the length of any sequence (they should all be the same), excluding + * gaps?!? + */ + sb.append(INDENT).append(N_SEQS).append(EQUALS).append(al.getHeight()); + SequenceI seq = al.getSequenceAt(0); + sb.append(SPACE).append(N_SITES).append(EQUALS) + .append(seq.getEnd() - seq.getStart() + 1); + sb.append(newline); - String desc = getAlignmentProperty(PROP_DESCRIPTION); - if (desc != null) + /* + * !Format Indel Identical Missing + */ + sb.append(INDENT); + sb.append(INDEL).append(EQUALS).append(al.getGapCharacter()); + String identity = (String) al.getProperty(PROP_IDENTITY); + if (identity != null) + { + sb.append(SPACE).append(IDENTICAL).append(EQUALS).append(identity); + } + String missing = (String) al.getProperty(PROP_MISSING); + if (missing != null) { - sb.append(BANG).append(DESCRIPTION).append(SPACE).append(desc) - .append(SEMICOLON).append(newline); + sb.append(SPACE).append(MISSING).append(EQUALS).append(missing); } + sb.append(SEMICOLON).append(newline); + + return sb.toString(); } /** @@ -971,26 +1057,43 @@ public class MegaFile extends AlignFile { int maxSequenceLength = getMaxSequenceLength(s); // approx - int numLines = maxSequenceLength / POSITIONS_PER_LINE + 2 + s.length; + int numLines = maxSequenceLength / positionsPerLine + 2 + s.length; /* * Roughly size a buffer to hold the whole output */ - StringBuilder sb = new StringBuilder(numLines * POSITIONS_PER_LINE); - printHeaders(sb); + StringBuilder sb = new StringBuilder(numLines * positionsPerLine); + int spaceEvery = this.nucleotide != null && this.nucleotide ? 3 : 10; + int chunksPerLine = positionsPerLine / spaceEvery; for (SequenceI seq : s) { sb.append(newline); sb.append(HASHSIGN + seq.getName()).append(newline); int startPos = 0; - while (startPos <= seq.getLength()) + while (startPos < seq.getLength()) { - char[] subSequence = seq.getSequence(startPos, startPos - + POSITIONS_PER_LINE); - sb.append(subSequence); + boolean firstChunk = true; + /* + * print next line for this sequence + */ + int lastPos = startPos + positionsPerLine; // exclusive + for (int j = 0; j < chunksPerLine; j++) + { + char[] subSequence = seq.getSequence(startPos, + Math.min(lastPos, startPos + positionsPerLine)); + if (subSequence.length > 0) + { + if (!firstChunk) + { + sb.append(SPACE); + } + sb.append(subSequence); + firstChunk = false; + } + startPos += subSequence.length; + } sb.append(newline); - startPos += POSITIONS_PER_LINE; } } @@ -1006,15 +1109,16 @@ public class MegaFile extends AlignFile * @throws IOException */ protected void assertInterleaved(boolean isIt, String dataLine) - throws IOException + throws FileFormatException { if (this.interleaved != null && isIt != this.interleaved.booleanValue()) { - throw new IOException( + throw new FileFormatException( "Parse error: mix of interleaved and noninterleaved detected, at line: " + dataLine); } this.interleaved = new Boolean(isIt); + setAlignmentProperty(PROP_INTERLEAVED, interleaved.toString()); } public boolean isInterleaved() @@ -1045,4 +1149,61 @@ public class MegaFile extends AlignFile + (nucleotide ? " not" : "")); } } + + /** + * Print the given alignment in MEGA format. If the alignment was created by + * parsing a MEGA file, it should have properties set (e.g. Title) which can + * influence the output. + */ + @Override + public String print(AlignmentI al) + { + this.nucleotide = al.isNucleotide(); + String lineLength = (String) al.getProperty(PROP_LINELENGTH); + this.positionsPerLine = lineLength == null ? DEFAULT_LINE_LENGTH : Integer + .parseInt(lineLength); + return printHeaders(al) + print(al.getSequencesArray()); + } + + /** + * Helper method to append a property e.g. !Title to the output buffer, if the + * property is set on the alignment. + * + * @param al + * @param headers + * @param propertyName + * @param propertyKeyword + */ + protected void printProperty(AlignmentI al, StringBuilder headers, + String propertyName, String propertyKeyword) + { + String propertyValue = (String) al.getProperty(propertyName); + if (propertyValue != null) + { + headers.append(BANG).append(propertyKeyword).append(SPACE) + .append(propertyValue).append(SEMICOLON) + .append(newline); + } + } + + /** + * Returns the number of sequence positions output per line + * + * @return + */ + public int getPositionsPerLine() + { + return positionsPerLine; + } + + /** + * Sets the number of sequence positions output per line. Note these will be + * formatted in blocks of 3 (nucleotide) or 10 (peptide). + * + * @param p + */ + public void setPositionsPerLine(int p) + { + this.positionsPerLine = p; + } } diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index 92a3c3c..5caa50e 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -6,6 +6,7 @@ import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertTrue; import static org.testng.AssertJUnit.fail; +import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; @@ -19,87 +20,70 @@ import org.testng.annotations.Test; */ public class MegaFileTest { - private static final String THIRTY_CHARS = "012345678901234567890123456789"; + private static final String TWENTY_CHARS = "9876543210abcdefghij"; + + private static final String THIRTY_CHARS = "0123456789klmnopqrstABCDEFGHIJ"; //@formatter:off private static final String INTERLEAVED = "#MEGA\n"+ "TITLE: Interleaved sequence data\n\n" + "#U455 ABCDEF\n" + - "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + - "#CPZANT WXYZ"; + "#CPZANT MNOPQR\n\n" + + "#U455 KLMNOP\n" + + "#CPZANT WXYZGC"; private static final String INTERLEAVED_NOHEADERS = "#U455 ABCDEF\n" - + "#CPZANT MNOPQR\n\n" + + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" - + "#CPZANT WXYZ\n"; + + "#CPZANT WXYZGC\n"; - // interleaved sequences, one with 60 one with 120 characters (on overlong - // input lines) - private static final String INTERLEAVED_LONGERTHAN50 = + // interleaved sequences, with 50 residues + private static final String INTERLEAVED_50RESIDUES = "#MEGA\n" - + "TITLE: Interleaved sequence data\n\n" - + "#U455 " + THIRTY_CHARS + THIRTY_CHARS + "\n" - + "#CPZANT " - + THIRTY_CHARS + THIRTY_CHARS + THIRTY_CHARS + THIRTY_CHARS; + + "!TITLE Interleaved sequence data\n\n" + + "#U455 " + THIRTY_CHARS + TWENTY_CHARS + "\n" + + "#CPZANT " + TWENTY_CHARS + THIRTY_CHARS + "\n"; private static final String NONINTERLEAVED = "#MEGA\n" - + "TITLE: Noninterleaved sequence data\n\n" + + "!TITLE Noninterleaved sequence data\n\n" + "#U455 \n" + "ABCFEDHIJ\n" + "MNOPQR\n\n" + "#CPZANT \n" + "KLMNOPWXYZ\n" + "CGATC\n"; - - // Sequence length 60 (split over two lines) - private static final String NONINTERLEAVED_LONGERTHAN50 = - "#SIXTY\n" + THIRTY_CHARS + "\n" + THIRTY_CHARS; - - // this one starts noninterleaved then switches to interleaved + + // this one starts interleaved then switches to non-interleaved private static final String MIXED = "#MEGA\n" - + "TITLE: This is a mess\n\n" + "#CPZANT KLMNOPWXYZCGATC\n\n" + + "!TITLE This is a mess\n\n" + + "#CPZANT KLMNOPWXYZCGATC\n\n" + "#U455\n " + "ABCFEDHIJ\n"; // interleaved with a new sequence appearing in the second block :-O private static final String INTERLEAVED_SEQUENCE_ERROR = "#MEGA" + "\n" - + "TITLE: Interleaved sequence data\n\n" + + "!TITLE Interleaved sequence data\n\n" + "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U456 KLMNOP\n"; // the 'fancy' format, different header format, bases in triplet groups - private static final String FANCY_FORMAT = + private static final String INTERLEAVED_WITH_DESCRIPTION = "#MEGA\n" - + "!Title Fancy format data;\n" - + "!Format DataType=DNA indel=- CodeTable=Standard;\n\n" + + "!Title Data with description;\n" + + "!Format DataType=DNA indel=- CodeTable=Standard Missing=? MatchChar=.;\n\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n\n" - + "!Gene=Adh Property=Coding CodonStart=1;\n" - + "#U455 ABC DEF\n" - + "#CPZANT MNO PQR\n\n" - + "#U455 KLM NOP\n" - + "#CPZANT WXY Z\n"; - - // interleaved sequence data for two genes - private static final String TWO_GENES = - "#MEGA\n" - + "!Title Fancy format data;\n" - + "!Format DataType=DNA indel=- CodeTable=Standard;\n\n" - + "!Description\n" - + " Line one of description\n" - + " Line two of description;\n\n" - + "!Gene=Adh Property=Coding CodonStart=1;\n" - + "#U455 ABC DEF\n" - + "#CPZANT MNO PQR\n\n" - + "#U455 KLM NOP\n" - + "#CPZANT WXY Z\n"; //TODO complete + + "#U455 CGC GTA\n" + + "#CPZANT ATC GGG\n\n" + + "#U455 CGA TTT\n" + + "#CPZANT CAA TGC\n"; //@formatter:on @@ -124,7 +108,7 @@ public class MegaFileTest // check sequence data assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) .getSequenceAsString()); - assertEquals("Second sequence data wrong", "MNOPQRWXYZ", seqs.get(1) + assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); @@ -246,8 +230,9 @@ public class MegaFileTest System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines - String expected = "#MEGA\n" + "!TITLE Interleaved sequence data;\n\n" - + "#U455 ABCDEFKLMNOP\n" + "#CPZANT MNOPQRWXYZ" + // nb don't get Title in output if not calling print(AlignmentI) + String expected = "#MEGA\n\n" + "#U455 ABCDEF\n" + + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + "#CPZANT WXYZGC" + "\n"; assertEquals("Print format wrong", expected, printed); } @@ -264,11 +249,9 @@ public class MegaFileTest AppletFormatAdapter.PASTE); String printed = testee.print(); System.out.println(printed); - // normally output should match input - // we cheated here with a number of short input lines - String expected = "#MEGA\n\n" + "#U455 ABCDEFKLMNOP" + "\n" - + "#CPZANT MNOPQRWXYZ\n"; - assertEquals("Print format wrong", expected, printed); + + assertEquals("Print format wrong", "#MEGA\n\n" + INTERLEAVED_NOHEADERS, + printed); } /** @@ -281,14 +264,14 @@ public class MegaFileTest { MegaFile testee = new MegaFile(NONINTERLEAVED, AppletFormatAdapter.PASTE); + assertEquals(10, testee.getPositionsPerLine()); String printed = testee.print(); System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines - String expected = "#MEGA\n" - + "!TITLE Noninterleaved sequence data;\n\n" - + "#U455\n" + "ABCFEDHIJMNOPQR\n\n" + "#CPZANT\n" - + "KLMNOPWXYZCGATC\n"; + String expected = "#MEGA\n\n" + + "#U455\n" + "ABCFEDHIJM\nNOPQR\n\n" + + "#CPZANT\n" + "KLMNOPWXYZ\nCGATC\n"; assertEquals("Print format wrong", expected, printed); } @@ -301,20 +284,26 @@ public class MegaFileTest @Test(groups = { "Functional" }) public void testPrint_interleavedMultiLine() throws IOException { - MegaFile testee = new MegaFile(INTERLEAVED_LONGERTHAN50, + MegaFile testee = new MegaFile(INTERLEAVED_50RESIDUES, AppletFormatAdapter.PASTE); + assertEquals(50, testee.getPositionsPerLine()); + /* + * now simulate choosing 20 residues per line on output + */ + testee.setPositionsPerLine(20); String printed = testee.print(); System.out.println(printed); - // first sequence is length 60, second length 120 - // should be output as 50 + 10 + 0 and as 50 + 50 + 20 character lines - // respectively - String expected = "#MEGA\n" + "!TITLE Interleaved sequence data;\n\n" - + "#U455 " + THIRTY_CHARS + "01234567890123456789\n" - + "#CPZANT " + THIRTY_CHARS + "01234567890123456789\n" + "\n" - + "#U455 " + "0123456789\n" + "#CPZANT " + THIRTY_CHARS - + "01234567890123456789\n\n" + "#U455 \n" + "#CPZANT " - + "01234567890123456789" - + "\n"; + //@formatter:off + //0123456789klmnopqrstABCDEFGHIJ9876543210abcdefghij + String expected = + "#MEGA\n\n" + + "#U455 0123456789 klmnopqrst\n" + // first 20 + "#CPZANT 9876543210 abcdefghij\n\n" + + "#U455 ABCDEFGHIJ 9876543210\n" + // next 20 + "#CPZANT 0123456789 klmnopqrst\n\n" + + "#U455 abcdefghij\n" + // last 10 + "#CPZANT ABCDEFGHIJ\n"; + //@formatter:on assertEquals("Print format wrong", expected, printed); } @@ -327,31 +316,33 @@ public class MegaFileTest @Test(groups = { "Functional" }) public void testPrint_noninterleavedMultiLine() throws IOException { + final String NONINTERLEAVED_LONGERTHAN50 = "#SIXTY\n" + THIRTY_CHARS + + "\n" + TWENTY_CHARS + "9993332221\n"; MegaFile testee = new MegaFile(NONINTERLEAVED_LONGERTHAN50, AppletFormatAdapter.PASTE); + assertEquals(30, testee.getPositionsPerLine()); + testee.setPositionsPerLine(25); String printed = testee.print(); - System.out.println(printed); // 60 character sequence should be output as 50 on first line then 10 more - String expected = "#MEGA\n\n" + "#SIXTY\n" + THIRTY_CHARS - + "01234567890123456789\n" + "0123456789\n"; + String expected = "#MEGA\n\n" + "#SIXTY\n" + + "0123456789klmnopqrstABCDE\n" + "FGHIJ9876543210abcdefghij\n" + + "9993332221\n"; assertEquals("Print format wrong", expected, printed); } /** - * Test paste / parse of 'fancy format' data. + * Test parse of data including description * * @throws IOException */ @Test(groups = { "Functional" }) - public void testParse_fancyFormat() throws IOException + public void testParse_withDescription() throws IOException { - MegaFile testee = new MegaFile(FANCY_FORMAT, AppletFormatAdapter.PASTE); - assertEquals("Title not as expected", "Fancy format data", + MegaFile testee = new MegaFile(INTERLEAVED_WITH_DESCRIPTION, + AppletFormatAdapter.PASTE); + assertEquals("Title not as expected", "Data with description", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); - // assertEquals("Format property not parsed", - // "DataType=DNA indel=- CodeTable=Standard;", - // testee.getAlignmentProperty(MegaFile.PROP_FORMAT)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); @@ -360,16 +351,16 @@ public class MegaFileTest assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data - assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) + assertEquals("First sequence data wrong", "CGCGTACGATTT", seqs.get(0) .getSequenceAsString()); - assertEquals("Second sequence data wrong", "MNOPQRWXYZ", seqs.get(1) + assertEquals("Second sequence data wrong", "ATCGGGCAATGC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); - assertEquals("Description property not parsed", - " Line one of description\n" - + " Line two of description\n", + assertEquals( + "Description property not parsed", + " Line one of description\n" + " Line two of description", testee.getAlignmentProperty(MegaFile.PROP_DESCRIPTION)); } @@ -410,4 +401,36 @@ public class MegaFileTest assertEquals("Mega", MegaFile.getValue("!Name \t\t Mega; ")); assertEquals("", MegaFile.getValue("Name")); } + + /** + * Test reading a MEGA file to an alignment then writing it out in MEGA + * format. Verify the output is (functionally) the same as the input. + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testRoundTrip_Interleaved() throws IOException + { + AppletFormatAdapter fa = new AppletFormatAdapter(); + AlignmentI al = fa.readFile(INTERLEAVED_WITH_DESCRIPTION, + AppletFormatAdapter.PASTE, "MEGA"); + MegaFile output = new MegaFile(); + String formatted = output.print(al); + //@formatter:off + String expected = + "#MEGA\n!Title Data with description;\n" + + "!Description Line one of description\n" + + " Line two of description;\n" + + "!Format\n" + + " DataType=DNA CodeTable=Standard\n" + + " NSeqs=2 NSites=12\n" + + " Indel=- Identical=. Missing=?;\n\n" + + "#U455 CGC GTA\n" + + "#CPZANT ATC GGG\n\n" + + "#U455 CGA TTT\n" + + "#CPZANT CAA TGC\n"; + //@formatter:on + assertEquals("Roundtrip didn't match", expected, + formatted); + } }