package jalview.io; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertTrue; import static org.testng.AssertJUnit.fail; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceI; import java.io.IOException; import java.util.Vector; import org.testng.annotations.Test; /* * Unit tests for MegaFile - read and write in MEGA format(s). */ public class MegaFileTest { private static final String TWENTY_CHARS = "9876543210abcdefghij"; private static final String THIRTY_CHARS = "0123456789klmnopqrstABCDEFGHIJ"; //@formatter:off private static final String INTERLEAVED = "#MEGA\n"+ "TITLE: Interleaved sequence data\n\n" + "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + "#CPZANT WXYZGC"; private static final String INTERLEAVED_NOHEADERS = "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + "#CPZANT WXYZGC\n"; // interleaved sequences, with 50 residues private static final String INTERLEAVED_50RESIDUES = "#MEGA\n" + "!TITLE Interleaved sequence data\n\n" + "#U455 " + THIRTY_CHARS + TWENTY_CHARS + "\n" + "#CPZANT " + TWENTY_CHARS + THIRTY_CHARS + "\n"; private static final String NONINTERLEAVED = "#MEGA\n" + "!TITLE Noninterleaved sequence data\n\n" + "#U455 \n" + "ABCFEDHIJ\n" + "MNOPQR\n\n" + "#CPZANT \n" + "KLMNOPWXYZ\n" + "CGATC\n"; // this one starts interleaved then switches to non-interleaved private static final String MIXED = "#MEGA\n" + "!TITLE This is a mess\n\n" + "#CPZANT KLMNOPWXYZCGATC\n\n" + "#U455\n " + "ABCFEDHIJ\n"; // interleaved with a new sequence appearing in the second block :-O private static final String INTERLEAVED_SEQUENCE_ERROR = "#MEGA" + "\n" + "!TITLE Interleaved sequence data\n\n" + "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U456 KLMNOP\n"; // the 'fancy' format, different header format, bases in triplet groups private static final String INTERLEAVED_WITH_DESCRIPTION = "#MEGA\n" + "!Title Data with description;\n" + "!Format DataType=DNA indel=- CodeTable=Standard Missing=? MatchChar=.;\n\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n\n" + "#U455 CGC GTA\n" + "#CPZANT ATC GGG\n\n" + "#U455 CGA TTT\n" + "#CPZANT CAA TGC\n"; //@formatter:on /** * Test paste of interleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_interleaved() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED, AppletFormatAdapter.PASTE); assertEquals("Title not as expected", "Interleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); } /** * Test paste of noninterleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_nonInterleaved() throws IOException { MegaFile testee = new MegaFile(NONINTERLEAVED, AppletFormatAdapter.PASTE); assertEquals("Title not as expected", "Noninterleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR", seqs .get(0).getSequenceAsString()); assertEquals("Second sequence data wrong", "KLMNOPWXYZCGATC", seqs.get(1).getSequenceAsString()); assertFalse("File format is not flagged as noninterleaved", testee.isInterleaved()); } /** * Test parsing an interleaved file with an extra sequence appearing after the * first block - should fail. */ @Test(groups = { "Functional" }) public void testParse_interleavedExtraSequenceError() { try { new MegaFile(INTERLEAVED_SEQUENCE_ERROR, AppletFormatAdapter.PASTE); fail("Expected extra sequence IOException"); } catch (IOException e) { assertEquals( "Unexpected exception message", "Parse error: misplaced new sequence starting at #U456 KLMNOP", e.getMessage()); } } /** * Test a mixed up file. */ @Test(groups = { "Functional" }) public void testParse_mixedInterleavedNonInterleaved() { try { new MegaFile(MIXED, AppletFormatAdapter.PASTE); fail("Expected mixed content exception"); } catch (IOException e) { assertEquals( "Unexpected exception message", "Parse error: mix of interleaved and noninterleaved detected, at line: ABCFEDHIJ", e.getMessage()); } } @Test(groups = { "Functional" }) public void testGetSequenceId() { assertEquals("AB123", MegaFile.getSequenceId("#AB123 CGATC")); assertEquals("AB123", MegaFile.getSequenceId("#AB123 CGATC")); assertEquals("AB123", MegaFile.getSequenceId("#AB123 CGC TAC")); assertEquals("AB123", MegaFile.getSequenceId("#AB123")); assertNull(MegaFile.getSequenceId("AB123 CTAG")); assertNull(MegaFile.getSequenceId("AB123")); assertNull(MegaFile.getSequenceId("")); assertNull(MegaFile.getSequenceId(null)); } @Test(groups = { "Functional" }) public void testGetMaxIdLength() { SequenceI[] seqs = new Sequence[2]; seqs[0] = new Sequence("Something", "GCATAC"); seqs[1] = new Sequence("SomethingElse", "GCATAC"); assertEquals(13, MegaFile.getMaxIdLength(seqs)); seqs[1] = new Sequence("DNA", "GCATAC"); assertEquals(9, MegaFile.getMaxIdLength(seqs)); } @Test(groups = { "Functional" }) public void testGetMaxSequenceLength() { SequenceI[] seqs = new Sequence[2]; seqs[0] = new Sequence("Seq1", "GCATAC"); seqs[1] = new Sequence("Seq2", "GCATACTAG"); assertEquals(9, MegaFile.getMaxSequenceLength(seqs)); seqs[1] = new Sequence("Seq2", "GCA"); assertEquals(6, MegaFile.getMaxSequenceLength(seqs)); } /** * Test (parse and) print of interleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_interleaved() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED, AppletFormatAdapter.PASTE); String printed = testee.print(); System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines // nb don't get Title in output if not calling print(AlignmentI) String expected = "#MEGA\n\n" + "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + "#CPZANT WXYZGC" + "\n"; assertEquals("Print format wrong", expected, printed); } /** * Test (parse and) print of interleaved data with no headers (acceptable). * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_interleavedNoHeaders() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED_NOHEADERS, AppletFormatAdapter.PASTE); String printed = testee.print(); System.out.println(printed); assertEquals("Print format wrong", "#MEGA\n\n" + INTERLEAVED_NOHEADERS, printed); } /** * Test (parse and) print of noninterleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_noninterleaved() throws IOException { MegaFile testee = new MegaFile(NONINTERLEAVED, AppletFormatAdapter.PASTE); assertEquals(10, testee.getPositionsPerLine()); String printed = testee.print(); System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines String expected = "#MEGA\n\n" + "#U455\n" + "ABCFEDHIJM\nNOPQR\n\n" + "#CPZANT\n" + "KLMNOPWXYZ\nCGATC\n"; assertEquals("Print format wrong", expected, printed); } /** * Test (parse and) print of interleaved mega format data extending to more * than one line of output. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_interleavedMultiLine() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED_50RESIDUES, AppletFormatAdapter.PASTE); assertEquals(50, testee.getPositionsPerLine()); /* * now simulate choosing 20 residues per line on output */ testee.setPositionsPerLine(20); String printed = testee.print(); System.out.println(printed); //@formatter:off //0123456789klmnopqrstABCDEFGHIJ9876543210abcdefghij String expected = "#MEGA\n\n" + "#U455 0123456789 klmnopqrst\n" + // first 20 "#CPZANT 9876543210 abcdefghij\n\n" + "#U455 ABCDEFGHIJ 9876543210\n" + // next 20 "#CPZANT 0123456789 klmnopqrst\n\n" + "#U455 abcdefghij\n" + // last 10 "#CPZANT ABCDEFGHIJ\n"; //@formatter:on assertEquals("Print format wrong", expected, printed); } /** * Test (parse and) print of noninterleaved mega format data extending to more * than one line of output. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_noninterleavedMultiLine() throws IOException { final String NONINTERLEAVED_LONGERTHAN50 = "#SIXTY\n" + THIRTY_CHARS + "\n" + TWENTY_CHARS + "9993332221\n"; MegaFile testee = new MegaFile(NONINTERLEAVED_LONGERTHAN50, AppletFormatAdapter.PASTE); assertEquals(30, testee.getPositionsPerLine()); testee.setPositionsPerLine(25); String printed = testee.print(); // 60 character sequence should be output as 50 on first line then 10 more String expected = "#MEGA\n\n" + "#SIXTY\n" + "0123456789klmnopqrstABCDE\n" + "FGHIJ9876543210abcdefghij\n" + "9993332221\n"; assertEquals("Print format wrong", expected, printed); } /** * Test parse of data including description * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_withDescription() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED_WITH_DESCRIPTION, AppletFormatAdapter.PASTE); assertEquals("Title not as expected", "Data with description", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "CGCGTACGATTT", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "ATCGGGCAATGC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); assertEquals( "Description property not parsed", " Line one of description\n" + " Line two of description", testee.getAlignmentProperty(MegaFile.PROP_DESCRIPTION)); } @Test(groups = { "Functional" }) public void testGetNonCommentContent() throws FileFormatException { assertEquals("abcde", MegaFile.getNonCommentContent("abcde", 0)); assertEquals("CGT ACG GAC ", MegaFile.getNonCommentContent("CGT ACG GAC [9]", 0)); assertEquals("", MegaFile.getNonCommentContent("abcde", 1)); assertEquals(" abcde", MegaFile.getNonCommentContent("and others ] abcde", 1)); assertEquals(" abcde", MegaFile.getNonCommentContent( "and others [including refs] ] abcde", 1)); assertEquals(" x ] abcde", MegaFile.getNonCommentContent("and others ] x ] abcde", 1)); } @Test(groups = { "Functional" }) public void testCommentDepth() throws FileFormatException { assertEquals(0, MegaFile.commentDepth("abcde", 0)); assertEquals(1, MegaFile.commentDepth("abc[de", 0)); assertEquals(3, MegaFile.commentDepth("ab[c[de", 1)); assertEquals(1, MegaFile.commentDepth("ab]c[d]e[f", 1)); assertEquals(0, MegaFile.commentDepth("a]b[c]d]e", 1)); } @Test(groups = { "Functional" }) public void testGetValue() { assertEquals("Mega", MegaFile.getValue("Name=Mega")); assertEquals("Mega", MegaFile.getValue("Name =Mega")); assertEquals("Mega", MegaFile.getValue(" Name = Mega ")); assertEquals("Mega", MegaFile.getValue("Name = Mega; ")); assertEquals("Mega", MegaFile.getValue(" Name = Mega ; ")); assertEquals("Mega", MegaFile.getValue("\t!Name \t= \tMega ; ")); assertEquals("Mega", MegaFile.getValue("!Name \t\t Mega; ")); assertEquals("", MegaFile.getValue("Name")); } /** * Test reading a MEGA file to an alignment then writing it out in MEGA * format. Verify the output is (functionally) the same as the input. * * @throws IOException */ @Test(groups = "Functional") public void testRoundTrip_Interleaved() throws IOException { AppletFormatAdapter fa = new AppletFormatAdapter(); AlignmentI al = fa.readFile(INTERLEAVED_WITH_DESCRIPTION, AppletFormatAdapter.PASTE, "MEGA"); MegaFile output = new MegaFile(); String formatted = output.print(al); //@formatter:off String expected = "#MEGA\n!Title Data with description;\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n" + "!Format\n" + " DataType=DNA CodeTable=Standard\n" + " NSeqs=2 NSites=12\n" + " Indel=- Identical=. Missing=?;\n\n" + "#U455 CGC GTA\n" + "#CPZANT ATC GGG\n\n" + "#U455 CGA TTT\n" + "#CPZANT CAA TGC\n"; //@formatter:on assertEquals("Roundtrip didn't match", expected, formatted); } /** * Test reading a MEGA file to an alignment then writing it out in MEGA * format. Verify the output is (functionally) the same as the input. * * @throws IOException */ @Test(groups = "Functional") public void testRoundTrip_multilineFormatWithComments() throws IOException { AppletFormatAdapter fa = new AppletFormatAdapter(); //@formatter:off AlignmentI al = fa.readFile("#MEGA\n" + "!Title Data with description;\n" + "[ this comment should be ignored\n" + "including [this nested comment]\n" + "]\n" + "!Format \n" + "DataType=DNA CodeTable=Standard\n" + "indel=- Missing=? MatchChar=.;\n\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n\n" + "#U455 CGC GTA\n" + "#CPZANT ATC GGG\n\n" + "#U455 CGA TTT\n" + "#CPZANT CAA TGC\n", AppletFormatAdapter.PASTE, "MEGA"); //@formatter:on MegaFile output = new MegaFile(); String formatted = output.print(al); //@formatter:off String expected = "#MEGA\n!Title Data with description;\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n" + "!Format\n" + " DataType=DNA CodeTable=Standard\n" + " NSeqs=2 NSites=12\n" + " Indel=- Identical=. Missing=?;\n\n" + "#U455 CGC GTA\n" + "#CPZANT ATC GGG\n\n" + "#U455 CGA TTT\n" + "#CPZANT CAA TGC\n"; //@formatter:on assertEquals("Roundtrip didn't match", expected, formatted); } }