package jalview.io; import static org.testng.AssertJUnit.assertEquals; import static org.testng.AssertJUnit.assertFalse; import static org.testng.AssertJUnit.assertNull; import static org.testng.AssertJUnit.assertTrue; import static org.testng.AssertJUnit.fail; import jalview.datamodel.AlignmentAnnotation; import jalview.datamodel.AlignmentI; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import java.io.IOException; import java.util.List; import java.util.Vector; import org.testng.annotations.Test; /* * Unit tests for MegaFile - read and write in MEGA format(s). */ public class MegaFileTest { private static final String TWENTY_CHARS = "9876543210abcdefghij"; private static final String THIRTY_CHARS = "0123456789klmnopqrstABCDEFGHIJ"; //@formatter:off private static final String INTERLEAVED = "#MEGA\n"+ "TITLE: Interleaved sequence data\n\n" + "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + "#CPZANT WXYZGC"; private static final String INTERLEAVED_NOHEADERS = "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U455 KLMNOP\n" + "#CPZANT WXYZGC\n"; // interleaved sequences, with 50 residues private static final String INTERLEAVED_50RESIDUES = "#MEGA\n" + "!TITLE Interleaved sequence data\n\n" + "#U455 " + THIRTY_CHARS + TWENTY_CHARS + "\n" + "#CPZANT " + TWENTY_CHARS + THIRTY_CHARS + "\n"; private static final String NONINTERLEAVED = "#MEGA\n" + "!TITLE Noninterleaved sequence data\n\n" + "#U455 \n" + "ABCFEDHIJ\n" + "MNOPQR\n\n" + "#CPZANT \n" + "KLMNOPWXYZ\n" + "CGATC\n"; // this one starts interleaved then switches to non-interleaved private static final String MIXED = "#MEGA\n" + "!TITLE This is a mess\n\n" + "#CPZANT KLMNOPWXYZCGATC\n\n" + "#U455\n " + "ABCFEDHIJ\n"; // interleaved with a new sequence appearing in the second block :-O private static final String INTERLEAVED_SEQUENCE_ERROR = "#MEGA" + "\n" + "!TITLE Interleaved sequence data\n\n" + "#U455 ABCDEF\n" + "#CPZANT MNOPQR\n\n" + "#U456 KLMNOP\n"; // interleaved with description, bases/gaps in triplet groups private static final String INTERLEAVED_WITH_DESCRIPTION = "#MEGA\n" + "!Title Data with description;\n" + "!Format DataType=DNA indel=-\tCodeTable=Standard Missing=? MatchChar=.;\n\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n\n" + "#U455 C-- GTA\n" + "#CPZANT ATC -G-\n\n" + "#U455 CGA --T\n" + "#CPZANT CA- -GC\n"; //@formatter:on /** * Test parse of interleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_interleaved() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED, AppletFormatAdapter.PASTE); assertEquals("Title not as expected", "Interleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); } /** * Test parse of noninterleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_nonInterleaved() throws IOException { MegaFile testee = new MegaFile(NONINTERLEAVED, AppletFormatAdapter.PASTE); assertEquals("Title not as expected", "Noninterleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR", seqs .get(0).getSequenceAsString()); assertEquals("Second sequence data wrong", "KLMNOPWXYZCGATC", seqs.get(1).getSequenceAsString()); assertFalse("File format is not flagged as noninterleaved", testee.isInterleaved()); } /** * Test parsing an interleaved file with an extra sequence appearing after the * first block - should fail. */ @Test(groups = { "Functional" }) public void testParse_interleavedExtraSequenceError() { try { new MegaFile(INTERLEAVED_SEQUENCE_ERROR, AppletFormatAdapter.PASTE); fail("Expected extra sequence IOException"); } catch (IOException e) { assertEquals( "Unexpected exception message", "Parse error: misplaced new sequence starting at #U456 KLMNOP", e.getMessage()); } } /** * Test a mixed up file. */ @Test(groups = { "Functional" }) public void testParse_mixedInterleavedNonInterleaved() { try { new MegaFile(MIXED, AppletFormatAdapter.PASTE); fail("Expected mixed content exception"); } catch (IOException e) { assertEquals( "Unexpected exception message", "Parse error: interleaved was true but now seems to be false, at line: ABCFEDHIJ", e.getMessage()); } } @Test(groups = { "Functional" }) public void testGetSequenceId() { assertEquals("AB123", MegaFile.getSequenceId("#AB123 CGATC")); assertEquals("AB123", MegaFile.getSequenceId("#AB123 CGATC")); assertEquals("AB123", MegaFile.getSequenceId("#AB123 CGC TAC")); assertEquals("AB123", MegaFile.getSequenceId("#AB123")); assertNull(MegaFile.getSequenceId("AB123 CTAG")); assertNull(MegaFile.getSequenceId("AB123")); assertNull(MegaFile.getSequenceId("")); assertNull(MegaFile.getSequenceId(null)); } @Test(groups = { "Functional" }) public void testGetMaxIdLength() { SequenceI[] seqs = new Sequence[2]; seqs[0] = new Sequence("Something", "GCATAC"); seqs[1] = new Sequence("SomethingElse", "GCATAC"); assertEquals(13, MegaFile.getMaxIdLength(seqs)); seqs[1] = new Sequence("DNA", "GCATAC"); assertEquals(9, MegaFile.getMaxIdLength(seqs)); } @Test(groups = { "Functional" }) public void testGetMaxSequenceLength() { SequenceI[] seqs = new Sequence[2]; seqs[0] = new Sequence("Seq1", "GCATAC"); seqs[1] = new Sequence("Seq2", "GCATACTAG"); assertEquals(9, MegaFile.getMaxSequenceLength(seqs)); seqs[1] = new Sequence("Seq2", "GCA"); assertEquals(6, MegaFile.getMaxSequenceLength(seqs)); } /** * Test (parse and) print of interleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_interleaved() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED, AppletFormatAdapter.PASTE); String printed = testee.print(); System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines // nb don't get Title in output if not calling print(AlignmentI) String expected = "#MEGA\n\n" + "#U455 ABCDEF [6]\n" + "#CPZANT MNOPQR [6]\n\n" + "#U455 KLMNOP [12]\n" + "#CPZANT WXYZGC [12]" + "\n"; assertEquals("Print format wrong", expected, printed); } /** * Test (parse and) print of interleaved data with no headers (acceptable). * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_interleavedNoHeaders() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED_NOHEADERS, AppletFormatAdapter.PASTE); String printed = testee.print(); System.out.println(printed); //@formatter:off assertEquals("Print format wrong", "#MEGA\n\n" + "#U455 ABCDEF [6]\n" + "#CPZANT MNOPQR [6]\n\n" + "#U455 KLMNOP [12]\n" + "#CPZANT WXYZGC [12]\n", printed); //@formatter:on } /** * Test (parse and) print of noninterleaved mega format data. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_noninterleaved() throws IOException { MegaFile testee = new MegaFile(NONINTERLEAVED, AppletFormatAdapter.PASTE); assertEquals(10, testee.getPositionsPerLine()); String printed = testee.print(); System.out.println(printed); // normally output should match input // we cheated here with a number of short input lines String expected = "#MEGA\n\n" + "#U455\n" + "ABCFEDHIJM [10]\nNOPQR [15]\n\n" + "#CPZANT\n" + "KLMNOPWXYZ [10]\nCGATC [15]\n"; assertEquals("Print format wrong", expected, printed); } /** * Test (parse and) print of interleaved mega format data extending to more * than one line of output. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_interleavedMultiLine() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED_50RESIDUES, AppletFormatAdapter.PASTE); assertEquals(50, testee.getPositionsPerLine()); /* * now simulate choosing 20 residues per line on output */ testee.setPositionsPerLine(20); String printed = testee.print(); System.out.println(printed); //@formatter:off String expected = "#MEGA\n\n" + "#U455 0123456789 klmnopqrst [20]\n" + // first 20 "#CPZANT 9876543210 abcdefghij [20]\n\n" + "#U455 ABCDEFGHIJ 9876543210 [40]\n" + // next 20 "#CPZANT 0123456789 klmnopqrst [40]\n\n" + "#U455 abcdefghij [50]\n" + // last 10 "#CPZANT ABCDEFGHIJ [50]\n"; //@formatter:on assertEquals("Print format wrong", expected, printed); } /** * Test (parse and) print of noninterleaved mega format data extending to more * than one line of output. * * @throws IOException */ @Test(groups = { "Functional" }) public void testPrint_noninterleavedMultiLine() throws IOException { final String NONINTERLEAVED_LONGERTHAN50 = "#SIXTY\n" + THIRTY_CHARS + "\n" + TWENTY_CHARS + "9993332221\n"; MegaFile testee = new MegaFile(NONINTERLEAVED_LONGERTHAN50, AppletFormatAdapter.PASTE); assertEquals(30, testee.getPositionsPerLine()); testee.setPositionsPerLine(25); String printed = testee.print(); /* * 25 positions per line is rounded down to 20 (two blocks of 10) */ String expected = "#MEGA\n\n" + "#SIXTY\n" + "0123456789 klmnopqrst [20]\n" + "ABCDEFGHIJ 9876543210 [40]\n" + "abcdefghij 9993332221 [60]\n"; assertEquals("Print format wrong", expected, printed); } /** * Test parse of data including description * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_withDescription() throws IOException { MegaFile testee = new MegaFile(INTERLEAVED_WITH_DESCRIPTION, AppletFormatAdapter.PASTE); assertEquals("Title not as expected", "Data with description", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "C--GTACGA--T", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "ATC-G-CA--GC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); assertEquals( "Description property not parsed", " Line one of description\n" + " Line two of description", testee.getAlignmentProperty(MegaFile.PROP_DESCRIPTION)); } @Test(groups = { "Functional" }) public void testGetNonCommentContent() throws FileFormatException { assertEquals("abcde", MegaFile.getNonCommentContent("abcde", 0)); assertEquals("CGT ACG GAC ", MegaFile.getNonCommentContent("CGT ACG GAC [9]", 0)); assertEquals("", MegaFile.getNonCommentContent("abcde", 1)); assertEquals(" abcde", MegaFile.getNonCommentContent("and others ] abcde", 1)); assertEquals(" abcde", MegaFile.getNonCommentContent( "and others [including refs] ] abcde", 1)); assertEquals(" x ] abcde", MegaFile.getNonCommentContent("and others ] x ] abcde", 1)); } @Test(groups = { "Functional" }) public void testCommentDepth() throws FileFormatException { assertEquals(0, MegaFile.commentDepth("abcde", 0)); assertEquals(1, MegaFile.commentDepth("abc[de", 0)); assertEquals(3, MegaFile.commentDepth("ab[c[de", 1)); assertEquals(1, MegaFile.commentDepth("ab]c[d]e[f", 1)); assertEquals(0, MegaFile.commentDepth("a]b[c]d]e", 1)); } @Test(groups = { "Functional" }) public void testGetValue() { assertEquals("Mega", MegaFile.getValue("Name=Mega")); assertEquals("Mega", MegaFile.getValue("Name =Mega")); assertEquals("Mega", MegaFile.getValue(" Name = Mega ")); assertEquals("Mega", MegaFile.getValue("Name = Mega; ")); assertEquals("Mega", MegaFile.getValue(" Name = Mega ; ")); assertEquals("Mega", MegaFile.getValue("\t!Name \t= \tMega ; ")); assertEquals("Mega", MegaFile.getValue("!Name \t\t Mega; ")); assertEquals("", MegaFile.getValue("Name")); } /** * Test reading a MEGA file to an alignment then writing it out in MEGA * format. Verify the output is (functionally) the same as the input. * * @throws IOException */ @Test(groups = "Functional") public void testRoundTrip_Interleaved() throws IOException { AppletFormatAdapter fa = new AppletFormatAdapter(); AlignmentI al = fa.readFile(INTERLEAVED_WITH_DESCRIPTION, AppletFormatAdapter.PASTE, "MEGA"); MegaFile output = new MegaFile(); String formatted = output.print(al); //@formatter:off String expected = "#MEGA\n!Title Data with description;\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n" + "!Format\n" + " DataType=DNA CodeTable=Standard\n" + " NSeqs=2 NSites=12\n" + // NSites includes gaps " Indel=- Identical=. Missing=?;\n\n" + "#U455 C-- GTA [6]\n" + "#CPZANT ATC -G- [6]\n\n" + "#U455 CGA --T [12]\n" + "#CPZANT CA- -GC [12]\n"; //@formatter:on assertEquals("Roundtrip didn't match", expected, formatted); } /** * Test reading a MEGA file to an alignment then writing it out in MEGA * format. Verify the output is (functionally) the same as the input. * * @throws IOException */ @Test(groups = "Functional") public void testRoundTrip_multilineFormatWithComments() throws IOException { AppletFormatAdapter fa = new AppletFormatAdapter(); //@formatter:off AlignmentI al = fa.readFile("#MEGA\n" + "!Title Data with description;\n" + "[ this comment should be ignored\n" + "including [this nested comment]\n" + "]\n" + "!Format \n" + "DataType=DNA CodeTable=Standard\n" + "indel=- Missing=? MatchChar=.;\n\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n\n" + "#U455 CGC GTA\n" + "#CPZANT ATC GGG\n\n" + "#U455 CGA TTT\n" + "#CPZANT CAA TGC\n", AppletFormatAdapter.PASTE, "MEGA"); //@formatter:on MegaFile output = new MegaFile(); String formatted = output.print(al); //@formatter:off String expected = "#MEGA\n!Title Data with description;\n" + "!Description\n" + " Line one of description\n" + " Line two of description;\n" + "!Format\n" + " DataType=DNA CodeTable=Standard\n" + " NSeqs=2 NSites=12\n" + " Indel=- Identical=. Missing=?;\n\n" + "#U455 CGC GTA [6]\n" + "#CPZANT ATC GGG [6]\n\n" + "#U455 CGA TTT [12]\n" + "#CPZANT CAA TGC [12]\n"; //@formatter:on assertEquals("Roundtrip didn't match", expected, formatted); } //@formatter:on /** * Test parse of interleaved mega format data where the identity character is * used in sequences after the first * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_interleavedWithIdentityAndTabs() throws IOException { //@formatter:off // uses tab instead of space separators to check robustness MegaFile testee = new MegaFile("#MEGA\n"+ "!TITLE\tInterleaved sequence data;\n" + "!Format\tIdentical=.;\n\n" + "#U455\tABCDEF\n" + "#CPZANT\tM..P.R\n\n" + "#U455\t\tKLMNOP\n" + "#CPZANT\t..YZ..", AppletFormatAdapter.PASTE); //@formatter:on assertEquals("Title not as expected", "Interleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MBCPERKLYZOP", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); } /** * Test parse of noninterleaved format data including identity symbol * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_nonInterleavedWithIdentity() throws IOException { //@formatter:off MegaFile testee = new MegaFile("#MEGA\n" + "!TITLE Noninterleaved sequence data;\n" + "!Format MatchChar=.;\n" + "#U455 \n" + "ABCFEDHIJ\n" + "MNOPQR\n\n" + "#CPZANT \n" + "KL..O..XYZ\n" + "CG..C\n", AppletFormatAdapter.PASTE); //@formatter:on assertEquals("Title not as expected", "Noninterleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR", seqs .get(0).getSequenceAsString()); assertEquals("Second sequence data wrong", "KLCFODHXYZCGPQC", seqs.get(1).getSequenceAsString()); assertFalse("File format is not flagged as noninterleaved", testee.isInterleaved()); } //@formatter:on /** * Test parse of interleaved format data including position number comments. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_interleavedWithPositionNumber() throws IOException { //@formatter:off MegaFile testee = new MegaFile("#MEGA\n"+ "TITLE: Interleaved sequence data\n\n" + "#U455 ABCDEF [6]\n" + "#CPZANT MNOPQR [6]\n\n" + "#U455 KLMNOP [12]\n" + "#CPZANT WXYZGC [12]\n", AppletFormatAdapter.PASTE); //@formatter:on assertEquals("Title not as expected", "Interleaved sequence data", testee.getAlignmentProperty(MegaFile.PROP_TITLE)); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence names correct and order preserved assertEquals("First sequence id wrong", "U455", seqs.get(0).getName()); assertEquals("Second sequence id wrong", "CPZANT", seqs.get(1) .getName()); // check sequence data assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MNOPQRWXYZGC", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); } //@formatter:on /** * Test parse of data with !Gene and !Domain statements. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_geneDomains() throws IOException { //@formatter:off String data = "#MEGA\n"+ "TITLE: Interleaved sequence data\n\n" + "#U455 CCCCCC\n" + "#CPZANT TTTTTT\n\n" + "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" + "#U455 GGGGGG\n" + "#CPZANT AAAAAA\n\n" + "!domain=Intron1 Property=Intron Gene=Adh;\n" + "#U455 tttttt\n" + "#CPZANT cccccc\n\n" + "!Domain=Exon2 Gene=Adh Property=Exon CodonStart=1;\n" + "#U455 aaaaaa\n" + "#CPZANT gggggg\n\n" + // explicit end of Exon2, implicit end of Adh: "!Domain=Exon2 Property=domainend;\n" + "!Domain=Intron1 Gene=Opsin Property=Noncoding;\n" + "#U455 GGGGGG\n" + "#CPZANT AAAAAA\n\n" + // end Opsin, start MEF2A "!Domain=Exon1 Gene=MEF2A Property=Coding CodonStart=1;\n" + "#U455 tttttt\n" + "#CPZANT cccccc\n\n" + // end MEF2A "!Domain=BindingSite;\n" + "#U455 CCCCCC\n" + "#CPZANT TTTTTT\n\n"; //@formatter:on MegaFile testee = new MegaFile(data, AppletFormatAdapter.PASTE); Vector seqs = testee.getSeqs(); // should be 2 sequences assertEquals("Expected two sequences", 2, seqs.size()); // check sequence data assertEquals("First sequence data wrong", "CCCCCCGGGGGGttttttaaaaaaGGGGGGttttttCCCCCC", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "TTTTTTAAAAAAccccccggggggAAAAAAccccccTTTTTT", seqs.get(1) .getSequenceAsString()); /* * sequences should have features for Gene=Adh 7-24, Exon1 7-12, Intron1 * 13-18, Exon2 19-24, BindingSite 25-30 */ for (SequenceI seq : seqs) { SequenceFeature[] sfs = seq.getSequenceFeatures(); // features are added in the order in which their end is found // (Domain before Gene when they end together) assertEquals(9, sfs.length); // TODO settle which way round type/description go! verifySequenceFeature(sfs[0], "Exon1 (Adh Coding)", "Domain", 7, 12); verifySequenceFeature(sfs[1], "Intron1 (Adh Noncoding)", "Domain", 13, 18); verifySequenceFeature(sfs[2], "Exon2 (Adh Coding)", "Domain", 19, 24); verifySequenceFeature(sfs[3], "Adh", "Gene", 7, 24); verifySequenceFeature(sfs[4], "Intron1 (Opsin Noncoding)", "Domain", 25, 30); verifySequenceFeature(sfs[5], "Opsin", "Gene", 25, 30); verifySequenceFeature(sfs[6], "Exon1 (MEF2A Coding)", "Domain", 31, 36); verifySequenceFeature(sfs[7], "MEF2A", "Gene", 31, 36); verifySequenceFeature(sfs[8], "BindingSite", "Domain", 37, 42); } /* * verify gene and domain alignment annotations */ assertEquals(2, testee.annotations.size()); AlignmentAnnotation ann = testee.annotations.get(0); assertEquals("MEGA Gene", ann.label); assertEquals(42, ann.annotations.length); verifyAnnotation(ann, 0, 6, null); verifyAnnotation(ann, 6, 24, "Adh"); verifyAnnotation(ann, 24, 30, "Opsin"); verifyAnnotation(ann, 30, 36, "MEF2A"); verifyAnnotation(ann, 37, 42, null); ann = testee.annotations.get(1); assertEquals("MEGA Domain", ann.label); assertEquals(42, ann.annotations.length); verifyAnnotation(ann, 0, 6, null); verifyAnnotation(ann, 6, 12, "Exon1 (Adh Coding)"); verifyAnnotation(ann, 12, 18, "Intron1 (Adh Noncoding)"); verifyAnnotation(ann, 19, 24, "Exon2 (Adh Coding)"); verifyAnnotation(ann, 25, 30, "Intron1 (Opsin Noncoding)"); verifyAnnotation(ann, 31, 36, "Exon1 (MEF2A Coding)"); verifyAnnotation(ann, 37, 42, "BindingSite"); } /** * Helper method to verify a range of annotation positions all have the given * description * * @param ann * array of annotations to check * @param from * start index to check * @param to * end index to check (exclusive) * @param description * value to assert */ protected void verifyAnnotation(AlignmentAnnotation ann, int from, int to, String description) { for (int pos = from; pos < to; pos++) { if (description == null) { assertNull(ann.annotations[pos]); } else { assertEquals(description, ann.annotations[pos].description); } } } /** * Helper method to assert properties of a SequenceFeature * * @param sf * @param description * @param type * @param begin * @param end */ protected void verifySequenceFeature(SequenceFeature sf, String description, String type, int begin, int end) { assertEquals(description, sf.type); assertEquals(type, sf.description); assertEquals(begin, sf.begin); assertEquals(end, sf.end); } //@formatter:on /** * Test parse of data including !Label statements. An underscore means no * label, other characters are treated as alignment annotation. * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_withLabels() throws IOException { //@formatter:off MegaFile testee = new MegaFile("#MEGA\n"+ "TITLE: Interleaved sequence data\n\n" + "#U455 ABC DEF\n" + "#CPZANT MNO PQR\n" + "!Label +-_ 23_\n\n" + // a row with no labels = null annotation "#U455 abc def\n" + "#CPZANT mno pqr\n\n" + "#U455 KLM NOP\n" + "#CPZANT WXY ZGC\n" + "!label __3 +X_\n", AppletFormatAdapter.PASTE); //@formatter:on Vector seqs = testee.getSeqs(); assertEquals("Expected two sequences", 2, seqs.size()); assertEquals("First sequence data wrong", "ABCDEFabcdefKLMNOP", seqs .get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MNOPQRmnopqrWXYZGC", seqs .get(1) .getSequenceAsString()); // check AlignmentAnnotation added with expected values assertEquals(1, testee.annotations.size()); AlignmentAnnotation aa = testee.annotations.get(0); assertNull(aa.sequenceRef); assertEquals("MEGA Label", aa.label); assertEquals(18, aa.annotations.length); assertEquals("+, -, , 2, 3, , , , , , , , , , 3, +, X, , ", aa.toString()); } //@formatter:on /** * Test case where a domain is implicitly terminated by starting a new gene * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_changeOfGeneEndsDomain() throws IOException { //@formatter:off // uses tab instead of space separators to check robustness MegaFile testee = new MegaFile("#MEGA\n"+ "!TITLE Interleaved sequence data;\n" + "!Format Identical=.;\n\n" + "!Gene=gene1 Domain=Exon1 Property=Coding;\n" + "#U455 ABCDEF\n" + "#CPZANT M..P.R\n\n" + "!Gene=gene2;\n" + "#U455 KLMNOP\n" + "#CPZANT ..YZ..", AppletFormatAdapter.PASTE); //@formatter:on Vector seqs = testee.getSeqs(); assertEquals("Expected two sequences", 2, seqs.size()); assertEquals("First sequence data wrong", "ABCDEFKLMNOP", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MBCPERKLYZOP", seqs.get(1) .getSequenceAsString()); assertTrue("File format is not flagged as interleaved", testee.isInterleaved()); for (SequenceI seq : seqs) { SequenceFeature[] sfs = seq.getSequenceFeatures(); assertEquals(3, sfs.length); verifySequenceFeature(sfs[0], "Exon1 (gene1 Coding)", "Domain", 1, 6); verifySequenceFeature(sfs[1], "gene1", "Gene", 1, 6); verifySequenceFeature(sfs[2], "gene2", "Gene", 7, 12); } } //@formatter:on /** * Test case where the declared gap character is one Jalview does not support; * it should be converted to a '-' * * @throws IOException */ @Test(groups = { "Functional" }) public void testParse_weirdGapCharacter() throws IOException { //@formatter:off String data = "#MEGA\n"+ "!TITLE Interleaved sequence data;\n" + "!Format Identical=. Indel=%;\n\n" + "#U455 %BC%EF\n" + "#CPZANT M..P.R\n\n" + "#U455 KLMNOP\n" + "#CPZANT .%%Z.."; AppletFormatAdapter fa = new AppletFormatAdapter(); AlignmentI al = fa.readFile(data, AppletFormatAdapter.PASTE, "MEGA"); //@formatter:on List seqs = al.getSequences(); assertEquals("First sequence data wrong", "-BC-EFKLMNOP", seqs.get(0) .getSequenceAsString()); assertEquals("Second sequence data wrong", "MBCPERK--ZOP", seqs.get(1) .getSequenceAsString()); assertEquals('-', al.getGapCharacter()); } /** * Test reading a MEGA file to an alignment then writing it out in MEGA * format. Includes !Label statements which should be converted to * AlignmentAnnotation and back again. * * @throws IOException */ @Test(groups = "Functional") public void testRoundTrip_withLabels() throws IOException { AppletFormatAdapter fa = new AppletFormatAdapter(); //@formatter:off String data = "#MEGA\n" + "#U455 C-- GTA\n" + "#CPZANT ATC -G-\n" + "!Label F__E_H\n\n" + "#U455 CGA --T\n" + "#CPZANT CA- -GC\n" + "!Label FFH__E\n"; AlignmentI al = fa.readFile(data, AppletFormatAdapter.PASTE, "MEGA"); AlignmentAnnotation aa = al.getAlignmentAnnotation()[0]; assertEquals("MEGA Label", aa.label); assertEquals("F, , , E, , H, F, F, H, , , E, ", aa.toString()); MegaFile output = new MegaFile(); String formatted = output.print(al); String expected = "#MEGA\n" + "!Format\n" + " DataType=Nucleotide CodeTable=Standard\n" + " NSeqs=2 NSites=12\n" + " Indel=-;\n\n" + "#U455 C-- GTA [6]\n" + "#CPZANT ATC -G- [6]\n" + "!Label F__ E_H;\n\n" + "#U455 CGA --T [12]\n" + "#CPZANT CA- -GC [12]\n" + "!Label FFH __E;\n"; //@formatter:on assertEquals("Roundtrip didn't match", expected, formatted); } }