package jalview.io;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.assertNull;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Set;

import org.testng.annotations.Test;

import jalview.datamodel.DBRefEntry;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.datamodel.features.SequenceFeatures;

/**
 * Tests parsing of an EMBL flat file into a sequence with CDS features and
 * database cross-references.
 */
public class EmblFlatFileTest {

  /** Location of the EMBL flat file used as test data. */
  private static final String DATA_FILE = "test/jalview/io/J03321.embl.txt";

  /**
   * Source id passed to the parser; the assertions below expect it as the
   * sequence name prefix and as the feature group of every CDS feature.
   */
  private static final String SOURCE_ID = "EmblTest";

  /**
   * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
   * one of them reverse strand
   *
   * @throws MalformedURLException
   *           if thrown while opening the test data file
   * @throws IOException
   *           if thrown while reading or parsing the test data file
   */
  @Test(groups = "Functional")
  public void testParse() throws MalformedURLException, IOException {
    File dataFile = new File(DATA_FILE);
    FileParse fp = new FileParse(dataFile, DataSourceType.FILE);
    EmblFlatFile parser = new EmblFlatFile(fp, SOURCE_ID);
    parser.parse();

    List<SequenceI> seqs = parser.getSeqs();
    assertEquals(seqs.size(), 1);
    SequenceI seq = seqs.get(0);
    assertEquals(seq.getName(), "EmblTest|J03321");
    assertEquals(seq.getLength(), 7502);

    /*
     * should be 9 CDS features (one is a 'join' of two exons)
     */
    Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
    assertEquals(featureTypes.size(), 1);
    assertTrue(featureTypes.contains("CDS"));

    /*
     * inspect some features (sort them for convenience of test assertions)
     */
    List<SequenceFeature> features = seq.getFeatures().getAllFeatures("CDS");
    SequenceFeatures.sortFeatures(features, true);
    assertEquals(features.size(), 9);

    /*
     * CDS at 1-437 is the second exon of the circular DNA CDS
     */
    SequenceFeature sf = features.get(0);
    assertCdsExon(sf, 1, 437, "Exon 2 for protein EMBLCDS:AAA91567.1",
        "join(7022..7502,1..437)", 1, "pGP7-D", 2);
    assertEquals(sf.getValue("transl_table"), "11");

    /*
     * CDS at 488-1480 is on the reverse strand
     */
    sf = features.get(1);
    assertCdsExon(sf, 488, 1480, "Exon 1 for protein EMBLCDS:AAA91568.1",
        "complement(488..1480)", -1, "pGP8-D", 1);

    sf = features.get(7);
    assertCdsExon(sf, 6045, 6788, "Exon 1 for protein EMBLCDS:AAA91574.1",
        "6045..6788", 1, "pGP6-D (gtg start codon)", 1);

    /*
     * CDS at 7022-7502 is the first exon of the circular DNA CDS
     */
    sf = features.get(8);
    assertCdsExon(sf, 7022, 7502, "Exon 1 for protein EMBLCDS:AAA91567.1",
        "join(7022..7502,1..437)", 1, "pGP7-D", 1);

    /*
     * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries,
     * some of them (e.g. INTERPRO) duplicates; sample a few here
     * Note DBRefEntry constructor capitalises source
     */
    List<DBRefEntry> dbrefs = seq.getDBRefs();
    assertEquals(dbrefs.size(), 31);

    // 1st DR line; note trailing period is removed
    assertTrue(dbrefs.contains(
        new DBRefEntry("MD5", "0", "d4c4942a634e3df4995fd5ac75c26a61")));

    // the 4th DR line:
    assertTrue(dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941")));

    // from the first CDS feature; note canonicalisation to "UNIPROT"
    assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
    assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19")));

    // from the last CDS feature
    assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350")));

    // todo: mappings to, and sequences for, UNIPROT proteins
  }

  /**
   * Asserts the properties that this test checks on every sampled CDS
   * feature. The feature group, phase ("0") and product ("hypothetical
   * protein") are the same for all sampled features, so they are fixed here
   * rather than passed in.
   *
   * @param sf
   *          the feature under test
   * @param begin
   *          expected start position
   * @param end
   *          expected end position
   * @param description
   *          expected feature description
   * @param enaLocation
   *          expected ENA location string
   * @param strand
   *          expected strand (1 or -1 in this test)
   * @param note
   *          expected value of the "note" attribute
   * @param exonNumber
   *          expected value of the "exon number" attribute
   */
  private static void assertCdsExon(SequenceFeature sf, int begin, int end,
      String description, String enaLocation, int strand, String note,
      int exonNumber) {
    assertEquals(sf.getBegin(), begin);
    assertEquals(sf.getEnd(), end);
    assertEquals(sf.getDescription(), description);
    assertEquals(sf.getFeatureGroup(), SOURCE_ID);
    assertEquals(sf.getEnaLocation(), enaLocation);
    assertEquals(sf.getPhase(), "0");
    assertEquals(sf.getStrand(), strand);
    assertEquals(sf.getValue("note"), note);
    assertEquals(sf.getValue("exon number"), exonNumber);
    assertEquals(sf.getValue("product"), "hypothetical protein");
  }
}