/*
 * Recovered from a gitweb 'blobdiff_plain' capture of
 * test/jalview/io/EmblFlatFileTest.java (new file, blob 6d9874e) in the
 * jalview.git repository at source.jalview.org. Diff markers have been removed
 * and generic type arguments restored.
 */
package jalview.io;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.assertNull;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Set;

import org.testng.annotations.Test;

import jalview.datamodel.DBRefEntry;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.datamodel.features.SequenceFeatures;

/**
 * Functional test of {@link EmblFlatFile}: parses a stored EMBL flat file and
 * checks the resulting sequence, its CDS features and its database references.
 */
public class EmblFlatFileTest
{
  /**
   * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
   * one of them reverse strand
   * 
   * @throws MalformedURLException
   *           propagated from opening or parsing the test data file
   * @throws IOException
   *           propagated from opening or parsing the test data file
   */
  @Test(groups = "Functional")
  public void testParse() throws MalformedURLException, IOException
  {
    File dataFile = new File("test/jalview/io/J03321.embl.txt");
    FileParse fp = new FileParse(dataFile, DataSourceType.FILE);
    EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest");
    parser.parse();
    List<SequenceI> seqs = parser.getSeqs();

    assertEquals(seqs.size(), 1);
    SequenceI seq = seqs.get(0);
    assertEquals(seq.getName(), "EmblTest|J03321");
    assertEquals(seq.getLength(), 7502);

    /*
     * should be 9 CDS features (one is a 'join' of two exons)
     */
    Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
    assertEquals(featureTypes.size(), 1);
    assertTrue(featureTypes.contains("CDS"));

    /*
     * inspect some features (sort them for convenience of test assertions)
     */
    List<SequenceFeature> features = seq.getFeatures()
            .getAllFeatures("CDS");
    SequenceFeatures.sortFeatures(features, true);
    assertEquals(features.size(), 9);

    SequenceFeature sf = features.get(0);
    assertEquals(sf.getBegin(), 1);
    assertEquals(sf.getEnd(), 437);
    assertEquals(sf.getDescription(),
            "Exon 2 for protein EMBLCDS:AAA91567.1");
    assertEquals(sf.getFeatureGroup(), "EmblTest");
    assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
    assertEquals(sf.getPhase(), "0");
    assertEquals(sf.getStrand(), 1);
    assertEquals(sf.getValue("note"), "pGP7-D");
    // second exon of circular DNA!
    assertEquals(sf.getValue("exon number"), 2);
    assertEquals(sf.getValue("product"), "hypothetical protein");
    assertEquals(sf.getValue("transl_table"), "11");

    sf = features.get(1);
    assertEquals(sf.getBegin(), 488);
    assertEquals(sf.getEnd(), 1480);
    assertEquals(sf.getDescription(),
            "Exon 1 for protein EMBLCDS:AAA91568.1");
    assertEquals(sf.getFeatureGroup(), "EmblTest");
    assertEquals(sf.getEnaLocation(), "complement(488..1480)");
    assertEquals(sf.getPhase(), "0");
    assertEquals(sf.getStrand(), -1); // reverse strand!
    assertEquals(sf.getValue("note"), "pGP8-D");
    assertEquals(sf.getValue("exon number"), 1);
    assertEquals(sf.getValue("product"), "hypothetical protein");

    sf = features.get(7);
    assertEquals(sf.getBegin(), 6045);
    assertEquals(sf.getEnd(), 6788);
    assertEquals(sf.getDescription(),
            "Exon 1 for protein EMBLCDS:AAA91574.1");
    assertEquals(sf.getFeatureGroup(), "EmblTest");
    assertEquals(sf.getEnaLocation(), "6045..6788");
    assertEquals(sf.getPhase(), "0");
    assertEquals(sf.getStrand(), 1);
    assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
    assertEquals(sf.getValue("exon number"), 1);
    assertEquals(sf.getValue("product"), "hypothetical protein");

    /*
     * CDS at 7022-7502 is the first exon of the circular DNA CDS
     */
    sf = features.get(8);
    assertEquals(sf.getBegin(), 7022);
    assertEquals(sf.getEnd(), 7502);
    assertEquals(sf.getDescription(),
            "Exon 1 for protein EMBLCDS:AAA91567.1");
    assertEquals(sf.getFeatureGroup(), "EmblTest");
    assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
    assertEquals(sf.getPhase(), "0");
    assertEquals(sf.getStrand(), 1);
    assertEquals(sf.getValue("note"), "pGP7-D");
    assertEquals(sf.getValue("exon number"), 1);
    assertEquals(sf.getValue("product"), "hypothetical protein");

    /*
     * there are 4 'direct' (DR) dbrefs, and numerous CDS /db_xref entries,
     * some of them (e.g. INTERPRO) duplicates; sample a few here
     * Note DBRefEntry constructor capitalises source
     */
    List<DBRefEntry> dbrefs = seq.getDBRefs();
    assertEquals(dbrefs.size(), 31);
    // 1st DR line; note trailing period is removed
    assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0",
            "d4c4942a634e3df4995fd5ac75c26a61")));
    // the 4th DR line:
    assertTrue(
            dbrefs.contains(new DBRefEntry("EuropePMC", "0", "PMC87941")));
    // from the first CDS feature; note canonicalisation to "UNIPROT"
    assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
    assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19")));
    // from the last CDS feature
    assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350")));

    // todo: mappings to, and sequences for, UNIPROT proteins
  }
}