3 import static org.testng.Assert.assertEquals;
4 import static org.testng.Assert.assertTrue;
7 import java.io.IOException;
8 import java.net.MalformedURLException;
12 import org.testng.annotations.Test;
14 import jalview.datamodel.DBRefEntry;
15 import jalview.datamodel.Mapping;
16 import jalview.datamodel.SequenceFeature;
17 import jalview.datamodel.SequenceI;
18 import jalview.datamodel.features.SequenceFeatures;
20 public class EmblFlatFileTest
23 * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
24 * one of them reverse strand
26 * @throws MalformedURLException
29 @Test(groups = "Functional")
30 public void testParse() throws MalformedURLException, IOException
32 File dataFile = new File("test/jalview/io/J03321.embl.txt");
33 FileParse fp = new FileParse(dataFile, DataSourceType.FILE);
34 EmblFlatFile parser = new EmblFlatFile(fp, "EmblTest");
36 List<SequenceI> seqs = parser.getSeqs();
38 assertEquals(seqs.size(), 1);
39 SequenceI seq = seqs.get(0);
40 assertEquals(seq.getName(), "EmblTest|J03321");
41 assertEquals(seq.getLength(), 7502);
42 assertEquals(seq.getDescription(), "Chlamydia trachomatis plasmid pCHL1, complete sequence");
45 * should be 9 CDS features (one is a 'join' of two exons)
47 Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
48 assertEquals(featureTypes.size(), 1);
49 assertTrue(featureTypes.contains("CDS"));
52 * inspect some features (sorted just for convenience of test assertions)
54 List<SequenceFeature> features = seq.getFeatures()
55 .getAllFeatures("CDS");
56 SequenceFeatures.sortFeatures(features, true);
57 assertEquals(features.size(), 9);
59 SequenceFeature sf = features.get(0);
60 assertEquals(sf.getBegin(), 1);
61 assertEquals(sf.getEnd(), 437);
62 assertEquals(sf.getDescription(),
63 "Exon 2 for protein EMBLCDS:AAA91567.1");
64 assertEquals(sf.getFeatureGroup(), "EmblTest");
65 assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
66 assertEquals(sf.getPhase(), "0");
67 assertEquals(sf.getStrand(), 1);
68 assertEquals(sf.getValue("note"), "pGP7-D");
69 // this is the second exon of circular CDS!
70 assertEquals(sf.getValue("exon number"), 2);
71 assertEquals(sf.getValue("product"), "hypothetical protein");
72 assertEquals(sf.getValue("transl_table"), "11");
75 assertEquals(sf.getBegin(), 488);
76 assertEquals(sf.getEnd(), 1480);
77 assertEquals(sf.getDescription(),
78 "Exon 1 for protein EMBLCDS:AAA91568.1");
79 assertEquals(sf.getFeatureGroup(), "EmblTest");
80 assertEquals(sf.getEnaLocation(), "complement(488..1480)");
81 assertEquals(sf.getPhase(), "0");
82 assertEquals(sf.getStrand(), -1); // reverse strand!
83 assertEquals(sf.getValue("note"), "pGP8-D");
84 assertEquals(sf.getValue("exon number"), 1);
85 assertEquals(sf.getValue("product"), "hypothetical protein");
88 assertEquals(sf.getBegin(), 6045);
89 assertEquals(sf.getEnd(), 6788);
90 assertEquals(sf.getDescription(),
91 "Exon 1 for protein EMBLCDS:AAA91574.1");
92 assertEquals(sf.getFeatureGroup(), "EmblTest");
93 assertEquals(sf.getEnaLocation(), "6045..6788");
94 assertEquals(sf.getPhase(), "0");
95 assertEquals(sf.getStrand(), 1);
96 assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
97 assertEquals(sf.getValue("exon number"), 1);
98 assertEquals(sf.getValue("product"), "hypothetical protein");
101 * CDS at 7022-7502 is the first exon of the circular CDS
103 sf = features.get(8);
104 assertEquals(sf.getBegin(), 7022);
105 assertEquals(sf.getEnd(), 7502);
106 assertEquals(sf.getDescription(),
107 "Exon 1 for protein EMBLCDS:AAA91567.1");
108 assertEquals(sf.getFeatureGroup(), "EmblTest");
109 assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
110 assertEquals(sf.getPhase(), "0");
111 assertEquals(sf.getStrand(), 1);
112 assertEquals(sf.getValue("note"), "pGP7-D");
113 assertEquals(sf.getValue("exon number"), 1);
114 assertEquals(sf.getValue("product"), "hypothetical protein");
117 * Jalview adds a dbref to 'self', and there are 4 'direct' (DR) dbrefs,
118 * and numerous CDS /db_xref entries (some e.g. INTERPRO are duplicates)
120 * Note DBRefEntry constructor capitalises source
122 List<DBRefEntry> dbrefs = seq.getDBRefs();
123 assertEquals(dbrefs.size(), 32);
125 DBRefEntry selfRef = new DBRefEntry("EMBLTEST", "1", "J03321");
126 int[] range = new int[] {1, seq.getLength()};
127 selfRef.setMap(new Mapping(null, range, range, 1, 1));
128 assertTrue(dbrefs.contains(selfRef));
130 // 1st DR line; note trailing period is removed
131 assertTrue(dbrefs.contains(new DBRefEntry("MD5", "0",
132 "d4c4942a634e3df4995fd5ac75c26a61")));
135 dbrefs.contains(new DBRefEntry("EUROPEPMC", "0", "PMC87941")));
136 // from the first CDS feature; note canonicalisation to "UNIPROT"
137 assertTrue(dbrefs.contains(new DBRefEntry("GOA", "0", "P0CE19")));
138 assertTrue(dbrefs.contains(new DBRefEntry("UNIPROT", "0", "P0CE19")));
139 // from the last CDS feature
140 assertTrue(dbrefs.contains(new DBRefEntry("INTERPRO", "0", "IPR005350")));
142 // todo: mappings to, and sequences for, UNIPROT proteins