3 import static org.testng.Assert.assertEquals;
4 import static org.testng.Assert.assertTrue;
5 import static org.testng.AssertJUnit.assertNull;
8 import java.io.IOException;
9 import java.net.MalformedURLException;
10 import java.util.List;
13 import org.testng.annotations.BeforeClass;
14 import org.testng.annotations.Test;
16 import jalview.bin.Cache;
17 import jalview.datamodel.DBRefEntry;
18 import jalview.datamodel.Mapping;
19 import jalview.datamodel.SequenceFeature;
20 import jalview.datamodel.SequenceI;
21 import jalview.datamodel.features.SequenceFeatures;
22 import jalview.util.MapList;
24 public class GenBankFileTest
26 @BeforeClass(alwaysRun = true)
33 * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
34 * one of them reverse strand
36 * @throws MalformedURLException
// Functional test: parses the GenBank fixture test/jalview/io/J03321.gb with
// GenBankFile and asserts on the single resulting sequence, its CDS features
// and its database cross-references (dbrefs).
// NOTE(review): in this view the method has no visible braces and some
// statements appear to be absent (flagged individually below) - confirm
// against the complete file before relying on these notes.
39 @Test(groups = "Functional")
40 public void testParse() throws MalformedURLException, IOException
// Wrap the fixture file in a FileParse and hand it to the parser together
// with the label "GenBankTest".
// NOTE(review): the FileParse constructor call below ends on a comma and its
// remaining argument(s) are not visible here - confirm.
// NOTE(review): java.io.File and java.util.Set are used in this method but no
// matching import is visible at the top of the file - confirm they exist.
42 File dataFile = new File("test/jalview/io/J03321.gb");
43 FileParse fp = new FileParse(dataFile.getAbsolutePath(),
45 EMBLLikeFlatFile parser = new GenBankFile(fp, "GenBankTest");
46 List<SequenceI> seqs = parser.getSeqs();
// Exactly one sequence is expected. Its name is the label passed to the
// parser joined by '|' to J03321 (presumably the accession - confirm).
48 assertEquals(seqs.size(), 1);
49 SequenceI seq = seqs.get(0);
50 assertEquals(seq.getName(), "GenBankTest|J03321");
51 assertEquals(seq.getLength(), 7502);
52 assertEquals(seq.getDescription(),
53 "Chlamydia trachomatis plasmid pCHL1, complete sequence");
// Only one feature type, "CDS", is expected on the sequence.
56 * should be 9 CDS features (one is a 'join' of two exons)
58 Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
59 assertEquals(featureTypes.size(), 1);
60 assertTrue(featureTypes.contains("CDS"));
63 * inspect some features (sorted just for convenience of test assertions)
// Sorting makes the index-based lookups below deterministic; presumably the
// order is by start position (index 0 begins at 1, index 8 at 7022) - confirm
// the meaning of sortFeatures' second argument.
65 List<SequenceFeature> features = seq.getFeatures()
66 .getAllFeatures("CDS");
67 SequenceFeatures.sortFeatures(features, true);
68 assertEquals(features.size(), 9);
// Feature at index 0: positions 1..437, forward strand, recorded as exon 2 of
// the CDS whose location is join(7022..7502,1..437).
70 SequenceFeature sf = features.get(0);
71 assertEquals(sf.getBegin(), 1);
72 assertEquals(sf.getEnd(), 437);
73 assertEquals(sf.getDescription(),
74 "Exon 2 for protein EMBLCDS:AAA91567.1");
75 assertEquals(sf.getFeatureGroup(), "GenBankTest");
76 assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
77 assertEquals(sf.getPhase(), "0");
78 assertEquals(sf.getStrand(), 1);
79 assertEquals(sf.getValue("note"), "pGP7-D");
80 // this is the second exon of circular CDS!
81 assertEquals(sf.getValue("exon number"), 2);
82 assertEquals(sf.getValue("product"), "hypothetical protein");
83 assertEquals(sf.getValue("transl_table"), "11");
// NOTE(review): sf is not reassigned in this view, yet the assertions below
// expect a different feature (begin 488, reverse strand). A statement such as
// `sf = features.get(...)` appears to be missing here - confirm.
86 assertEquals(sf.getBegin(), 488);
87 assertEquals(sf.getEnd(), 1480);
88 assertEquals(sf.getDescription(),
89 "Exon 1 for protein EMBLCDS:AAA91568.1");
90 assertEquals(sf.getFeatureGroup(), "GenBankTest");
91 assertEquals(sf.getEnaLocation(), "complement(488..1480)");
92 assertEquals(sf.getPhase(), "0");
93 assertEquals(sf.getStrand(), -1); // reverse strand!
94 assertEquals(sf.getValue("note"), "pGP8-D");
95 assertEquals(sf.getValue("exon number"), 1);
96 assertEquals(sf.getValue("product"), "hypothetical protein");
// NOTE(review): again no visible reassignment of sf before the assertions for
// the feature at 6045..6788 - a `sf = features.get(...)` line appears to be
// missing here as well - confirm.
99 assertEquals(sf.getBegin(), 6045);
100 assertEquals(sf.getEnd(), 6788);
101 assertEquals(sf.getDescription(),
102 "Exon 1 for protein EMBLCDS:AAA91574.1");
103 assertEquals(sf.getFeatureGroup(), "GenBankTest");
104 assertEquals(sf.getEnaLocation(), "6045..6788");
105 assertEquals(sf.getPhase(), "0");
106 assertEquals(sf.getStrand(), 1);
107 assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
108 assertEquals(sf.getValue("exon number"), 1);
109 assertEquals(sf.getValue("product"), "hypothetical protein");
// Feature at index 8 (last of the 9): same join location and protein as the
// feature at index 0, but recorded as exon 1.
112 * CDS at 7022-7502 is the first exon of the circular CDS
114 sf = features.get(8);
115 assertEquals(sf.getBegin(), 7022);
116 assertEquals(sf.getEnd(), 7502);
117 assertEquals(sf.getDescription(),
118 "Exon 1 for protein EMBLCDS:AAA91567.1");
119 assertEquals(sf.getFeatureGroup(), "GenBankTest");
120 assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
121 assertEquals(sf.getPhase(), "0");
122 assertEquals(sf.getStrand(), 1);
123 assertEquals(sf.getValue("note"), "pGP7-D");
124 assertEquals(sf.getValue("exon number"), 1);
125 assertEquals(sf.getValue("product"), "hypothetical protein");
// Database cross-references: 9 entries are expected in total.
128 * GenBank doesn't declare accession or CDS xrefs;
129 * dbrefs are added by Jalview for
131 * protein products: 8
133 List<DBRefEntry> dbrefs = seq.getDBRefs();
135 assertEquals(dbrefs.size(), 9);
// Build the expected self-reference and check that the list contains an equal
// entry. The mapping uses the same range, 1..sequence length, on both sides
// with ratios 1 and 1. The DBRefEntry arguments are presumably
// (source, version, accession) - confirm against the constructor.
137 DBRefEntry selfRef = new DBRefEntry("GENBANKTEST", "1", "J03321");
138 int[] range = new int[] { 1, seq.getLength() };
139 selfRef.setMap(new Mapping(null, range, range, 1, 1));
140 assertTrue(dbrefs.contains(selfRef));
143 * dna should have dbref to itself, and to EMBLCDSPROTEIN
144 * for each /protein_id (synthesized as no UNIPROT xref)
// dbrefs[0]: the self-reference - no mapped-to sequence, and a 1:1 map
// covering 1..7502 on both sides.
146 // TODO check if we should synthesize EMBLCDSPROTEIN dbrefs
147 DBRefEntry dbref = dbrefs.get(0);
148 assertEquals(dbref.getSource(), "GENBANKTEST");
149 assertEquals(dbref.getAccessionId(), "J03321");
150 Mapping mapping = dbref.getMap();
151 assertNull(mapping.getTo());
152 MapList map = mapping.getMap();
153 assertEquals(map.getFromLowest(), 1);
154 assertEquals(map.getFromHighest(), 7502);
155 assertEquals(map.getToLowest(), 1);
156 assertEquals(map.getToHighest(), 7502);
157 assertEquals(map.getFromRatio(), 1);
158 assertEquals(map.getToRatio(), 1);
// dbrefs[1]: protein AAA91567.1, 305 residues, mapped 3:1. The 'from' extent
// is asserted as 1..7502, presumably because this CDS is the
// join(7022..7502,1..437) checked above - confirm.
160 // dbref to inferred EMBLCDSPROTEIN for first CDS
161 dbref = dbrefs.get(1);
162 assertEquals(dbref.getSource(), "EMBLCDSPROTEIN");
163 assertEquals(dbref.getAccessionId(), "AAA91567.1");
164 mapping = dbref.getMap();
165 SequenceI mapTo = mapping.getTo();
166 assertEquals(mapTo.getName(), "AAA91567.1");
167 // the /product qualifier transfers to protein product description
168 assertEquals(mapTo.getDescription(), "hypothetical protein");
169 String seqString = mapTo.getSequenceAsString();
170 assertEquals(seqString.length(), 305);
171 assertTrue(seqString.startsWith("MGSMAF"));
172 assertTrue(seqString.endsWith("QTPTIL"));
173 map = mapping.getMap();
174 assertEquals(map.getFromLowest(), 1);
175 assertEquals(map.getFromHighest(), 7502);
176 assertEquals(map.getToLowest(), 1);
177 assertEquals(map.getToHighest(), 305);
178 assertEquals(map.getFromRatio(), 3);
179 assertEquals(map.getToRatio(), 1);
// dbrefs[8] (last of the 9): protein AAA91574.1, 247 residues, mapped 3:1
// from 6045..6788 - the same coordinates as the "6045..6788" CDS feature
// asserted earlier.
181 // dbref to inferred EMBLCDSPROTEIN for last CDS
182 dbref = dbrefs.get(8);
183 assertEquals(dbref.getSource(), "EMBLCDSPROTEIN");
184 assertEquals(dbref.getAccessionId(), "AAA91574.1");
185 mapping = dbref.getMap();
186 mapTo = mapping.getTo();
187 assertEquals(mapTo.getName(), "AAA91574.1");
188 // the /product qualifier transfers to protein product description
189 assertEquals(mapTo.getDescription(), "hypothetical protein");
190 seqString = mapTo.getSequenceAsString();
191 assertEquals(seqString.length(), 247);
192 assertTrue(seqString.startsWith("MNKLK"));
193 assertTrue(seqString.endsWith("FKQKS"));
194 map = mapping.getMap();
195 assertEquals(map.getFromLowest(), 6045);
196 assertEquals(map.getFromHighest(), 6788);
197 assertEquals(map.getToLowest(), 1);
198 assertEquals(map.getToHighest(), 247);
199 assertEquals(map.getFromRatio(), 3);
200 assertEquals(map.getToRatio(), 1);