2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 import static org.testng.Assert.assertEquals;
24 import static org.testng.Assert.assertTrue;
25 import static org.testng.AssertJUnit.assertNull;
28 import java.io.IOException;
29 import java.net.MalformedURLException;
30 import java.util.List;
33 import org.testng.annotations.BeforeClass;
34 import org.testng.annotations.Test;
36 import jalview.bin.Console;
37 import jalview.datamodel.DBRefEntry;
38 import jalview.datamodel.Mapping;
39 import jalview.datamodel.SequenceFeature;
40 import jalview.datamodel.SequenceI;
41 import jalview.datamodel.features.SequenceFeatures;
42 import jalview.util.MapList;
// TestNG test class that parses the GenBank flat file J03321.gb with GenBankFile
// and checks the resulting sequence, its CDS features and its database references.
44 public class GenBankFileTest
46 @BeforeClass(alwaysRun = true)
53 * A fairly tough test, using J03321 (circular DNA), which has 8 CDS features,
54 * one of them reverse strand
56 * @throws MalformedURLException
59 @Test(groups = "Functional")
60 public void testParse() throws MalformedURLException, IOException
// read the test data file and parse it, passing "GenBankTest" as the second
// constructor argument (it shows up below in the sequence name and feature group)
62 File dataFile = new File("test/jalview/io/J03321.gb");
63 FileParse fp = new FileParse(dataFile.getAbsolutePath(),
65 EMBLLikeFlatFile parser = new GenBankFile(fp, "GenBankTest");
66 List<SequenceI> seqs = parser.getSeqs();
// exactly one sequence is expected, named "GenBankTest|J03321"
68 assertEquals(seqs.size(), 1);
69 SequenceI seq = seqs.get(0);
70 assertEquals(seq.getName(), "GenBankTest|J03321");
71 assertEquals(seq.getLength(), 7502);
72 assertEquals(seq.getDescription(),
73 "Chlamydia trachomatis plasmid pCHL1, complete sequence");
76 * should be 9 CDS features (one is a 'join' of two exons)
// "CDS" must be the only feature type present on the sequence
78 Set<String> featureTypes = seq.getFeatures().getFeatureTypes();
79 assertEquals(featureTypes.size(), 1);
80 assertTrue(featureTypes.contains("CDS"));
83 * inspect some features (sorted just for convenience of test assertions)
85 List<SequenceFeature> features = seq.getFeatures()
86 .getAllFeatures("CDS");
87 SequenceFeatures.sortFeatures(features, true);
88 assertEquals(features.size(), 9);
// features[0]: positions 1-437, exon 2 of the join(7022..7502,1..437) CDS
90 SequenceFeature sf = features.get(0);
91 assertEquals(sf.getBegin(), 1);
92 assertEquals(sf.getEnd(), 437);
93 assertEquals(sf.getDescription(),
94 "Exon 2 for protein EMBLCDS:AAA91567.1");
95 assertEquals(sf.getFeatureGroup(), "GenBankTest");
96 assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
97 assertEquals(sf.getPhase(), "0");
98 assertEquals(sf.getStrand(), 1);
99 assertEquals(sf.getValue("note"), "pGP7-D");
100 // this is the second exon of circular CDS!
101 assertEquals(sf.getValue("exon number"), 2);
102 assertEquals(sf.getValue("product"), "hypothetical protein");
103 assertEquals(sf.getValue("transl_table"), "11");
// features[1]: the complement(488..1480) CDS, expected on strand -1
105 sf = features.get(1);
106 assertEquals(sf.getBegin(), 488);
107 assertEquals(sf.getEnd(), 1480);
108 assertEquals(sf.getDescription(),
109 "Exon 1 for protein EMBLCDS:AAA91568.1");
110 assertEquals(sf.getFeatureGroup(), "GenBankTest");
111 assertEquals(sf.getEnaLocation(), "complement(488..1480)");
112 assertEquals(sf.getPhase(), "0");
113 assertEquals(sf.getStrand(), -1); // reverse strand!
114 assertEquals(sf.getValue("note"), "pGP8-D");
115 assertEquals(sf.getValue("exon number"), 1);
116 assertEquals(sf.getValue("product"), "hypothetical protein");
// features[7]: the single-exon CDS at 6045..6788 on strand 1
118 sf = features.get(7);
119 assertEquals(sf.getBegin(), 6045);
120 assertEquals(sf.getEnd(), 6788);
121 assertEquals(sf.getDescription(),
122 "Exon 1 for protein EMBLCDS:AAA91574.1");
123 assertEquals(sf.getFeatureGroup(), "GenBankTest");
124 assertEquals(sf.getEnaLocation(), "6045..6788");
125 assertEquals(sf.getPhase(), "0");
126 assertEquals(sf.getStrand(), 1);
127 assertEquals(sf.getValue("note"), "pGP6-D (gtg start codon)");
128 assertEquals(sf.getValue("exon number"), 1);
129 assertEquals(sf.getValue("product"), "hypothetical protein");
132 * CDS at 7022-7502 is the first exon of the circular CDS
// features[8]: shares ENA location and protein id AAA91567.1 with features[0]
134 sf = features.get(8);
135 assertEquals(sf.getBegin(), 7022);
136 assertEquals(sf.getEnd(), 7502);
137 assertEquals(sf.getDescription(),
138 "Exon 1 for protein EMBLCDS:AAA91567.1");
139 assertEquals(sf.getFeatureGroup(), "GenBankTest");
140 assertEquals(sf.getEnaLocation(), "join(7022..7502,1..437)");
141 assertEquals(sf.getPhase(), "0");
142 assertEquals(sf.getStrand(), 1);
143 assertEquals(sf.getValue("note"), "pGP7-D");
144 assertEquals(sf.getValue("exon number"), 1);
145 assertEquals(sf.getValue("product"), "hypothetical protein");
148 * GenBank doesn't declare accession or CDS xrefs;
149 * dbrefs are added by Jalview for
151 * protein products: 8
153 List<DBRefEntry> dbrefs = seq.getDBRefs();
155 assertEquals(dbrefs.size(), 9);
// build the expected self reference: source "GENBANKTEST", accession J03321,
// with a 1:1 mapping of the range [1, sequence length] onto itself
157 DBRefEntry selfRef = new DBRefEntry("GENBANKTEST", "1", "J03321");
158 int[] range = new int[] { 1, seq.getLength() };
159 selfRef.setMap(new Mapping(null, range, range, 1, 1));
160 assertTrue(dbrefs.contains(selfRef));
163 * dna should have dbref to itself, and to EMBLCDSPROTEIN
164 * for each /protein_id (synthesized as no UNIPROT xref)
166 // TODO check if we should synthesize EMBLCDSPROTEIN dbrefs
// dbrefs[0] is expected to be the self reference: no mapped-to sequence,
// and a 1:1 map covering 1-7502 on both sides
167 DBRefEntry dbref = dbrefs.get(0);
168 assertEquals(dbref.getSource(), "GENBANKTEST");
169 assertEquals(dbref.getAccessionId(), "J03321");
170 Mapping mapping = dbref.getMap();
171 assertNull(mapping.getTo());
172 MapList map = mapping.getMap();
173 assertEquals(map.getFromLowest(), 1);
174 assertEquals(map.getFromHighest(), 7502);
175 assertEquals(map.getToLowest(), 1);
176 assertEquals(map.getToHighest(), 7502);
177 assertEquals(map.getFromRatio(), 1);
178 assertEquals(map.getToRatio(), 1);
180 // dbref to inferred EMBLCDSPROTEIN for first CDS
// the mapped-to protein has 305 residues; the map's 'from' side spans 1-7502
// and the from:to ratio is 3:1
181 dbref = dbrefs.get(1);
182 assertEquals(dbref.getSource(), "EMBLCDSPROTEIN");
183 assertEquals(dbref.getAccessionId(), "AAA91567.1");
184 mapping = dbref.getMap();
185 SequenceI mapTo = mapping.getTo();
186 assertEquals(mapTo.getName(), "AAA91567.1");
187 // the /product qualifier transfers to protein product description
188 assertEquals(mapTo.getDescription(), "hypothetical protein");
189 String seqString = mapTo.getSequenceAsString();
190 assertEquals(seqString.length(), 305);
191 assertTrue(seqString.startsWith("MGSMAF"));
192 assertTrue(seqString.endsWith("QTPTIL"));
193 map = mapping.getMap();
194 assertEquals(map.getFromLowest(), 1);
195 assertEquals(map.getFromHighest(), 7502);
196 assertEquals(map.getToLowest(), 1);
197 assertEquals(map.getToHighest(), 305);
198 assertEquals(map.getFromRatio(), 3);
199 assertEquals(map.getToRatio(), 1);
201 // dbref to inferred EMBLCDSPROTEIN for last CDS
// the mapped-to protein has 247 residues; the map's 'from' side is 6045-6785
202 dbref = dbrefs.get(8);
203 assertEquals(dbref.getSource(), "EMBLCDSPROTEIN");
204 assertEquals(dbref.getAccessionId(), "AAA91574.1");
205 mapping = dbref.getMap();
206 mapTo = mapping.getTo();
207 assertEquals(mapTo.getName(), "AAA91574.1");
208 // the /product qualifier transfers to protein product description
209 assertEquals(mapTo.getDescription(), "hypothetical protein");
210 seqString = mapTo.getSequenceAsString();
211 assertEquals(seqString.length(), 247);
212 assertTrue(seqString.startsWith("MNKLK"));
213 assertTrue(seqString.endsWith("FKQKS"));
214 map = mapping.getMap();
215 assertEquals(map.getFromLowest(), 6045);
216 assertEquals(map.getFromHighest(), 6785); // excludes stop at 6788
217 assertEquals(map.getToLowest(), 1);
218 assertEquals(map.getToHighest(), 247);
219 assertEquals(map.getFromRatio(), 3);
220 assertEquals(map.getToRatio(), 1);