JAL-1705 additional tests, validation regexp tweaks, javadoc
[jalview.git] / test / jalview / ext / ensembl / EnsemblSeqProxyTest.java
1 package jalview.ext.ensembl;
2
3 import static org.testng.AssertJUnit.assertEquals;
4
5 import jalview.datamodel.Alignment;
6 import jalview.datamodel.AlignmentI;
7 import jalview.datamodel.Sequence;
8 import jalview.datamodel.SequenceFeature;
9 import jalview.datamodel.SequenceI;
10 import jalview.io.AppletFormatAdapter;
11 import jalview.io.FastaFile;
12 import jalview.io.FileParse;
13 import jalview.io.gff.SequenceOntologyFactory;
14 import jalview.io.gff.SequenceOntologyLite;
15
16 import java.lang.reflect.Method;
17 import java.net.MalformedURLException;
18 import java.net.URL;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.List;
22
23 import org.testng.Assert;
24 import org.testng.annotations.AfterClass;
25 import org.testng.annotations.BeforeClass;
26 import org.testng.annotations.DataProvider;
27 import org.testng.annotations.Test;
28
29
30 public class EnsemblSeqProxyTest
31 {
32   private static final Object[][] allSeqs = new Object[][] {
33       {
34           new EnsemblProtein(),
35           "CCDS5863.1",
36           ">CCDS5863.1\n"
37                   + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
38                   + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
39                   + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
40                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
41                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
42                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
43                   + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
44                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
45                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
46                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
47                   + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
48                   + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
49                   + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
50       {
51           new EnsemblCdna(),
52           "CCDS5863.1",
53           ">CCDS5863.1\n"
54                   + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
55                   + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
56                   + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
57                   + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
58                   + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
59                   + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
60                   + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
61                   + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
62                   + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
63                   + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
64                   + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
65                   + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
66                   + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
67                   + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
68                   + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
69                   + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
70                   + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
71                   + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
72                   + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
73                   + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
74                   + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
75                   + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
76                   + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
77                   + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
78                   + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
79                   + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
80                   + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
81                   + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
82                   + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
83                   + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
84                   + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
85                   + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
86                   + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
87                   + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
88                   + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
89                   + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
90                   + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
91                   + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
92                   + "GGTGCGTTTCCTGTCCACTGA\n" },
93       {
94           new EnsemblProtein(),
95           "ENSP00000288602",
96           ">ENSP00000288602\n"
97                   + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
98                   + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
99                   + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
100                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
101                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
102                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
103                   + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
104                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
105                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
106                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
107                   + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
108                   + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
109                   + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
110
111   @BeforeClass
112   public void setUp()
113   {
114     SequenceOntologyFactory.setInstance(new SequenceOntologyLite());
115   }
116
117   @AfterClass
118   public void tearDown()
119   {
120     SequenceOntologyFactory.setInstance(null);
121   }
122
123   @DataProvider(name = "ens_seqs")
124   public Object[][] createData(Method m)
125   {
126     System.out.println(m.getName());
127     return allSeqs;
128   }
129
130   @Test(dataProvider = "ens_seqs", suiteName = "live")
131   public void testGetOneSeqs(EnsemblRestClient proxy, String sq, String fastasq)
132           throws Exception
133   {
134     FileParse fp = proxy.getSequenceReader(Arrays
135             .asList(new String[]
136     { sq }));
137     SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
138     FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
139     SequenceI[] trueSqs = trueRes.getSeqsAsArray();
140     Assert.assertEquals(sqs.length, trueSqs.length,
141             "Different number of sequences retrieved for query " + sq);
142     Alignment ral = new Alignment(sqs);
143     for (SequenceI tr : trueSqs)
144     {
145       SequenceI[] rseq;
146       Assert.assertNotNull(
147               rseq = ral.findSequenceMatch(tr.getName()),
148               "Couldn't find sequences matching expected sequence "
149                       + tr.getName());
150       Assert.assertEquals(rseq.length, 1,
151               "Expected only one sequence for sequence ID " + tr.getName());
152       Assert.assertEquals(
153               rseq[0].getSequenceAsString(),
154               tr.getSequenceAsString(),
155               "Sequences differ for " + tr.getName() + "\n" + "Exp:"
156                       + tr.getSequenceAsString() + "\n" + "Got:"
157                       + rseq[0].getSequenceAsString());
158   
159     }
160   }
161
162   @Test(suiteName = "live")
163   public void testLiveCheckEnsembl()
164   {
165     EnsemblRestClient sf = new EnsemblRestClient()
166     {
167
168       @Override
169       public String getDbName()
170       {
171         // TODO Auto-generated method stub
172         return null;
173       }
174
175       @Override
176       public AlignmentI getSequenceRecords(String queries) throws Exception
177       {
178         // TODO Auto-generated method stub
179         return null;
180       }
181
182       @Override
183       protected URL getUrl(List<String> ids) throws MalformedURLException
184       {
185         // TODO Auto-generated method stub
186         return null;
187       }
188
189       @Override
190       protected boolean useGetRequest()
191       {
192         // TODO Auto-generated method stub
193         return false;
194       }
195
196       @Override
197       protected String getRequestMimeType(boolean b)
198       {
199         // TODO Auto-generated method stub
200         return null;
201       }
202
203       @Override
204       protected String getResponseMimeType()
205       {
206         // TODO Auto-generated method stub
207         return null;
208       }
209
210     };
211     boolean isAvailable = sf.isEnsemblAvailable();
212     System.out.println("Ensembl is "
213             + (isAvailable ? "UP!"
214                     : "DOWN or unreachable ******************* BAD!"));
215   }
216
217   /**
218    * Tests for the method that computes all peptide variants given codon
219    * variants
220    */
221   @Test(groups = "Functional")
222   public void testComputePeptideVariants()
223   {
224     String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
225
226     /*
227      * AGT codes for S - this is not included in the variants returned
228      */
229     List<String> variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
230     assertEquals("[]", variants.toString());
231
232     // S is reported if it differs from the current value (A):
233     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A");
234     assertEquals("[S]", variants.toString());
235
236     /*
237      * synonymous variant is not reported
238      */
239     codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
240     // AGC and AGT both code for S
241     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s");
242     assertEquals("[]", variants.toString());
243
244     /*
245      * equivalent variants are only reported once
246      */
247     codonVariants = new String[][] { { "C" }, { "T" },
248         { "A", "C", "G", "T" } };
249     // CTA CTC CTG CTT all code for L
250     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
251     assertEquals("[L]", variants.toString());
252
253     /*
254      * vary codons 1 and 2; variant products are sorted and non-redundant
255      */
256     codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
257     // aga ata cga cta code for R, I, R, L
258     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
259     assertEquals("[I, L, R]", variants.toString());
260
261     /*
262      * vary codons 2 and 3
263      */
264     codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
265     // aga agc ata atc code for R, S, I, I
266     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
267     assertEquals("[I, R]", variants.toString());
268
269     /*
270      * vary codons 1 and 3
271      */
272     codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
273     // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
274     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
275     assertEquals("[K, N, Y, STOP]", variants.toString());
276
277     /*
278      * vary codons 1, 2 and 3
279      */
280     codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
281         { "t", "g" } };
282     // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
283     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
284     assertEquals("[C, R, T, W]", variants.toString());
285   }
286   
287   /**
288    * Tests for the method that maps the subset of a dna sequence that has CDS
289    * (or subtype) feature.
290    */
291   @Test(groups = "Functional")
292   public void testGetCdsRanges()
293   {
294     EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
295
296     SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
297     dnaSeq.createDatasetSequence();
298     SequenceI ds = dnaSeq.getDatasetSequence();
299
300     // CDS for dna 3-6
301     SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
302     ds.addSequenceFeature(sf);
303     // exon feature should be ignored here
304     sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
305     ds.addSequenceFeature(sf);
306     // CDS for dna 10-12
307     sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
308     ds.addSequenceFeature(sf);
309
310     List<int[]> ranges = new ArrayList<int[]>();
311     int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
312     assertEquals(6, mappedLength);
313     assertEquals(2, ranges.size());
314     assertEquals(4, ranges.get(0)[0]);
315     assertEquals(6, ranges.get(0)[1]);
316     assertEquals(10, ranges.get(1)[0]);
317     assertEquals(12, ranges.get(1)[1]);
318
319   }
320
321   @Test(groups = "Functional")
322   public void getGenomicRangesFromFeatures()
323   {
324
325   }
326
327   /**
328    * Tests for the method that maps the subset of a dna sequence that has CDS
329    * (or subtype) feature - case where the start codon is incomplete.
330    */
331   @Test(groups = "Functional")
332   public void testGetCdsRanges_fivePrimeIncomplete()
333   {
334     EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
335   
336     SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
337     dnaSeq.createDatasetSequence();
338     SequenceI ds = dnaSeq.getDatasetSequence();
339   
340     // CDS for dna 5-6 (incomplete codon), 7-9
341     SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
342     sf.setPhase("2"); // skip 2 bases to start of next codon
343     ds.addSequenceFeature(sf);
344     ds.addSequenceFeature(sf);
345     // CDS for dna 13-15
346     sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
347     ds.addSequenceFeature(sf);
348   
349     List<int[]> ranges = new ArrayList<int[]>();
350     int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
351
352     /*
353      * check the mapping starts with the first complete codon
354      */
355     assertEquals(6, mappedLength);
356     assertEquals(2, ranges.size());
357     assertEquals(7, ranges.get(0)[0]);
358     assertEquals(9, ranges.get(0)[1]);
359     assertEquals(13, ranges.get(1)[0]);
360     assertEquals(15, ranges.get(1)[1]);
361   }
362 }