JAL-1705 JAL-1191 SequenceOntologyLite added as hard-coded alternative
[jalview.git] / test / jalview / ext / ensembl / EnsemblSeqProxyTest.java
1 package jalview.ext.ensembl;
2
3 import static org.testng.AssertJUnit.assertEquals;
4
5 import jalview.datamodel.Alignment;
6 import jalview.datamodel.AlignmentI;
7 import jalview.datamodel.Sequence;
8 import jalview.datamodel.SequenceFeature;
9 import jalview.datamodel.SequenceI;
10 import jalview.io.AppletFormatAdapter;
11 import jalview.io.FastaFile;
12 import jalview.io.FileParse;
13 import jalview.io.gff.SequenceOntologyFactory;
14 import jalview.io.gff.SequenceOntologyLite;
15
16 import java.lang.reflect.Method;
17 import java.net.MalformedURLException;
18 import java.net.URL;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.List;
22
23 import org.testng.Assert;
24 import org.testng.annotations.DataProvider;
25 import org.testng.annotations.Test;
26
27
28 public class EnsemblSeqProxyTest
29 {
  /**
   * Fixture rows returned by the "ens_seqs" data provider. Each row holds, in
   * order: a client instance used to fetch the sequence, the accession id to
   * query, and the expected result as FASTA text (a header line followed by
   * sequence lines).
   */
  private static final Object[][] allSeqs = new Object[][] {
      // row 1: EnsemblProtein client, accession CCDS5863.1
      {
          new EnsemblProtein(),
          "CCDS5863.1",
          ">CCDS5863.1\n"
                  + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
                  + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
                  + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
                  + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
                  + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
                  + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
                  + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
                  + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
                  + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
                  + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
                  + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
                  + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
      // row 2: EnsemblCdna client, same accession CCDS5863.1, nucleotide FASTA
      {
          new EnsemblCdna(),
          "CCDS5863.1",
          ">CCDS5863.1\n"
                  + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
                  + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
                  + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
                  + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
                  + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
                  + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
                  + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
                  + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
                  + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
                  + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
                  + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
                  + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
                  + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
                  + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
                  + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
                  + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
                  + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
                  + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
                  + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
                  + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
                  + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
                  + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
                  + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
                  + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
                  + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
                  + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
                  + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
                  + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
                  + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
                  + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
                  + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
                  + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
                  + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
                  + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
                  + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
                  + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
                  + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
                  + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
                  + "GGTGCGTTTCCTGTCCACTGA\n" },
      // row 3: EnsemblProtein client, accession ENSP00000288602; unlike the
      // other rows, this FASTA text has no trailing newline
      {
          new EnsemblProtein(),
          "ENSP00000288602",
          ">ENSP00000288602\n"
                  + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
                  + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
                  + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
                  + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
                  + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
                  + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
                  + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
                  + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
                  + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
                  + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
                  + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
                  + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
108
109   @DataProvider(name = "queries")
110   public Object[][] createQueryData(Method m)
111   {
112     return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } };
113   }
114
115   @Test(dataProvider = "queries")
116   public void testIsValidReference(String query) throws Exception
117   {
118     EnsemblSequenceFetcher esq = new EnsemblProtein();
119     Assert.assertTrue(esq.isValidReference(query),
120             "Expected reference string " + query
121                     + " to be valid for regex "
122                     + esq.getAccessionValidator().toString());
123   }
124
125   @DataProvider(name = "ens_seqs")
126   public Object[][] createData(Method m)
127   {
128     System.out.println(m.getName());
129     return allSeqs;
130   }
131
132   @Test(dataProvider = "ens_seqs", suiteName = "live")
133   public void testGetOneSeqs(EnsemblRestClient proxy, String sq, String fastasq)
134           throws Exception
135   {
136     FileParse fp = proxy.getSequenceReader(Arrays
137             .asList(new String[]
138     { sq }));
139     SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
140     FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
141     SequenceI[] trueSqs = trueRes.getSeqsAsArray();
142     Assert.assertEquals(sqs.length, trueSqs.length,
143             "Different number of sequences retrieved for query " + sq);
144     Alignment ral = new Alignment(sqs);
145     for (SequenceI tr : trueSqs)
146     {
147       SequenceI[] rseq;
148       Assert.assertNotNull(
149               rseq = ral.findSequenceMatch(tr.getName()),
150               "Couldn't find sequences matching expected sequence "
151                       + tr.getName());
152       Assert.assertEquals(rseq.length, 1,
153               "Expected only one sequence for sequence ID " + tr.getName());
154       Assert.assertEquals(
155               rseq[0].getSequenceAsString(),
156               tr.getSequenceAsString(),
157               "Sequences differ for " + tr.getName() + "\n" + "Exp:"
158                       + tr.getSequenceAsString() + "\n" + "Got:"
159                       + rseq[0].getSequenceAsString());
160   
161     }
162   }
163
164   @Test(suiteName = "live")
165   public void testLiveCheckEnsembl()
166   {
167     EnsemblRestClient sf = new EnsemblRestClient()
168     {
169
170       @Override
171       public String getDbName()
172       {
173         // TODO Auto-generated method stub
174         return null;
175       }
176
177       @Override
178       public AlignmentI getSequenceRecords(String queries) throws Exception
179       {
180         // TODO Auto-generated method stub
181         return null;
182       }
183
184       @Override
185       protected URL getUrl(List<String> ids) throws MalformedURLException
186       {
187         // TODO Auto-generated method stub
188         return null;
189       }
190
191       @Override
192       protected boolean useGetRequest()
193       {
194         // TODO Auto-generated method stub
195         return false;
196       }
197
198       @Override
199       protected String getRequestMimeType(boolean b)
200       {
201         // TODO Auto-generated method stub
202         return null;
203       }
204
205       @Override
206       protected String getResponseMimeType()
207       {
208         // TODO Auto-generated method stub
209         return null;
210       }
211
212     };
213     boolean isAvailable = sf.isEnsemblAvailable();
214     System.out.println("Ensembl is "
215             + (isAvailable ? "UP!"
216                     : "DOWN or unreachable ******************* BAD!"));
217   }
218
219   /**
220    * Tests for the method that computes all peptide variants given codon
221    * variants
222    */
223   @Test(groups = "Functional")
224   public void testComputePeptideVariants()
225   {
226     String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
227
228     /*
229      * AGT codes for S - this is not included in the variants returned
230      */
231     List<String> variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
232     assertEquals("[]", variants.toString());
233
234     // S is reported if it differs from the current value (A):
235     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A");
236     assertEquals("[S]", variants.toString());
237
238     /*
239      * synonymous variant is not reported
240      */
241     codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
242     // AGC and AGT both code for S
243     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s");
244     assertEquals("[]", variants.toString());
245
246     /*
247      * equivalent variants are only reported once
248      */
249     codonVariants = new String[][] { { "C" }, { "T" },
250         { "A", "C", "G", "T" } };
251     // CTA CTC CTG CTT all code for L
252     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
253     assertEquals("[L]", variants.toString());
254
255     /*
256      * vary codons 1 and 2; variant products are sorted and non-redundant
257      */
258     codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
259     // aga ata cga cta code for R, I, R, L
260     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
261     assertEquals("[I, L, R]", variants.toString());
262
263     /*
264      * vary codons 2 and 3
265      */
266     codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
267     // aga agc ata atc code for R, S, I, I
268     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
269     assertEquals("[I, R]", variants.toString());
270
271     /*
272      * vary codons 1 and 3
273      */
274     codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
275     // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
276     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
277     assertEquals("[K, N, Y, STOP]", variants.toString());
278
279     /*
280      * vary codons 1, 2 and 3
281      */
282     codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
283         { "t", "g" } };
284     // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
285     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
286     assertEquals("[C, R, T, W]", variants.toString());
287   }
288   
289   /**
290    * Tests for the method that maps the subset of a dna sequence that has CDS
291    * (or subtype) feature.
292    */
293   @Test(groups = "Functional")
294   public void testGetCdsRanges()
295   {
296     EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
297
298     SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
299     dnaSeq.createDatasetSequence();
300     SequenceI ds = dnaSeq.getDatasetSequence();
301
302     // CDS for dna 3-6
303     SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
304     ds.addSequenceFeature(sf);
305     // exon feature should be ignored here
306     sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
307     ds.addSequenceFeature(sf);
308     // CDS for dna 10-12
309     sf = new SequenceFeature("some_cds", "", 10, 12, 0f, null);
310     ds.addSequenceFeature(sf);
311
312     SequenceOntologyFactory.setInstance(new SequenceOntologyLite());
313     List<int[]> ranges = new ArrayList<int[]>();
314     int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
315     assertEquals(6, mappedLength);
316     assertEquals(2, ranges.size());
317     assertEquals(4, ranges.get(0)[0]);
318     assertEquals(6, ranges.get(0)[1]);
319     assertEquals(10, ranges.get(1)[0]);
320     assertEquals(12, ranges.get(1)[1]);
321
322   }
323
  /**
   * Placeholder test in the "Functional" group. The body is empty, so it makes
   * no assertions and always passes.
   */
  @Test(groups = "Functional")
  public void getGenomicRangesFromFeatures()
  {
    // TODO not yet implemented - no behaviour is verified here

  }
329
330   /**
331    * Tests for the method that maps the subset of a dna sequence that has CDS
332    * (or subtype) feature - case where the start codon is incomplete.
333    */
334   @Test(groups = "Functional")
335   public void testGetCdsRanges_fivePrimeIncomplete()
336   {
337     EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
338   
339     SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
340     dnaSeq.createDatasetSequence();
341     SequenceI ds = dnaSeq.getDatasetSequence();
342   
343     // CDS for dna 5-6 (incomplete codon), 7-9
344     SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
345     sf.setPhase("2"); // skip 2 bases to start of next codon
346     ds.addSequenceFeature(sf);
347     ds.addSequenceFeature(sf);
348     // CDS for dna 13-15
349     sf = new SequenceFeature("some_cds", "", 13, 15, 0f, null);
350     ds.addSequenceFeature(sf);
351   
352     SequenceOntologyFactory.setInstance(new SequenceOntologyLite());
353     List<int[]> ranges = new ArrayList<int[]>();
354     int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
355
356     /*
357      * check the mapping starts with the first complete codon
358      */
359     assertEquals(6, mappedLength);
360     assertEquals(2, ranges.size());
361     assertEquals(7, ranges.get(0)[0]);
362     assertEquals(9, ranges.get(0)[1]);
363     assertEquals(13, ranges.get(1)[0]);
364     assertEquals(15, ranges.get(1)[1]);
365   }
366 }