JAL-1705 tests added, minor bugfix and refactoring
[jalview.git] / test / jalview / ext / ensembl / EnsemblSeqProxyTest.java
1 package jalview.ext.ensembl;
2
3 import static org.testng.AssertJUnit.assertEquals;
4
5 import jalview.datamodel.Alignment;
6 import jalview.datamodel.AlignmentI;
7 import jalview.datamodel.SequenceI;
8 import jalview.io.AppletFormatAdapter;
9 import jalview.io.FastaFile;
10 import jalview.io.FileParse;
11
12 import java.lang.reflect.Method;
13 import java.net.MalformedURLException;
14 import java.net.URL;
15 import java.util.Arrays;
16 import java.util.List;
17
18 import org.testng.Assert;
19 import org.testng.annotations.DataProvider;
20 import org.testng.annotations.Test;
21
22
23 public class EnsemblSeqProxyTest
24 {
  /**
   * Reference data returned by the "ens_seqs" data provider (see
   * {@code createData}) and consumed by {@code testGetOneSeqs}. Each row holds
   * three values, in the order of that test's parameters:
   * <ol>
   * <li>the Ensembl client instance to query</li>
   * <li>the identifier to fetch</li>
   * <li>the expected result as FASTA text (header line plus sequence lines)</li>
   * </ol>
   */
  private static final Object[][] allSeqs = new Object[][] {
      {
          // row 1: expected protein sequence for query CCDS5863.1
          new EnsemblProtein(),
          "CCDS5863.1",
          ">CCDS5863.1\n"
                  + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
                  + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
                  + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
                  + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
                  + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
                  + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
                  + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
                  + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
                  + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
                  + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
                  + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
                  + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
      {
          // row 2: expected cDNA sequence for the same query, CCDS5863.1
          new EnsemblCdna(),
          "CCDS5863.1",
          ">CCDS5863.1\n"
                  + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
                  + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
                  + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
                  + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
                  + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
                  + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
                  + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
                  + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
                  + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
                  + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
                  + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
                  + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
                  + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
                  + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
                  + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
                  + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
                  + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
                  + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
                  + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
                  + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
                  + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
                  + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
                  + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
                  + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
                  + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
                  + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
                  + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
                  + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
                  + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
                  + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
                  + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
                  + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
                  + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
                  + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
                  + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
                  + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
                  + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
                  + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
                  + "GGTGCGTTTCCTGTCCACTGA\n" },
      {
          // row 3: expected protein sequence for query ENSP00000288602; the
          // residues are the same as in row 1. Unlike the two rows above,
          // this FASTA text has no trailing newline.
          new EnsemblProtein(),
          "ENSP00000288602",
          ">ENSP00000288602\n"
                  + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
                  + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
                  + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
                  + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
                  + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
                  + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
                  + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
                  + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
                  + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
                  + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
                  + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
                  + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
                  + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
103
104   @DataProvider(name = "queries")
105   public Object[][] createQueryData(Method m)
106   {
107     return new Object[][] { { "CCDS5863.1" }, { "ENSP00000288602" } };
108   }
109
110   @Test(dataProvider = "queries")
111   public void testIsValidReference(String query) throws Exception
112   {
113     EnsemblSequenceFetcher esq = new EnsemblProtein();
114     Assert.assertTrue(esq.isValidReference(query),
115             "Expected reference string " + query
116                     + " to be valid for regex "
117                     + esq.getAccessionValidator().toString());
118   }
119
120   @DataProvider(name = "ens_seqs")
121   public Object[][] createData(Method m)
122   {
123     System.out.println(m.getName());
124     return allSeqs;
125   }
126
127   @Test(dataProvider = "ens_seqs", suiteName = "live")
128   public void testGetOneSeqs(EnsemblRestClient proxy, String sq, String fastasq)
129           throws Exception
130   {
131     FileParse fp = proxy.getSequenceReader(Arrays
132             .asList(new String[]
133     { sq }));
134     SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
135     FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
136     SequenceI[] trueSqs = trueRes.getSeqsAsArray();
137     Assert.assertEquals(sqs.length, trueSqs.length,
138             "Different number of sequences retrieved for query " + sq);
139     Alignment ral = new Alignment(sqs);
140     for (SequenceI tr : trueSqs)
141     {
142       SequenceI[] rseq;
143       Assert.assertNotNull(
144               rseq = ral.findSequenceMatch(tr.getName()),
145               "Couldn't find sequences matching expected sequence "
146                       + tr.getName());
147       Assert.assertEquals(rseq.length, 1,
148               "Expected only one sequence for sequence ID " + tr.getName());
149       Assert.assertEquals(
150               rseq[0].getSequenceAsString(),
151               tr.getSequenceAsString(),
152               "Sequences differ for " + tr.getName() + "\n" + "Exp:"
153                       + tr.getSequenceAsString() + "\n" + "Got:"
154                       + rseq[0].getSequenceAsString());
155   
156     }
157   }
158
159   @Test(suiteName = "live")
160   public void testLiveCheckEnsembl()
161   {
162     EnsemblRestClient sf = new EnsemblRestClient()
163     {
164
165       @Override
166       public String getDbName()
167       {
168         // TODO Auto-generated method stub
169         return null;
170       }
171
172       @Override
173       public AlignmentI getSequenceRecords(String queries) throws Exception
174       {
175         // TODO Auto-generated method stub
176         return null;
177       }
178
179       @Override
180       protected URL getUrl(List<String> ids) throws MalformedURLException
181       {
182         // TODO Auto-generated method stub
183         return null;
184       }
185
186       @Override
187       protected boolean useGetRequest()
188       {
189         // TODO Auto-generated method stub
190         return false;
191       }
192
193       @Override
194       protected String getRequestMimeType(boolean b)
195       {
196         // TODO Auto-generated method stub
197         return null;
198       }
199
200       @Override
201       protected String getResponseMimeType()
202       {
203         // TODO Auto-generated method stub
204         return null;
205       }
206
207     };
208     boolean isAvailable = sf.isEnsemblAvailable();
209     System.out.println("Ensembl is "
210             + (isAvailable ? "UP!"
211                     : "DOWN or unreachable ******************* BAD!"));
212   }
213
214   /**
215    * Tests for the method that computes all peptide variants given codon
216    * variants
217    */
218   @Test(groups = "Functional")
219   public void testComputePeptideVariants()
220   {
221     String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
222
223     /*
224      * AGT codes for S - this is not included in the variants returned
225      */
226     List<String> variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
227     assertEquals("[]", variants.toString());
228
229     // S is reported if it differs from the current value (A):
230     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A");
231     assertEquals("[S]", variants.toString());
232
233     /*
234      * synonymous variant is not reported
235      */
236     codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
237     // AGC and AGT both code for S
238     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s");
239     assertEquals("[]", variants.toString());
240
241     /*
242      * equivalent variants are only reported once
243      */
244     codonVariants = new String[][] { { "C" }, { "T" },
245         { "A", "C", "G", "T" } };
246     // CTA CTC CTG CTT all code for L
247     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
248     assertEquals("[L]", variants.toString());
249
250     /*
251      * vary codons 1 and 2; variant products are sorted and non-redundant
252      */
253     codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
254     // aga ata cga cta code for R, I, R, L
255     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
256     assertEquals("[I, L, R]", variants.toString());
257
258     /*
259      * vary codons 2 and 3
260      */
261     codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
262     // aga agc ata atc code for R, S, I, I
263     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
264     assertEquals("[I, R]", variants.toString());
265
266     /*
267      * vary codons 1 and 3
268      */
269     codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
270     // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
271     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
272     assertEquals("[K, N, Y, STOP]", variants.toString());
273
274     /*
275      * vary codons 1, 2 and 3
276      */
277     codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
278         { "t", "g" } };
279     // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
280     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
281     assertEquals("[C, R, T, W]", variants.toString());
282   }
283 }