JAL-1705 regular expression updates, tests, other refactoring
[jalview.git] / test / jalview / ext / ensembl / EnsemblSeqProxyTest.java
1 package jalview.ext.ensembl;
2
3 import static org.testng.AssertJUnit.assertEquals;
4 import static org.testng.AssertJUnit.assertFalse;
5 import static org.testng.AssertJUnit.assertTrue;
6
7 import jalview.datamodel.Alignment;
8 import jalview.datamodel.AlignmentI;
9 import jalview.datamodel.Sequence;
10 import jalview.datamodel.SequenceFeature;
11 import jalview.datamodel.SequenceI;
12 import jalview.io.AppletFormatAdapter;
13 import jalview.io.FastaFile;
14 import jalview.io.FileParse;
15 import jalview.io.gff.SequenceOntologyFactory;
16 import jalview.io.gff.SequenceOntologyLite;
17
18 import java.lang.reflect.Method;
19 import java.net.MalformedURLException;
20 import java.net.URL;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.List;
24
25 import org.testng.Assert;
26 import org.testng.annotations.AfterClass;
27 import org.testng.annotations.BeforeClass;
28 import org.testng.annotations.DataProvider;
29 import org.testng.annotations.Test;
30
31
32 public class EnsemblSeqProxyTest
33 {
34   private static final Object[][] allSeqs = new Object[][] {
35       {
36           new EnsemblProtein(),
37           "CCDS5863.1",
38           ">CCDS5863.1\n"
39                   + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
40                   + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
41                   + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
42                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
43                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
44                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
45                   + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
46                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
47                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
48                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
49                   + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
50                   + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
51                   + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
52       {
53           new EnsemblCdna(),
54           "CCDS5863.1",
55           ">CCDS5863.1\n"
56                   + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
57                   + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
58                   + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
59                   + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
60                   + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
61                   + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
62                   + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
63                   + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
64                   + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
65                   + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
66                   + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
67                   + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
68                   + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
69                   + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
70                   + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
71                   + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
72                   + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
73                   + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
74                   + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
75                   + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
76                   + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
77                   + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
78                   + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
79                   + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
80                   + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
81                   + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
82                   + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
83                   + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
84                   + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
85                   + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
86                   + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
87                   + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
88                   + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
89                   + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
90                   + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
91                   + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
92                   + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
93                   + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
94                   + "GGTGCGTTTCCTGTCCACTGA\n" },
95       {
96           new EnsemblProtein(),
97           "ENSP00000288602",
98           ">ENSP00000288602\n"
99                   + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
100                   + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
101                   + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
102                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
103                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
104                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
105                   + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
106                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
107                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
108                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
109                   + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
110                   + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
111                   + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
112
113   @BeforeClass
114   public void setUp()
115   {
116     SequenceOntologyFactory.setInstance(new SequenceOntologyLite());
117   }
118
119   @AfterClass
120   public void tearDown()
121   {
122     SequenceOntologyFactory.setInstance(null);
123   }
124
125   @DataProvider(name = "ens_seqs")
126   public Object[][] createData(Method m)
127   {
128     System.out.println(m.getName());
129     return allSeqs;
130   }
131
132   @Test(dataProvider = "ens_seqs", suiteName = "live")
133   public void testGetOneSeqs(EnsemblRestClient proxy, String sq, String fastasq)
134           throws Exception
135   {
136     FileParse fp = proxy.getSequenceReader(Arrays
137             .asList(new String[]
138     { sq }));
139     SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
140     FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
141     SequenceI[] trueSqs = trueRes.getSeqsAsArray();
142     Assert.assertEquals(sqs.length, trueSqs.length,
143             "Different number of sequences retrieved for query " + sq);
144     Alignment ral = new Alignment(sqs);
145     for (SequenceI tr : trueSqs)
146     {
147       SequenceI[] rseq;
148       Assert.assertNotNull(
149               rseq = ral.findSequenceMatch(tr.getName()),
150               "Couldn't find sequences matching expected sequence "
151                       + tr.getName());
152       Assert.assertEquals(rseq.length, 1,
153               "Expected only one sequence for sequence ID " + tr.getName());
154       Assert.assertEquals(
155               rseq[0].getSequenceAsString(),
156               tr.getSequenceAsString(),
157               "Sequences differ for " + tr.getName() + "\n" + "Exp:"
158                       + tr.getSequenceAsString() + "\n" + "Got:"
159                       + rseq[0].getSequenceAsString());
160   
161     }
162   }
163
164   @Test(suiteName = "live")
165   public void testLiveCheckEnsembl()
166   {
167     EnsemblRestClient sf = new EnsemblRestClient()
168     {
169
170       @Override
171       public String getDbName()
172       {
173         // TODO Auto-generated method stub
174         return null;
175       }
176
177       @Override
178       public AlignmentI getSequenceRecords(String queries) throws Exception
179       {
180         // TODO Auto-generated method stub
181         return null;
182       }
183
184       @Override
185       protected URL getUrl(List<String> ids) throws MalformedURLException
186       {
187         // TODO Auto-generated method stub
188         return null;
189       }
190
191       @Override
192       protected boolean useGetRequest()
193       {
194         // TODO Auto-generated method stub
195         return false;
196       }
197
198       @Override
199       protected String getRequestMimeType(boolean b)
200       {
201         // TODO Auto-generated method stub
202         return null;
203       }
204
205       @Override
206       protected String getResponseMimeType()
207       {
208         // TODO Auto-generated method stub
209         return null;
210       }
211
212     };
213     boolean isAvailable = sf.isEnsemblAvailable();
214     System.out.println("Ensembl is "
215             + (isAvailable ? "UP!"
216                     : "DOWN or unreachable ******************* BAD!"));
217   }
218
219   /**
220    * Tests for the method that computes all peptide variants given codon
221    * variants
222    */
223   @Test(groups = "Functional")
224   public void testComputePeptideVariants()
225   {
226     String[][] codonVariants = new String[][] { { "A" }, { "G" }, { "T" } };
227
228     /*
229      * AGT codes for S - this is not included in the variants returned
230      */
231     List<String> variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
232     assertEquals("[]", variants.toString());
233
234     // S is reported if it differs from the current value (A):
235     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "A");
236     assertEquals("[S]", variants.toString());
237
238     /*
239      * synonymous variant is not reported
240      */
241     codonVariants = new String[][] { { "A" }, { "G" }, { "C", "T" } };
242     // AGC and AGT both code for S
243     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "s");
244     assertEquals("[]", variants.toString());
245
246     /*
247      * equivalent variants are only reported once
248      */
249     codonVariants = new String[][] { { "C" }, { "T" },
250         { "A", "C", "G", "T" } };
251     // CTA CTC CTG CTT all code for L
252     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
253     assertEquals("[L]", variants.toString());
254
255     /*
256      * vary codons 1 and 2; variant products are sorted and non-redundant
257      */
258     codonVariants = new String[][] { { "a", "C" }, { "g", "T" }, { "A" } };
259     // aga ata cga cta code for R, I, R, L
260     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
261     assertEquals("[I, L, R]", variants.toString());
262
263     /*
264      * vary codons 2 and 3
265      */
266     codonVariants = new String[][] { { "a" }, { "g", "T" }, { "A", "c" } };
267     // aga agc ata atc code for R, S, I, I
268     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
269     assertEquals("[I, R]", variants.toString());
270
271     /*
272      * vary codons 1 and 3
273      */
274     codonVariants = new String[][] { { "a", "t" }, { "a" }, { "t", "g" } };
275     // aat aag tat tag code for N, K, Y, STOP - STOP sorted to end
276     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
277     assertEquals("[K, N, Y, STOP]", variants.toString());
278
279     /*
280      * vary codons 1, 2 and 3
281      */
282     codonVariants = new String[][] { { "a", "t" }, { "G", "C" },
283         { "t", "g" } };
284     // agt agg act acg tgt tgg tct tcg code for S, R, T, T, C, W, S, S
285     variants = EnsemblSeqProxy.computePeptideVariants(codonVariants, "S");
286     assertEquals("[C, R, T, W]", variants.toString());
287   }
288   
289   /**
290    * Tests for the method that maps the subset of a dna sequence that has CDS
291    * (or subtype) feature.
292    */
293   @Test(groups = "Functional")
294   public void testGetCdsRanges()
295   {
296     EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
297
298     SequenceI dnaSeq = new Sequence("dna", "aaaGGGcccAAATTTttt");
299     dnaSeq.createDatasetSequence();
300     SequenceI ds = dnaSeq.getDatasetSequence();
301
302     // CDS for dna 3-6
303     SequenceFeature sf = new SequenceFeature("CDS", "", 4, 6, 0f, null);
304     ds.addSequenceFeature(sf);
305     // exon feature should be ignored here
306     sf = new SequenceFeature("exon", "", 7, 9, 0f, null);
307     ds.addSequenceFeature(sf);
308     // CDS for dna 10-12
309     sf = new SequenceFeature("CDS_predicted", "", 10, 12, 0f, null);
310     ds.addSequenceFeature(sf);
311
312     List<int[]> ranges = new ArrayList<int[]>();
313     int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
314     assertEquals(6, mappedLength);
315     assertEquals(2, ranges.size());
316     assertEquals(4, ranges.get(0)[0]);
317     assertEquals(6, ranges.get(0)[1]);
318     assertEquals(10, ranges.get(1)[0]);
319     assertEquals(12, ranges.get(1)[1]);
320
321   }
322
323   @Test(groups = "Functional")
324   public void getGenomicRangesFromFeatures()
325   {
326
327   }
328
329   /**
330    * Tests for the method that maps the subset of a dna sequence that has CDS
331    * (or subtype) feature - case where the start codon is incomplete.
332    */
333   @Test(groups = "Functional")
334   public void testGetCdsRanges_fivePrimeIncomplete()
335   {
336     EnsemblSeqProxy testee = new EnsemblSeqProxyAdapter();
337   
338     SequenceI dnaSeq = new Sequence("dna", "aaagGGCCCaaaTTTttt");
339     dnaSeq.createDatasetSequence();
340     SequenceI ds = dnaSeq.getDatasetSequence();
341   
342     // CDS for dna 5-6 (incomplete codon), 7-9
343     SequenceFeature sf = new SequenceFeature("CDS", "", 5, 9, 0f, null);
344     sf.setPhase("2"); // skip 2 bases to start of next codon
345     ds.addSequenceFeature(sf);
346     // CDS for dna 13-15
347     sf = new SequenceFeature("CDS_predicted", "", 13, 15, 0f, null);
348     ds.addSequenceFeature(sf);
349   
350     List<int[]> ranges = new ArrayList<int[]>();
351     int mappedLength = testee.getCdsRanges(dnaSeq, ranges);
352
353     /*
354      * check the mapping starts with the first complete codon
355      */
356     assertEquals(6, mappedLength);
357     assertEquals(2, ranges.size());
358     assertEquals(7, ranges.get(0)[0]);
359     assertEquals(9, ranges.get(0)[1]);
360     assertEquals(13, ranges.get(1)[0]);
361     assertEquals(15, ranges.get(1)[1]);
362   }
363
364   @Test(groups = "Functional")
365   public void testIsTranscriptIdentifier()
366   {
367     assertFalse(EnsemblSeqProxy.isTranscriptIdentifier(null));
368     assertFalse(EnsemblSeqProxy.isTranscriptIdentifier(""));
369     assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("ENSG00000012345"));
370     assertTrue(EnsemblSeqProxy.isTranscriptIdentifier("ENST00000012345"));
371     assertTrue(EnsemblSeqProxy.isTranscriptIdentifier("ENSMUST00000012345"));
372     assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("enst00000012345"));
373     assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("ENST000000123456"));
374     assertFalse(EnsemblSeqProxy.isTranscriptIdentifier("ENST0000001234"));
375   }
376
377   @Test(groups = "Functional")
378   public void testIsGeneIdentifier()
379   {
380     assertFalse(EnsemblSeqProxy.isGeneIdentifier(null));
381     assertFalse(EnsemblSeqProxy.isGeneIdentifier(""));
382     assertFalse(EnsemblSeqProxy.isGeneIdentifier("ENST00000012345"));
383     assertTrue(EnsemblSeqProxy.isGeneIdentifier("ENSG00000012345"));
384     assertTrue(EnsemblSeqProxy.isGeneIdentifier("ENSMUSG00000012345"));
385     assertFalse(EnsemblSeqProxy.isGeneIdentifier("ensg00000012345"));
386     assertFalse(EnsemblSeqProxy.isGeneIdentifier("ENSG000000123456"));
387     assertFalse(EnsemblSeqProxy.isGeneIdentifier("ENSG0000001234"));
388   }
389 }