JAL-1705 DbSourceProxy properties converted to methods, tidy/format code
[jalview.git] / test / jalview / ext / ensembl / SeqFetcherTest.java
1 package jalview.ext.ensembl;
2
3 import jalview.datamodel.Alignment;
4 import jalview.datamodel.SequenceI;
5 import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
6 import jalview.io.AppletFormatAdapter;
7 import jalview.io.FastaFile;
8 import jalview.io.FileParse;
9
10 import java.lang.reflect.Method;
11 import java.util.Arrays;
12
13 import org.testng.Assert;
14 import org.testng.annotations.DataProvider;
15 import org.testng.annotations.Test;
16
17 public class SeqFetcherTest
18 {
19   private static final Object[][] allSeqs = new Object[][] {
20       {
21           EnsemblSeqType.PROTEIN,
22           "CCDS5863.1",
23           ">CCDS5863.1\n"
24                   + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
25                   + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
26                   + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
27                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
28                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
29                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
30                   + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
31                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
32                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
33                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
34                   + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
35                   + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
36                   + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
37       {
38           EnsemblSeqType.TRANSCRIPT,
39           "CCDS5863.1",
40           ">CCDS5863.1\n"
41                   + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
42                   + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
43                   + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
44                   + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
45                   + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
46                   + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
47                   + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
48                   + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
49                   + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
50                   + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
51                   + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
52                   + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
53                   + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
54                   + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
55                   + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
56                   + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
57                   + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
58                   + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
59                   + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
60                   + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
61                   + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
62                   + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
63                   + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
64                   + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
65                   + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
66                   + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
67                   + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
68                   + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
69                   + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
70                   + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
71                   + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
72                   + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
73                   + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
74                   + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
75                   + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
76                   + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
77                   + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
78                   + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
79                   + "GGTGCGTTTCCTGTCCACTGA\n" },
80       {
81           EnsemblSeqType.PROTEIN,
82           "ENSP00000288602",
83           ">ENSP00000288602\n"
84                   + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
85                   + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
86                   + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
87                   + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
88                   + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
89                   + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
90                   + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
91                   + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
92                   + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
93                   + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
94                   + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
95                   + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
96                   + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
97
98   @DataProvider(name = "ens_seqs")
99   public Object[][] createData(Method m)
100   {
101     System.out.println(m.getName());
102     return allSeqs;
103   }
104
105   @Test(dataProvider = "ens_seqs", suiteName = "live")
106   public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
107           throws Exception
108   {
109     SeqFetcher sf = new SeqFetcher();
110     FileParse fp = sf.getSequenceReader(type, Arrays.asList(new String[]
111     { sq }));
112     SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
113     FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
114     SequenceI[] trueSqs = trueRes.getSeqsAsArray();
115     Assert.assertEquals(sqs.length, trueSqs.length,
116             "Different number of sequences retrieved for query " + sq);
117     Alignment ral = new Alignment(sqs);
118     for (SequenceI tr : trueSqs)
119     {
120       SequenceI[] rseq;
121       Assert.assertNotNull(
122               rseq = ral.findSequenceMatch(tr.getName()),
123               "Couldn't find sequences matching expected sequence "
124                       + tr.getName());
125       Assert.assertEquals(rseq.length, 1,
126               "Expected only one sequence for sequence ID " + tr.getName());
127       Assert.assertEquals(
128               rseq[0].getSequenceAsString(),
129               tr.getSequenceAsString(),
130               "Sequences differ for " + tr.getName() + "\n" + "Exp:"
131                       + tr.getSequenceAsString() + "\n" + "Got:"
132                       + rseq[0].getSequenceAsString());
133   
134     }
135   }
136
137   @Test(suiteName = "live")
138   public void testLiveCheckEnsembl()
139   {
140     SeqFetcher sf = new SeqFetcher();
141     boolean isAvailable = sf.isEnsemblAvailable();
142     System.out.println("Ensembl is "
143             + (isAvailable ? "UP!"
144                     : "DOWN or unreachable ******************* BAD!"));
145   }
146   // TODO:
147   // sequence query with ENSG and anything other than a genomic type will yield
148   // sequences with different IDs which will
149   // break the post-processing stage where DBRefs are assigned to sequences.
150   // -> multiple_sequences = true is needed additional parameter
151   // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true
152   // result with four transcripts, cds, cdna, and protein products.
153   // *
154   // features for ENG -
155   // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3
156   // transcript: gives locus, all transcript products with ENSG parents
157   // gene: give all ENSG on locus
158   // exon: all exon boundaries. CDS same info.
159
160   // @Test(dataProvider = "ens_seqs", suiteName = "live")
161   // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
162   // throws Exception
163   // {
164   //
165   // {
166   // Assert.assertTrue(rseq[0].getDBRef() != null
167   // && rseq[0].getDBRef().length > 0,
168   // "No database references added to sequence by fetcher.");
169   // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(),
170   // new DBRefEntry("ENSEMBL", null, sq)),
171   // "Could't find database references added to sequence by fetcher.");
172   //
173   // }
174
175 }