1 package jalview.ext.ensembl;
3 import jalview.datamodel.Alignment;
4 import jalview.datamodel.SequenceI;
5 import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
6 import jalview.io.AppletFormatAdapter;
7 import jalview.io.FastaFile;
8 import jalview.io.FileParse;
10 import java.lang.reflect.Method;
11 import java.util.Arrays;
13 import org.testng.Assert;
14 import org.testng.annotations.DataProvider;
15 import org.testng.annotations.Test;
17 public class SeqFetcherTest
19 private static final Object[][] allSeqs = new Object[][] {
21 EnsemblSeqType.PROTEIN,
24 + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
25 + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
26 + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
27 + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
28 + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
29 + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
30 + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
31 + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
32 + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
33 + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
34 + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
35 + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
36 + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
38 EnsemblSeqType.TRANSCRIPT,
41 + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
42 + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
43 + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
44 + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
45 + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
46 + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
47 + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
48 + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
49 + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
50 + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
51 + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
52 + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
53 + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
54 + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
55 + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
56 + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
57 + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
58 + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
59 + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
60 + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
61 + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
62 + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
63 + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
64 + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
65 + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
66 + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
67 + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
68 + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
69 + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
70 + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
71 + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
72 + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
73 + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
74 + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
75 + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
76 + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
77 + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
78 + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
79 + "GGTGCGTTTCCTGTCCACTGA\n" },
81 EnsemblSeqType.PROTEIN,
84 + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
85 + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
86 + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
87 + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
88 + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
89 + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
90 + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
91 + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
92 + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
93 + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
94 + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
95 + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
96 + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
98 @DataProvider(name = "ens_seqs")
99 public Object[][] createData(Method m)
101 System.out.println(m.getName());
105 @Test(dataProvider = "ens_seqs", suiteName = "live")
106 public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
109 SeqFetcher sf = new SeqFetcher();
110 FileParse fp = sf.getSequenceReader(type, Arrays.asList(new String[]
112 SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
113 FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
114 SequenceI[] trueSqs = trueRes.getSeqsAsArray();
115 Assert.assertEquals(sqs.length, trueSqs.length,
116 "Different number of sequences retrieved for query " + sq);
117 Alignment ral = new Alignment(sqs);
118 for (SequenceI tr : trueSqs)
121 Assert.assertNotNull(
122 rseq = ral.findSequenceMatch(tr.getName()),
123 "Couldn't find sequences matching expected sequence "
125 Assert.assertEquals(rseq.length, 1,
126 "Expected only one sequence for sequence ID " + tr.getName());
128 rseq[0].getSequenceAsString(),
129 tr.getSequenceAsString(),
130 "Sequences differ for " + tr.getName() + "\n" + "Exp:"
131 + tr.getSequenceAsString() + "\n" + "Got:"
132 + rseq[0].getSequenceAsString());
137 @Test(suiteName = "live")
138 public void testLiveCheckEnsembl()
140 SeqFetcher sf = new SeqFetcher();
141 boolean isAvailable = sf.isEnsemblAvailable();
142 System.out.println("Ensembl is "
143 + (isAvailable ? "UP!"
144 : "DOWN or unreachable ******************* BAD!"));
147 // sequence query with ENSG and anything other than a genomic type will yield
148 // sequences with different IDs which will
149 // break the post-processing stage where DBRefs are assigned to sequences.
150 // -> multiple_sequences = true is needed additional parameter
151 // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true
152 // result with four transcripts, cds, cdna, and protein products.
154 // features for ENG -
155 // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3
156 // transcript: gives locus, all transcript products with ENSG parents
157 // gene: give all ENSG on locus
158 // exon: all exon boundaries. CDS same info.
160 // @Test(dataProvider = "ens_seqs", suiteName = "live")
161 // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
166 // Assert.assertTrue(rseq[0].getDBRef() != null
167 // && rseq[0].getDBRef().length > 0,
168 // "No database references added to sequence by fetcher.");
169 // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(),
170 // new DBRefEntry("ENSEMBL", null, sq)),
171 // "Could't find database references added to sequence by fetcher.");