package jalview.ext.ensembl;
-import jalview.datamodel.Alignment;
-import jalview.datamodel.SequenceI;
-import jalview.ext.ensembl.SeqFetcher.EnsemblSeqType;
-import jalview.io.AppletFormatAdapter;
-import jalview.io.FastaFile;
-import jalview.io.FileParse;
-import jalview.util.DBRefUtils;
-
import java.lang.reflect.Method;
-import java.util.Arrays;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+
public class EnsemblSeqProxyTest
{
- @Test
- public void testCheckEnsembl()
- {
- SeqFetcher sf = new SeqFetcher();
- sf.setTestEnsemblStatus(true);
- sf.setTesting(true);
- Assert.assertTrue(sf.isEnsemblAvailable());
- sf.setTestEnsemblStatus(false);
- Assert.assertFalse(sf.isEnsemblAvailable());
- }
-
- @Test(suiteName = "live")
- public void testLiveCheckEnsembl()
- {
- SeqFetcher sf = new SeqFetcher();
- boolean isAvailable = sf.isEnsemblAvailable();
- System.out.println("Ensembl is "
- + (isAvailable ? "UP!" : "DOWN ******************* BAD!"));
- }
-
- @DataProvider(name = "ens_seqs")
+ /**
+ * Supplies the accession strings for tests that use the "queries" data
+ * provider. Each row holds exactly one query string.
+ *
+ * @param m
+ * the test method handed in by TestNG (not used by this provider)
+ * @return one single-element row per query
+ */
+ @DataProvider(name = "queries")
public Object[][] createData(Method m)
{
- System.out.println(m.getName());
- return allSeqs;
+ final Object[][] queries = { { "CCDS5863.1" }, { "ENSP00000288602" } };
+ return queries;
}
- public static Object[][] allSeqs = new Object[][]
- {
- {
- EnsemblSeqType.PROTEIN,
- "CCDS5863.1",
- ">CCDS5863.1\n"
- + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
- + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
- + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
- + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
- + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
- + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
- + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
- + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
- + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
- + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
- + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
- + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
- + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH\n" },
- {
- EnsemblSeqType.TRANSCRIPT,
- "CCDS5863.1",
- ">CCDS5863.1\n"
- + "ATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAAC\n"
- + "GGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGAC\n"
- + "CCTGCCATTCCGGAGGAGGTGTGGAATATCAAACAAATGATTAAGTTGACACAGGAACAT\n"
- + "ATAGAGGCCCTATTGGACAAATTTGGTGGGGAGCATAATCCACCATCAATATATCTGGAG\n"
- + "GCCTATGAAGAATACACCAGCAAGCTAGATGCACTCCAACAAAGAGAACAACAGTTATTG\n"
- + "GAATCTCTGGGGAACGGAACTGATTTTTCTGTTTCTAGCTCTGCATCAATGGATACCGTT\n"
- + "ACATCTTCTTCCTCTTCTAGCCTTTCAGTGCTACCTTCATCTCTTTCAGTTTTTCAAAAT\n"
- + "CCCACAGATGTGGCACGGAGCAACCCCAAGTCACCACAAAAACCTATCGTTAGAGTCTTC\n"
- + "CTGCCCAACAAACAGAGGACAGTGGTACCTGCAAGGTGTGGAGTTACAGTCCGAGACAGT\n"
- + "CTAAAGAAAGCACTGATGATGAGAGGTCTAATCCCAGAGTGCTGTGCTGTTTACAGAATT\n"
- + "CAGGATGGAGAGAAGAAACCAATTGGTTGGGACACTGATATTTCCTGGCTTACTGGAGAA\n"
- + "GAATTGCATGTGGAAGTGTTGGAGAATGTTCCACTTACAACACACAACTTTGTACGAAAA\n"
- + "ACGTTTTTCACCTTAGCATTTTGTGACTTTTGTCGAAAGCTGCTTTTCCAGGGTTTCCGC\n"
- + "TGTCAAACATGTGGTTATAAATTTCACCAGCGTTGTAGTACAGAAGTTCCACTGATGTGT\n"
- + "GTTAATTATGACCAACTTGATTTGCTGTTTGTCTCCAAGTTCTTTGAACACCACCCAATA\n"
- + "CCACAGGAAGAGGCGTCCTTAGCAGAGACTGCCCTAACATCTGGATCATCCCCTTCCGCA\n"
- + "CCCGCCTCGGACTCTATTGGGCCCCAAATTCTCACCAGTCCGTCTCCTTCAAAATCCATT\n"
- + "CCAATTCCACAGCCCTTCCGACCAGCAGATGAAGATCATCGAAATCAATTTGGGCAACGA\n"
- + "GACCGATCCTCATCAGCTCCCAATGTGCATATAAACACAATAGAACCTGTCAATATTGAT\n"
- + "GACTTGATTAGAGACCAAGGATTTCGTGGTGATGGAGGATCAACCACAGGTTTGTCTGCT\n"
- + "ACCCCCCCTGCCTCATTACCTGGCTCACTAACTAACGTGAAAGCCTTACAGAAATCTCCA\n"
- + "GGACCTCAGCGAGAAAGGAAGTCATCTTCATCCTCAGAAGACAGGAATCGAATGAAAACA\n"
- + "CTTGGTAGACGGGACTCGAGTGATGATTGGGAGATTCCTGATGGGCAGATTACAGTGGGA\n"
- + "CAAAGAATTGGATCTGGATCATTTGGAACAGTCTACAAGGGAAAGTGGCATGGTGATGTG\n"
- + "GCAGTGAAAATGTTGAATGTGACAGCACCTACACCTCAGCAGTTACAAGCCTTCAAAAAT\n"
- + "GAAGTAGGAGTACTCAGGAAAACACGACATGTGAATATCCTACTCTTCATGGGCTATTCC\n"
- + "ACAAAGCCACAACTGGCTATTGTTACCCAGTGGTGTGAGGGCTCCAGCTTGTATCACCAT\n"
- + "CTCCATATCATTGAGACCAAATTTGAGATGATCAAACTTATAGATATTGCACGACAGACT\n"
- + "GCACAGGGCATGGATTACTTACACGCCAAGTCAATCATCCACAGAGACCTCAAGAGTAAT\n"
- + "AATATATTTCTTCATGAAGACCTCACAGTAAAAATAGGTGATTTTGGTCTAGCTACAGTG\n"
- + "AAATCTCGATGGAGTGGGTCCCATCAGTTTGAACAGTTGTCTGGATCCATTTTGTGGATG\n"
- + "GCACCAGAAGTCATCAGAATGCAAGATAAAAATCCATACAGCTTTCAGTCAGATGTATAT\n"
- + "GCATTTGGAATTGTTCTGTATGAATTGATGACTGGACAGTTACCTTATTCAAACATCAAC\n"
- + "AACAGGGACCAGATAATTTTTATGGTGGGACGAGGATACCTGTCTCCAGATCTCAGTAAG\n"
- + "GTACGGAGTAACTGTCCAAAAGCCATGAAGAGATTAATGGCAGAGTGCCTCAAAAAGAAA\n"
- + "AGAGATGAGAGACCACTCTTTCCCCAAATTCTCGCCTCTATTGAGCTGCTGGCCCGCTCA\n"
- + "TTGCCAAAAATTCACCGCAGTGCATCAGAACCCTCCTTGAATCGGGCTGGTTTCCAAACA\n"
- + "GAGGATTTTAGTCTATATGCTTGTGCTTCTCCAAAAACACCCATCCAGGCAGGGGGATAT\n"
- + "GGTGCGTTTCCTGTCCACTGA\n" },
- {
- EnsemblSeqType.PROTEIN,
- "ENSP00000288602",
- ">ENSP00000288602\n"
- + "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\n"
- + "IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\n"
- + "TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\n"
- + "LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\n"
- + "TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\n"
- + "PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\n"
- + "DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\n"
- + "GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\n"
- + "AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\n"
- + "LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATV\n"
- + "KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\n"
- + "NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\n"
- + "LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" } };
-
- @Test(dataProvider = "ens_seqs", suiteName = "live")
- public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
- throws Exception
- {
- SeqFetcher sf = new SeqFetcher();
- FileParse fp = sf.getSequenceReader(type, Arrays.asList(new String[]
- { sq }));
- SequenceI[] sqs = new FastaFile(fp).getSeqsAsArray();
- FastaFile trueRes = new FastaFile(fastasq, AppletFormatAdapter.PASTE);
- SequenceI[] trueSqs = trueRes.getSeqsAsArray();
- Assert.assertEquals(sqs.length, trueSqs.length,
- "Different number of sequences retrieved for query " + sq);
- Alignment ral = new Alignment(sqs);
- for (SequenceI tr : trueSqs)
- {
- SequenceI[] rseq;
- Assert.assertNotNull(
- rseq = ral.findSequenceMatch(tr.getName()),
- "Couldn't find sequences matching expected sequence "
- + tr.getName());
- Assert.assertEquals(rseq.length, 1,
- "Expected only one sequence for sequence ID " + tr.getName());
- Assert.assertEquals(
- rseq[0].getSequenceAsString(),
- tr.getSequenceAsString(),
- "Sequences differ for " + tr.getName() + "\n" + "Exp:"
- + tr.getSequenceAsString() + "\n" + "Got:"
- + rseq[0].getSequenceAsString());
-
- }
- }
-
- @Test(dataProvider = "ens_seqs")
- public void testRegexForProxy(EnsemblSeqType type, String sq,
- String fastasq) throws Exception
+ /**
+ * Checks that a query from the "queries" data provider is accepted by
+ * isValidReference() on an EnsemblProtein instance. On failure the message
+ * reports the query and the accession validator it was checked against.
+ *
+ * @param query
+ * accession string to validate
+ */
+ @Test(dataProvider = "queries")
+ public void testIsValidReference(String query) throws Exception
{
EnsemblSeqProxy esq = new EnsemblProtein();
- Assert.assertTrue(esq.isValidReference(sq),
- "Expected reference string " + sq + " to be valid for regex "
+ Assert.assertTrue(esq.isValidReference(query),
+ "Expected reference string " + query
+ + " to be valid for regex "
+ esq.getAccessionValidator().toString());
-
- Assert.assertEquals(sq, DBRefUtils.processQueryToAccessionFor(esq, sq),
- "Regex for " + esq.getClass().toString() + " not correct.");
}
- // TODO:
- // sequence query with ENSG and anything other than a genomic type will yield
- // sequences with different IDs which will
- // break the post-processing stage where DBRefs are assigned to sequences.
- // -> multiple_sequences = true is needed additional parameter
- // http://rest.ensembl.org/sequence/id/ENSG00000157764?content-type=text/x-json;type=protein;multiple_sequences=true
- // result with four transcripts, cds, cdna, and protein products.
- // *
- // features for ENG -
- // http://rest.ensembl.org/overlap/id/ENSG00000157764?feature=cds&feature=exon&feature=transcript&content-type=text/x-gff3
- // transcript: gives locus, all transcript products with ENSG parents
- // gene: give all ENSG on locus
- // exon: all exon boundaries. CDS same info.
-
- // @Test(dataProvider = "ens_seqs", suiteName = "live")
- // public void testGetOneSeqs(EnsemblSeqType type, String sq, String fastasq)
- // throws Exception
- // {
- //
- // {
- // Assert.assertTrue(rseq[0].getDBRef() != null
- // && rseq[0].getDBRef().length > 0,
- // "No database references added to sequence by fetcher.");
- // Assert.assertNotNull(DBRefUtils.searchRefs(rseq[0].getDBRef(),
- // new DBRefEntry("ENSEMBL", null, sq)),
- // "Could't find database references added to sequence by fetcher.");
- //
- // }
}
\ No newline at end of file