Add method to SequenceUtil to clean the protein sequence
[jabaws.git] / testsrc / compbio / data / sequence / SequenceUtilTester.java
1 /*\r
2  * Copyright (c) 2009 Peter Troshin JAva Bioinformatics Analysis Web Services\r
3  * (JABAWS) @version: 1.0 This library is free software; you can redistribute it\r
4  * and/or modify it under the terms of the Apache License version 2 as published\r
5  * by the Apache Software Foundation This library is distributed in the hope\r
6  * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied\r
7  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
8  * Apache License for more details. A copy of the license is in\r
9  * apache_license.txt. It is also available here:\r
10  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or\r
11  * derived work distributed in source code form must include this copyright and\r
12  * license notice.\r
13  */\r
14 package compbio.data.sequence;\r
15 \r
16 import static org.testng.AssertJUnit.assertEquals;\r
17 import static org.testng.AssertJUnit.assertFalse;\r
18 import static org.testng.AssertJUnit.assertNotNull;\r
19 import static org.testng.AssertJUnit.assertTrue;\r
20 import static org.testng.AssertJUnit.fail;\r
21 \r
22 import java.io.FileInputStream;\r
23 import java.io.FileNotFoundException;\r
24 import java.io.FileOutputStream;\r
25 import java.io.IOException;\r
26 import java.io.InputStream;\r
27 import java.util.HashMap;\r
28 import java.util.HashSet;\r
29 import java.util.List;\r
30 import java.util.Map;\r
31 import java.util.Set;\r
32 \r
33 import org.testng.annotations.Test;\r
34 \r
35 import compbio.metadata.AllTestSuit;\r
36 \r
37 public class SequenceUtilTester {\r
38 \r
39         @Test()\r
40         public void testisNonAmbNucleotideSequence() {\r
41                 String dnaseq = "atgatTGACGCTGCTGatgtcgtgagtgga";\r
42                 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dnaseq));\r
43                 String dirtyDnaseq = "atgAGTggt\taGGTgc\ncgcACTgc gACtcgcGAt cgA ";\r
44                 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dirtyDnaseq));\r
45                 String nonDna = "atgfctgatgcatgcatgatgctga";\r
46                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
47 \r
48                 nonDna = "atgc1tgatgcatgcatgatgctga";\r
49                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
50 \r
51                 nonDna = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
52                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
53                 // String ambDna = "AGTCRYMKSWHBVDN"; // see IUPAC Nucleotide Code\r
54                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
55 \r
56         }\r
57 \r
58         @Test()\r
59         public void testCleanSequence() {\r
60                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
61                 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),\r
62                                 SequenceUtil.cleanSequence(dirtySeq));\r
63         }\r
64 \r
65         @Test()\r
66         public void testDeepCleanSequence() {\r
67                 String dirtySeq = "a!t?g.A;GTggt\ta12GGTgc\ncgc23AC\rTgc gAC<>.,?!|\\|/t@cg-c¬GA=_+(0){]}[:£$&^*\"t cgA ";\r
68                 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),\r
69                                 SequenceUtil.deepCleanSequence(dirtySeq));\r
70         }\r
71 \r
72         @Test()\r
73         public void testisProteinSequence() {\r
74                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
75                 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));\r
76                 String notaSeq = "atgc1tgatgcatgcatgatgctga";\r
77                 assertFalse(SequenceUtil.isProteinSequence(notaSeq));\r
78                 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
79                 assertTrue(SequenceUtil.isProteinSequence(AAseq));\r
80                 AAseq += "XU";\r
81                 assertFalse(SequenceUtil.isProteinSequence(AAseq));\r
82 \r
83         }\r
84 \r
85         @Test()\r
86         public void testCleanProteinSequence() {\r
87                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
88                 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));\r
89                 // This will still be NON protein sequence despite having only correct\r
90                 // letters because the letters match perfectly the nucleotide sequence!\r
91                 assertFalse(SequenceUtil.isProteinSequence(SequenceUtil\r
92                                 .cleanProteinSequence(dirtySeq)));\r
93 \r
94                 String notaSeq = "atgc1tgatgcatgcatgatgmctga";\r
95                 assertFalse(SequenceUtil.isProteinSequence(notaSeq));\r
96                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil\r
97                                 .cleanProteinSequence(notaSeq)));\r
98 \r
99                 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
100                 assertTrue(SequenceUtil.isProteinSequence(AAseq));\r
101                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil\r
102                                 .cleanProteinSequence(AAseq)));\r
103                 AAseq += "XU";\r
104 \r
105                 assertFalse(SequenceUtil.isProteinSequence(AAseq));\r
106                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil\r
107                                 .cleanProteinSequence(AAseq)));\r
108         }\r
109 \r
110         @Test()\r
111         public void testReadWriteFasta() {\r
112 \r
113                 try {\r
114                         FileInputStream fio = new FileInputStream(\r
115                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");\r
116                         assertNotNull(fio);\r
117                         List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);\r
118                         assertNotNull(fseqs);\r
119                         assertEquals(3, fseqs.size());\r
120                         assertEquals(3, fseqs.size());\r
121                         fio.close();\r
122                         FileOutputStream fou = new FileOutputStream(\r
123                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta.written");\r
124                         SequenceUtil.writeFasta(fou, fseqs);\r
125                         fou.close();\r
126                         FileOutputStream fou20 = new FileOutputStream(\r
127                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta20.written");\r
128                         SequenceUtil.writeFasta(fou20, fseqs, 21);\r
129                         fou20.close();\r
130 \r
131                 } catch (FileNotFoundException e) {\r
132                         e.printStackTrace();\r
133                         fail(e.getLocalizedMessage());\r
134                 } catch (IOException e) {\r
135                         e.printStackTrace();\r
136                         fail(e.getLocalizedMessage());\r
137                 }\r
138         }\r
139 \r
140         /**\r
141          * This test tests the loading of horizontally formatted Jronn output file\r
142          */\r
143         @Test\r
144         public void loadJronnFile() {\r
145 \r
146                 FileInputStream fio;\r
147                 try {\r
148                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "jronn.out");\r
149                         Map<String, Score> aseqs = SequenceUtil.readJRonn(fio);\r
150                         assertNotNull(aseqs);\r
151                         assertEquals(aseqs.size(), 3);\r
152                         Score aseq = aseqs.get("Foobar");\r
153                         assertNotNull(aseq);\r
154                         assertNotNull(aseq.getScores());\r
155                         // System.out.println(aseq);\r
156                         assertEquals(aseq.getScores().size(), aseq.getScores().size());\r
157                         fio.close();\r
158                 } catch (FileNotFoundException e) {\r
159                         e.printStackTrace();\r
160                         fail(e.getLocalizedMessage());\r
161                 } catch (IOException e) {\r
162                         e.printStackTrace();\r
163                         fail(e.getLocalizedMessage());\r
164                 } catch (UnknownFileFormatException e) {\r
165                         e.printStackTrace();\r
166                         fail(e.getLocalizedMessage());\r
167                 }\r
168 \r
169         }\r
170 \r
171         enum Trial {\r
172                 one, two, three\r
173         };\r
174 \r
175         /**\r
176          * This test tests the loading of horizontally formatted Jronn output file\r
177          * \r
178          * First seq\r
179          * \r
180          * M 0.86010 0.88512 0.37094\r
181          * \r
182          * T 0.79983 0.85864 0.44331\r
183          * \r
184          */\r
185         @SuppressWarnings("unchecked")\r
186         @Test\r
187         public void testReadDisemblResults() {\r
188 \r
189                 FileInputStream fio;\r
190                 try {\r
191                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH\r
192                                         + "disembl.out");\r
193                         Map<String, Set<Score>> aseqs = SequenceUtil.readDisembl(fio);\r
194                         assertNotNull(aseqs);\r
195                         assertEquals(aseqs.size(), 3);\r
196                         // System.out.println(aseqs);\r
197                         for (String fs : aseqs.keySet()) {\r
198                                 assertTrue(" Foobar_dundeefriends Foobar dundeefriends "\r
199                                                 .contains(fs));\r
200                                 Set<Score> scores = aseqs.get(fs);\r
201                                 assertEquals(scores.size(), 3);\r
202                         }\r
203                         fio.close();\r
204                 } catch (FileNotFoundException e) {\r
205                         e.printStackTrace();\r
206                         fail(e.getLocalizedMessage());\r
207                 } catch (IOException e) {\r
208                         e.printStackTrace();\r
209                         fail(e.getLocalizedMessage());\r
210                 } catch (UnknownFileFormatException e) {\r
211                         e.printStackTrace();\r
212                         fail(e.getLocalizedMessage());\r
213                 }\r
214         }\r
215 \r
216         /**\r
217          * This test tests the loading of horizontally formatted Jronn output file\r
218          * \r
219          * First sequence:\r
220          * \r
221          * >Foobar_dundeefriends\r
222          * \r
223          * # GlobDoms 2-358, 373-568\r
224          * \r
225          * # Disorder 1-5, 206-218, 243-250, 288-300, 313-324, 359-372, 475-481\r
226          * \r
227          * # RESIDUE DYDX RAW SMOOTHED\r
228          * \r
229          * M 0.0044 -0.2259 -0.2259\r
230          * \r
231          * T -0.1308 -0.2170 -0.2170\r
232          * \r
233          * ............\r
234          * \r
235          * > Second sequence\r
236          */\r
237         @SuppressWarnings("unchecked")\r
238         @Test\r
239         public void testReadGlobPlotResults() {\r
240 \r
241                 FileInputStream fio;\r
242                 try {\r
243                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH\r
244                                         + "globplot.out");\r
245                         HashMap<String, Set<Score>> aseqs = SequenceUtil.readGlobPlot(fio);\r
246                         assertNotNull(aseqs);\r
247                         assertEquals(aseqs.size(), 3);\r
248 \r
249                         String fsdf = null;\r
250                         Set<Score> scores = null;\r
251                         for (String fs : aseqs.keySet()) {\r
252                                 if ("Foobar_dundeefriends".contains(fs)) {\r
253                                         fsdf = fs;\r
254                                         scores = aseqs.get(fs);\r
255                                 }\r
256                                 assertEquals(scores.size(), 5);\r
257                         }\r
258                         for (Score score : scores) {\r
259 \r
260                                 if (score.getMethod()\r
261                                                 .equals(GlobProtResult.Disorder.toString())) {\r
262                                         assertEquals(score.getRanges().size(), 7);\r
263                                         assertTrue(score.getScores().isEmpty());\r
264                                 }\r
265                                 if (GlobProtResult.valueOf(score.getMethod()) == GlobProtResult.Dydx) {\r
266                                         assertFalse(score.getScores().isEmpty());\r
267                                         assertTrue(score.getRanges().isEmpty());\r
268                                 }\r
269                         }\r
270                         fio.close();\r
271                 } catch (FileNotFoundException e) {\r
272                         e.printStackTrace();\r
273                         fail(e.getLocalizedMessage());\r
274                 } catch (IOException e) {\r
275                         e.printStackTrace();\r
276                         fail(e.getLocalizedMessage());\r
277                 } catch (UnknownFileFormatException e) {\r
278                         e.printStackTrace();\r
279                         fail(e.getLocalizedMessage());\r
280                 }\r
281         }\r
282 \r
283         @Test\r
284         public void testReadAAConResults() {\r
285                 try {\r
286                         InputStream inStream = new FileInputStream(\r
287                                         AllTestSuit.TEST_DATA_PATH + "aacon_results.txt");\r
288                         HashSet<Score> result = SequenceUtil.readAAConResults(inStream);\r
289                         inStream.close();\r
290                         assertNotNull(result);\r
291                         assertEquals(result.size(), 18);\r
292 \r
293                         inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH\r
294                                         + "aacon_result_single.out");\r
295                         result = SequenceUtil.readAAConResults(inStream);\r
296                         inStream.close();\r
297                         assertNotNull(result);\r
298                         assertEquals(result.size(), 1);\r
299                         assertEquals(result.iterator().next().getScores().size(), 568);\r
300                 } catch (IOException e) {\r
301                         e.printStackTrace();\r
302                         fail(e.getMessage());\r
303                 }\r
304         }\r
305 }\r