Simple datamodel for RNAalifold.exe output and method to convert from
[jabaws.git] / testsrc / compbio / data / sequence / SequenceUtilTester.java
1 /*\r
2  * Copyright (c) 2009 Peter Troshin JAva Bioinformatics Analysis Web Services\r
3  * (JABAWS) @version: 1.0 This library is free software; you can redistribute it\r
4  * and/or modify it under the terms of the Apache License version 2 as published\r
5  * by the Apache Software Foundation This library is distributed in the hope\r
6  * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied\r
7  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
8  * Apache License for more details. A copy of the license is in\r
9  * apache_license.txt. It is also available here:\r
10  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or\r
11  * derived work distributed in source code form must include this copyright and\r
12  * license notice.\r
13  */\r
14 package compbio.data.sequence;\r
15 \r
16 import static org.testng.AssertJUnit.assertEquals;\r
17 import static org.testng.AssertJUnit.assertFalse;\r
18 import static org.testng.AssertJUnit.assertNotNull;\r
19 import static org.testng.AssertJUnit.assertTrue;\r
20 import static org.testng.AssertJUnit.fail;\r
21 \r
22 import java.io.File;\r
23 import java.io.FileInputStream;\r
24 import java.io.FileNotFoundException;\r
25 import java.io.FileOutputStream;\r
26 import java.io.IOException;\r
27 import java.io.InputStream;\r
28 import java.io.PrintWriter;\r
29 import java.util.HashMap;\r
30 import java.util.HashSet;\r
31 import java.util.List;\r
32 import java.util.Map;\r
33 import java.util.Set;\r
34 \r
35 import org.testng.annotations.Test;\r
36 \r
37 import compbio.metadata.AllTestSuit;\r
38 import compbio.runner.disorder.Disembl;\r
39 \r
40 public class SequenceUtilTester {\r
41 \r
42         @Test()\r
43         public void testisNonAmbNucleotideSequence() {\r
44                 String dnaseq = "atgatTGACGCTGCTGatgtcgtgagtgga";\r
45                 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dnaseq));\r
46                 String dirtyDnaseq = "atgAGTggt\taGGTgc\ncgcACTgc gACtcgcGAt cgA ";\r
47                 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dirtyDnaseq));\r
48                 String nonDna = "atgfctgatgcatgcatgatgctga";\r
49                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
50 \r
51                 nonDna = "atgc1tgatgcatgcatgatgctga";\r
52                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
53 \r
54                 nonDna = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
55                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
56                 // String ambDna = "AGTCRYMKSWHBVDN"; // see IUPAC Nucleotide Code\r
57                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
58 \r
59         }\r
60 \r
61         @Test()\r
62         public void testCleanSequence() {\r
63                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
64                 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),\r
65                                 SequenceUtil.cleanSequence(dirtySeq));\r
66         }\r
67 \r
68         @Test()\r
69         public void testDeepCleanSequence() {\r
70                 String dirtySeq = "a!t?g.A;GTggt\ta12GGTgc\ncgc23AC\rTgc gAC<>.,?!|\\|/t@cg-c¬GA=_+(0){]}[:£$&^*\"t cgA ";\r
71                 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),\r
72                                 SequenceUtil.deepCleanSequence(dirtySeq));\r
73         }\r
74 \r
75         @Test()\r
76         public void testisProteinSequence() {\r
77                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
78                 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));\r
79                 String notaSeq = "atgc1tgatgcatgcatgatgctga";\r
80                 assertFalse(SequenceUtil.isProteinSequence(notaSeq));\r
81                 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
82                 assertTrue(SequenceUtil.isProteinSequence(AAseq));\r
83                 AAseq += "XU";\r
84                 assertFalse(SequenceUtil.isProteinSequence(AAseq));\r
85 \r
86         }\r
87 \r
88         @Test()\r
89         public void testCleanProteinSequence() {\r
90                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
91                 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));\r
92                 // This will still be NON protein sequence despite having only correct\r
93                 // letters because the letters match perfectly the nucleotide sequence!\r
94                 assertFalse(SequenceUtil.isProteinSequence(SequenceUtil\r
95                                 .cleanProteinSequence(dirtySeq)));\r
96 \r
97                 String notaSeq = "atgc1tgatgcatgcatgatgmctga";\r
98                 assertFalse(SequenceUtil.isProteinSequence(notaSeq));\r
99                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil\r
100                                 .cleanProteinSequence(notaSeq)));\r
101 \r
102                 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
103                 assertTrue(SequenceUtil.isProteinSequence(AAseq));\r
104                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil\r
105                                 .cleanProteinSequence(AAseq)));\r
106                 AAseq += "XU";\r
107 \r
108                 assertFalse(SequenceUtil.isProteinSequence(AAseq));\r
109                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil\r
110                                 .cleanProteinSequence(AAseq)));\r
111         }\r
112 \r
113         @Test()\r
114         public void testReadWriteFasta() {\r
115 \r
116                 try {\r
117                         FileInputStream fio = new FileInputStream(\r
118                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");\r
119                         assertNotNull(fio);\r
120                         List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);\r
121                         assertNotNull(fseqs);\r
122                         assertEquals(3, fseqs.size());\r
123                         assertEquals(3, fseqs.size());\r
124                         fio.close();\r
125                         FileOutputStream fou = new FileOutputStream(\r
126                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta.written");\r
127                         SequenceUtil.writeFasta(fou, fseqs);\r
128                         fou.close();\r
129                         FileOutputStream fou20 = new FileOutputStream(\r
130                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta20.written");\r
131                         SequenceUtil.writeFasta(fou20, fseqs, 21);\r
132                         fou20.close();\r
133 \r
134                 } catch (FileNotFoundException e) {\r
135                         e.printStackTrace();\r
136                         fail(e.getLocalizedMessage());\r
137                 } catch (IOException e) {\r
138                         e.printStackTrace();\r
139                         fail(e.getLocalizedMessage());\r
140                 }\r
141         }\r
142 \r
143         // Potential Bug :- Sequence names are shortened to 2-3 letters\r
144         @Test\r
145         public void testReadFastaWriteClustal() {\r
146                 \r
147                 try {\r
148                         FileInputStream fio = new FileInputStream(\r
149                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");\r
150                         assertNotNull(fio);\r
151                         List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);\r
152                         assertNotNull(fseqs);\r
153                         fio.close();\r
154                         \r
155                         char gapChar = '-';\r
156                         FileOutputStream fou = new FileOutputStream(\r
157                                         AllTestSuit.TEST_DATA_PATH + "TO1381.aln.written");\r
158                         SequenceUtil.writeClustal(fou, fseqs, gapChar);\r
159                         fou.close();\r
160                         \r
161                 } catch (FileNotFoundException e) {\r
162                         e.printStackTrace();\r
163                         fail(e.getLocalizedMessage());\r
164                 } catch (IOException e) {\r
165                         e.printStackTrace();\r
166                         fail(e.getLocalizedMessage());\r
167                 }\r
168         }               \r
169                 \r
170                         \r
171 \r
172                         \r
173         \r
174         /**\r
175          * This test tests the loading of horizontally formatted Jronn output file\r
176          */\r
177         @Test\r
178         public void loadJronnFile() {\r
179 \r
180                 FileInputStream fio;\r
181                 try {\r
182                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "jronn.out");\r
183                         Map<String, Score> aseqs = SequenceUtil.readJRonn(fio);\r
184                         assertNotNull(aseqs);\r
185                         assertEquals(aseqs.size(), 3);\r
186                         Score aseq = aseqs.get("Foobar");\r
187                         assertNotNull(aseq);\r
188                         assertNotNull(aseq.getScores());\r
189                         // System.out.println(aseq);\r
190                         assertEquals(aseq.getScores().size(), aseq.getScores().size());\r
191                         fio.close();\r
192                 } catch (FileNotFoundException e) {\r
193                         e.printStackTrace();\r
194                         fail(e.getLocalizedMessage());\r
195                 } catch (IOException e) {\r
196                         e.printStackTrace();\r
197                         fail(e.getLocalizedMessage());\r
198                 } catch (UnknownFileFormatException e) {\r
199                         e.printStackTrace();\r
200                         fail(e.getLocalizedMessage());\r
201                 }\r
202 \r
203         }\r
204 \r
205         enum Trial {\r
206                 one, two, three\r
207         };\r
208 \r
209         /**\r
210          * This test tests the loading of horizontally formatted Jronn output file\r
211          * \r
212          * First seq\r
213          * \r
214          * M 0.86010 0.88512 0.37094\r
215          * \r
216          * T 0.79983 0.85864 0.44331\r
217          * \r
218          */\r
219         @SuppressWarnings("unchecked")\r
220         @Test\r
221         public void testReadDisemblResults() {\r
222 \r
223 \r
224                 Map<String, Map<String,Set<Range>>> _ranges=new HashMap<String, Map<String,Set<Range>>>();\r
225                 Map<String, Set<Range>> ranges=new HashMap<String,Set<Range>>();\r
226                 Map<String,Map<String, Float>>  _values=new HashMap<String, Map<String,Float>>();\r
227                 Map<String, Float> values = new HashMap<String, Float>();\r
228                 Set<Range> rset;\r
229                 rset = new HashSet<Range>();\r
230                 for (String[] se:new String[][] { { "34","41"},{"50","58"},{"83","91"},{"118","127"},{" 160","169"},{" 191","220"},{" 243","252"},{" 287","343"},{" 350","391"},{" 429","485"},{" 497","506"},{"539","547"}})\r
231                 {\r
232                         rset.add(new Range(se));\r
233                 }\r
234                 ranges.put(DisemblResult.COILS.toString(), rset);\r
235                 values.put(DisemblResult.COILS.toString(), Float.valueOf(0.86010f));\r
236                 rset = new HashSet<Range>();\r
237                 for (String[] se:new String[][] { { "355","368"}})\r
238                 {\r
239                         rset.add(new Range(se));\r
240                 }\r
241                 ranges.put(DisemblResult.REM465.toString(), rset);\r
242                 values.put(DisemblResult.REM465.toString(), Float.valueOf(0.88512f));\r
243                 rset = new HashSet<Range>();\r
244                 for (String[] se:new String[][] { { "190","204"}})\r
245                 {\r
246                         rset.add(new Range(se));\r
247                 }\r
248                 ranges.put(DisemblResult.HOTLOOPS.toString(), rset);\r
249                 values.put(DisemblResult.HOTLOOPS.toString(), Float.valueOf(0.37094f));\r
250                 _ranges.put("Foobar_dundeefriends", ranges);\r
251                 _values.put("Foobar_dundeefriends", values);\r
252                 FileInputStream fio;\r
253                 try {\r
254                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH\r
255                                         + "disembl.out");\r
256                         Map<String, Set<Score>> aseqs = SequenceUtil.readDisembl(fio);\r
257                         assertNotNull(aseqs);\r
258                         assertEquals(aseqs.size(), 3);\r
259                         ScoreManager sman = ScoreManager.newInstance(aseqs);\r
260 \r
261                         for (String fs : aseqs.keySet()) {\r
262                                 assertTrue(" Foobar_dundeefriends Foobar dundeefriends "\r
263                                                 .contains(fs));\r
264                                 Set<Score> scores = aseqs.get(fs);\r
265                                 assertEquals(scores.size(), 3);\r
266                                 for (Score sc:scores) {\r
267                                         if (_ranges.containsKey(fs))\r
268                                         {\r
269                                         assertEquals("Checking range for Method "+sc.getMethod(),_ranges.get(fs).get(sc.getMethod()), sc.getRanges());\r
270                                         assertEquals("Checking first value for Method "+sc.getMethod(), _values.get(fs).get(sc.getMethod()), sc.getScores().get(0));\r
271                                         }\r
272                                 }\r
273                         }\r
274                         fio.close();\r
275                 } catch (FileNotFoundException e) {\r
276                         e.printStackTrace();\r
277                         fail(e.getLocalizedMessage());\r
278                 } catch (IOException e) {\r
279                         e.printStackTrace();\r
280                         fail(e.getLocalizedMessage());\r
281                 } catch (UnknownFileFormatException e) {\r
282                         e.printStackTrace();\r
283                         fail(e.getLocalizedMessage());\r
284                 }\r
285         }\r
286         /**\r
287          * This test tests the loading of horizontally formatted Jronn output file\r
288          * \r
289          * First sequence:\r
290          * \r
291          * >Foobar_dundeefriends\r
292          * \r
293          * # GlobDoms 2-358, 373-568\r
294          * \r
295          * # Disorder 1-5, 206-218, 243-250, 288-300, 313-324, 359-372, 475-481\r
296          * \r
297          * # RESIDUE DYDX RAW SMOOTHED\r
298          * \r
299          * M 0.0044 -0.2259 -0.2259\r
300          * \r
301          * T -0.1308 -0.2170 -0.2170\r
302          * \r
303          * ............\r
304          * \r
305          * > Second sequence\r
306          */\r
307         @SuppressWarnings("unchecked")\r
308         @Test\r
309         public void testReadGlobPlotResults() {\r
310 \r
311                 FileInputStream fio;\r
312                 try {\r
313                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH\r
314                                         + "globplot.out");\r
315                         HashMap<String, Set<Score>> aseqs = SequenceUtil.readGlobPlot(fio);\r
316                         assertNotNull(aseqs);\r
317                         assertEquals(aseqs.size(), 3);\r
318 \r
319                         String fsdf = null;\r
320                         Set<Score> scores = null;\r
321                         for (String fs : aseqs.keySet()) {\r
322                                 if ("Foobar_dundeefriends".contains(fs)) {\r
323                                         fsdf = fs;\r
324                                         scores = aseqs.get(fs);\r
325                                 }\r
326                                 assertEquals(scores.size(), 5);\r
327                         }\r
328 \r
329                         ScoreManager sm = ScoreManager.newInstanceSingleSequence(scores);\r
330                         sm.writeOut(new PrintWriter(System.out, true));\r
331 \r
332                         for (Score score : scores) {\r
333 \r
334                                 if (score.getMethod()\r
335                                                 .equals(GlobProtResult.Disorder.toString())) {\r
336                                         assertEquals(score.getRanges().size(), 7);\r
337                                         assertTrue(score.getScores().isEmpty());\r
338                                 }\r
339                                 if (GlobProtResult.valueOf(score.getMethod()) == GlobProtResult.Dydx) {\r
340                                         assertFalse(score.getScores().isEmpty());\r
341                                         assertTrue(score.getRanges().isEmpty());\r
342                                 }\r
343                         }\r
344                         fio.close();\r
345                 } catch (FileNotFoundException e) {\r
346                         e.printStackTrace();\r
347                         fail(e.getLocalizedMessage());\r
348                 } catch (IOException e) {\r
349                         e.printStackTrace();\r
350                         fail(e.getLocalizedMessage());\r
351                 } catch (UnknownFileFormatException e) {\r
352                         e.printStackTrace();\r
353                         fail(e.getLocalizedMessage());\r
354                 }\r
355         }\r
356 \r
357         @Test\r
358         public void testReadIUPredForShortAndLongDisorder() {\r
359                 try {\r
360                         Map<String, Score> scores = SequenceUtil.readIUPred(new File(\r
361                                         AllTestSuit.TEST_DATA_PATH, "out.long"));\r
362                         ScoreManager man = ScoreManager.newInstanceSingleScore(scores);\r
363                         // man.writeOut(new PrintWriter(System.out, true));\r
364                         assertNotNull(scores);\r
365                         assertEquals(3, scores.size());\r
366 \r
367                         Score score = scores.get("Foobar_dundeefriends");\r
368                         assertNotNull(score);\r
369                         assertEquals(0, score.getRanges().size());\r
370                         assertEquals(568, score.getScores().size());\r
371                         assertEquals("Long", score.getMethod());\r
372 \r
373                         score = scores.get("Foobar");\r
374                         assertNotNull(score);\r
375                         assertEquals(0, score.getRanges().size());\r
376                         assertEquals(481, score.getScores().size());\r
377                         assertEquals("Long", score.getMethod());\r
378 \r
379                         score = scores.get("dundeefriends");\r
380                         assertNotNull(score);\r
381                         assertEquals(0, score.getRanges().size());\r
382                         assertEquals(513, score.getScores().size());\r
383                         assertEquals("Long", score.getMethod());\r
384 \r
385                 } catch (IOException e) {\r
386                         e.printStackTrace();\r
387                         fail(e.getLocalizedMessage());\r
388                 } catch (UnknownFileFormatException e) {\r
389                         e.printStackTrace();\r
390                         fail(e.getLocalizedMessage());\r
391                 }\r
392         }\r
393 \r
394         @Test\r
395         public void testReadIUPredForGlobDomain() {\r
396                 try {\r
397                         Map<String, Score> scores = SequenceUtil.readIUPred(new File(\r
398                                         AllTestSuit.TEST_DATA_PATH, "output.glob"));\r
399                         assertNotNull(scores);\r
400                         assertEquals(2, scores.size());\r
401                         ScoreManager man = ScoreManager.newInstanceSingleScore(scores);\r
402                         // man.writeOut(new PrintWriter(System.out, true));\r
403                         assertEquals(2, man.getNumberOfSeq());\r
404                         Score score = scores.get("P53_HUMA");\r
405                         assertNotNull(score);\r
406                         assertEquals(2, score.getRanges().size());\r
407                         assertEquals(0, score.getScores().size());\r
408                         assertEquals("Glob", score.getMethod());\r
409 \r
410                         score = scores.get("Foobar_dundeefriends");\r
411                         assertEquals(0, score.getRanges().size());\r
412                 } catch (IOException e) {\r
413                         e.printStackTrace();\r
414                         fail(e.getLocalizedMessage());\r
415                 } catch (UnknownFileFormatException e) {\r
416                         e.printStackTrace();\r
417                         fail(e.getLocalizedMessage());\r
418                 }\r
419         }\r
420         @Test\r
421         public void testReadAAConResults() {\r
422                 try {\r
423                         InputStream inStream = new FileInputStream(\r
424                                         AllTestSuit.TEST_DATA_PATH + "aacon_results.txt");\r
425                         HashSet<Score> result = SequenceUtil.readAAConResults(inStream);\r
426                         inStream.close();\r
427                         assertNotNull(result);\r
428                         assertEquals(result.size(), 18);\r
429 \r
430                         inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH\r
431                                         + "aacon_result_single.out");\r
432                         result = SequenceUtil.readAAConResults(inStream);\r
433                         inStream.close();\r
434                         assertNotNull(result);\r
435                         assertEquals(result.size(), 1);\r
436                         assertEquals(result.iterator().next().getScores().size(), 568);\r
437                 } catch (IOException e) {\r
438                         e.printStackTrace();\r
439                         fail(e.getMessage());\r
440                 }\r
441         }\r
442 }\r
443 \r