Some lost files
[proteocache.git] / testsrc / compbio / data / sequence / SequenceUtilTester.java
1 /* Copyright (c) 2009 Peter Troshin\r
2  * Copyright (c) 2013 Alexander Sherstnev\r
3  * \r
4  *  JAva Bioinformatics Analysis Web Services (JABAWS) \r
5  *  @version: 2.5\r
6  * \r
7  * This library is free software; you can redistribute it and/or modify it under \r
8  * the terms of the Apache License version 2 as published\r
9  * by the Apache Software Foundation This library is distributed in the hope\r
10  * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied\r
11  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
12  * Apache License for more details. A copy of the license is in\r
13  * apache_license.txt. It is also available here:\r
14  * \r
15  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt \r
16  * \r
17  * Any republication or derived work distributed in source code form must include \r
18  * this copyright and license notice.\r
19  */\r
20 package compbio.data.sequence;\r
21 \r
22 import static org.testng.AssertJUnit.assertEquals;\r
23 import static org.testng.AssertJUnit.assertFalse;\r
24 import static org.testng.AssertJUnit.assertNotNull;\r
25 import static org.testng.AssertJUnit.assertTrue;\r
26 import static org.testng.AssertJUnit.fail;\r
27 \r
28 import java.io.File;\r
29 import java.io.FileInputStream;\r
30 import java.io.FileNotFoundException;\r
31 import java.io.FileOutputStream;\r
32 import java.io.IOException;\r
33 import java.io.InputStream;\r
34 import java.io.PrintWriter;\r
35 import java.util.HashMap;\r
36 import java.util.HashSet;\r
37 import java.util.List;\r
38 import java.util.Map;\r
39 import java.util.Set;\r
40 \r
41 import org.testng.annotations.Test;\r
42 \r
43 import compbio.metadata.AllTestSuit;\r
44 \r
45 public class SequenceUtilTester {\r
46 \r
47         @Test()\r
48         public void isNonAmbNucleotideSequence() {\r
49                 String dnaseq = "atgatTGACGCTGCTGatgtcgtgagtgga";\r
50                 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dnaseq));\r
51                 String dirtyDnaseq = "atgAGTggt\taGGTgc\ncgcACTgc gACtcgcGAt cgA ";\r
52                 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dirtyDnaseq));\r
53                 String nonDna = "atgfctgatgcatgcatgatgctga";\r
54                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
55 \r
56                 nonDna = "atgc1tgatgcatgcatgatgctga";\r
57                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
58 \r
59                 nonDna = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
60                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
61                 // String ambDna = "AGTCRYMKSWHBVDN"; // see IUPAC Nucleotide Code\r
62                 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));\r
63         }\r
64 \r
65         @Test()\r
66         public void CleanSequence() {\r
67                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
68                 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),SequenceUtil.cleanSequence(dirtySeq));\r
69         }\r
70 \r
71         @Test()\r
72         public void DeepCleanSequence() {\r
73                 String dirtySeq = "a!t?g.A;GTggt\ta12GGTgc\ncgc23AC\rTgc gAC<>.,?!|\\|/t@cg-c¬GA=_+(0){]}[:£$&^*\"t cgA ";\r
74                 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),SequenceUtil.deepCleanSequence(dirtySeq));\r
75         }\r
76 \r
77         @Test()\r
78         public void isProteinSequence() {\r
79                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
80                 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));\r
81                 String notaSeq = "atgc1tgatgcatgcatgatgctga";\r
82                 assertFalse(SequenceUtil.isProteinSequence(notaSeq));\r
83                 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
84                 assertTrue(SequenceUtil.isProteinSequence(AAseq));\r
85                 AAseq += "XU";\r
86                 assertFalse(SequenceUtil.isProteinSequence(AAseq));\r
87 \r
88         }\r
89 \r
90         @Test()\r
91         public void CleanProteinSequence() {\r
92                 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";\r
93                 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));\r
94                 // This will still be NON protein sequence despite having only correct\r
95                 // letters because the letters match perfectly the nucleotide sequence!\r
96                 assertFalse(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(dirtySeq)));\r
97 \r
98                 String notaSeq = "atgc1tgatgcatgcatgatgmctga";\r
99                 assertFalse(SequenceUtil.isProteinSequence(notaSeq));\r
100                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(notaSeq)));\r
101 \r
102                 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";\r
103                 assertTrue(SequenceUtil.isProteinSequence(AAseq));\r
104                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(AAseq)));\r
105                 AAseq += "XU";\r
106 \r
107                 assertFalse(SequenceUtil.isProteinSequence(AAseq));\r
108                 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(AAseq)));\r
109         }\r
110 \r
111         @Test()\r
112         public void ReadWriteFasta() {\r
113                 try {\r
114                         FileInputStream fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");\r
115                         assertNotNull(fio);\r
116                         List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);\r
117                         assertNotNull(fseqs);\r
118                         assertEquals(3, fseqs.size());\r
119                         assertEquals(3, fseqs.size());\r
120                         fio.close();\r
121                         FileOutputStream fou = new FileOutputStream(AllTestSuit.TEST_DATA_PATH + "TO1381.fasta.written");\r
122                         SequenceUtil.writeFasta(fou, fseqs);\r
123                         fou.close();\r
124                         FileOutputStream fou20 = new FileOutputStream(AllTestSuit.TEST_DATA_PATH + "TO1381.fasta20.written");\r
125                         SequenceUtil.writeFasta(fou20, fseqs, 21);\r
126                         fou20.close();\r
127                 } catch (FileNotFoundException e) {\r
128                         e.printStackTrace();\r
129                         fail(e.getLocalizedMessage());\r
130                 } catch (IOException e) {\r
131                         e.printStackTrace();\r
132                         fail(e.getLocalizedMessage());\r
133                 }\r
134         }\r
135 \r
136         // Potential Bug :- Sequence names are shortened to 2-3 letters\r
137         @Test\r
138         public void testReadFastaWriteClustal() {\r
139                 \r
140                 try {\r
141                         FileInputStream fio = new FileInputStream(\r
142                                         AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");\r
143                         assertNotNull(fio);\r
144                         List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);\r
145                         assertNotNull(fseqs);\r
146                         fio.close();\r
147                         \r
148                         char gapChar = '-';\r
149                         FileOutputStream fou = new FileOutputStream(\r
150                                         AllTestSuit.TEST_DATA_PATH + "TO1381.aln.written");\r
151                         SequenceUtil.writeClustal(fou, fseqs, gapChar);\r
152                         fou.close();\r
153                         \r
154                 } catch (FileNotFoundException e) {\r
155                         e.printStackTrace();\r
156                         fail(e.getLocalizedMessage());\r
157                 } catch (IOException e) {\r
158                         e.printStackTrace();\r
159                         fail(e.getLocalizedMessage());\r
160                 }\r
161         }\r
162 \r
163         /**\r
164          * This test tests the loading of horizontally formatted Jronn output file\r
165          */\r
166         @Test\r
167         public void LoadJronnFile() {\r
168 \r
169                 FileInputStream fio;\r
170                 try {\r
171                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "jronn.out");\r
172                         Map<String, Score> aseqs = SequenceUtil.readJRonn(fio);\r
173                         assertNotNull(aseqs);\r
174                         assertEquals(aseqs.size(), 3);\r
175                         Score aseq = aseqs.get("Foobar");\r
176                         assertNotNull(aseq);\r
177                         assertNotNull(aseq.getScores());\r
178                         assertEquals(aseq.getScores().size(), aseq.getScores().size());\r
179                         fio.close();\r
180                 } catch (FileNotFoundException e) {\r
181                         e.printStackTrace();\r
182                         fail(e.getLocalizedMessage());\r
183                 } catch (IOException e) {\r
184                         e.printStackTrace();\r
185                         fail(e.getLocalizedMessage());\r
186                 } catch (UnknownFileFormatException e) {\r
187                         e.printStackTrace();\r
188                         fail(e.getLocalizedMessage());\r
189                 }\r
190         }\r
191 \r
192         enum Trial {\r
193                 one, two, three\r
194         };\r
195 \r
196         /**\r
197          * This test tests the loading of horizontally formatted Jronn output file\r
198          * \r
199          * First seq\r
200          * \r
201          * M 0.86010 0.88512 0.37094\r
202          * \r
203          * T 0.79983 0.85864 0.44331\r
204          * \r
205          */\r
206         @SuppressWarnings("unchecked")\r
207         @Test\r
208         public void ReadDisemblResults() {\r
209                 Map<String, Map<String,Set<Range>>> _ranges=new HashMap<String, Map<String,Set<Range>>>();\r
210                 Map<String, Set<Range>> ranges=new HashMap<String,Set<Range>>();\r
211                 Map<String,Map<String, Float>>  _values=new HashMap<String, Map<String,Float>>();\r
212                 Map<String, Float> values = new HashMap<String, Float>();\r
213                 Set<Range> rset;\r
214                 rset = new HashSet<Range>();\r
215                 for (String[] se:new String[][] { { "34","41"},{"50","58"},{"83","91"},{"118","127"},{" 160","169"},{" 191","220"},{" 243","252"},{" 287","343"},{" 350","391"},{" 429","485"},{" 497","506"},{"539","547"}}) {\r
216                         rset.add(new Range(se));\r
217                 }\r
218                 ranges.put(DisemblResult.COILS.toString(), rset);\r
219                 values.put(DisemblResult.COILS.toString(), Float.valueOf(0.86010f));\r
220                 rset = new HashSet<Range>();\r
221                 for (String[] se:new String[][] { { "355","368"}}) {\r
222                         rset.add(new Range(se));\r
223                 }\r
224                 ranges.put(DisemblResult.REM465.toString(), rset);\r
225                 values.put(DisemblResult.REM465.toString(), Float.valueOf(0.88512f));\r
226                 rset = new HashSet<Range>();\r
227                 for (String[] se:new String[][] { { "190","204"}}) {\r
228                         rset.add(new Range(se));\r
229                 }\r
230                 ranges.put(DisemblResult.HOTLOOPS.toString(), rset);\r
231                 values.put(DisemblResult.HOTLOOPS.toString(), Float.valueOf(0.37094f));\r
232                 _ranges.put("Foobar_dundeefriends", ranges);\r
233                 _values.put("Foobar_dundeefriends", values);\r
234                 FileInputStream fio;\r
235                 try {\r
236                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "disembl.out");\r
237                         Map<String, Set<Score>> aseqs = SequenceUtil.readDisembl(fio);\r
238                         assertNotNull(aseqs);\r
239                         assertEquals(aseqs.size(), 3);\r
240                         ScoreManager sman = ScoreManager.newInstance(aseqs);\r
241 \r
242                         for (String fs : aseqs.keySet()) {\r
243                                 assertTrue(" Foobar_dundeefriends Foobar dundeefriends ".contains(fs));\r
244                                 Set<Score> scores = aseqs.get(fs);\r
245                                 assertEquals(scores.size(), 3);\r
246                                 for (Score sc:scores) {\r
247                                         if (_ranges.containsKey(fs)) {\r
248                                                 assertEquals("Checking range for Method "+sc.getMethod(),_ranges.get(fs).get(sc.getMethod()), sc.getRanges());\r
249                                                 assertEquals("Checking first value for Method "+sc.getMethod(), _values.get(fs).get(sc.getMethod()), sc.getScores().get(0));\r
250                                         }\r
251                                 }\r
252                         }\r
253                         fio.close();\r
254                 } catch (FileNotFoundException e) {\r
255                         e.printStackTrace();\r
256                         fail(e.getLocalizedMessage());\r
257                 } catch (IOException e) {\r
258                         e.printStackTrace();\r
259                         fail(e.getLocalizedMessage());\r
260                 } catch (UnknownFileFormatException e) {\r
261                         e.printStackTrace();\r
262                         fail(e.getLocalizedMessage());\r
263                 }\r
264         }\r
265 \r
266         /**\r
267          * This method tests the loading of horizontally formatted Jronn output file\r
268          * \r
269          * First sequence:\r
270          * \r
271          * >Foobar_dundeefriends\r
272          * \r
273          * # GlobDoms 2-358, 373-568\r
274          * \r
275          * # Disorder 1-5, 206-218, 243-250, 288-300, 313-324, 359-372, 475-481\r
276          * \r
277          * # RESIDUE DYDX RAW SMOOTHED\r
278          * \r
279          * M 0.0044 -0.2259 -0.2259\r
280          * \r
281          * T -0.1308 -0.2170 -0.2170\r
282          * \r
283          * ............\r
284          * \r
285          * > Second sequence\r
286          */\r
287         @SuppressWarnings("unchecked")\r
288         @Test\r
289         public void ReadGlobPlotResults() {\r
290 \r
291                 FileInputStream fio;\r
292                 try {\r
293                         fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "globplot.out");\r
294                         HashMap<String, Set<Score>> aseqs = SequenceUtil.readGlobPlot(fio);\r
295                         assertNotNull(aseqs);\r
296                         assertEquals(aseqs.size(), 3);\r
297 \r
298                         String fsdf = null;\r
299                         Set<Score> scores = null;\r
300                         for (String fs : aseqs.keySet()) {\r
301                                 if ("Foobar_dundeefriends".contains(fs)) {\r
302                                         fsdf = fs;\r
303                                         scores = aseqs.get(fs);\r
304                                 }\r
305                                 assertEquals(scores.size(), 5);\r
306                         }\r
307 \r
308                         ScoreManager sm = ScoreManager.newInstanceSingleSequence(scores);\r
309                         sm.writeOut(new PrintWriter(System.out, true));\r
310 \r
311                         for (Score score : scores) {\r
312                                 if (score.getMethod().equals(GlobProtResult.Disorder.toString())) {\r
313                                         assertEquals(score.getRanges().size(), 7);\r
314                                         assertTrue(score.getScores().isEmpty());\r
315                                 }\r
316                                 if (GlobProtResult.valueOf(score.getMethod()) == GlobProtResult.Dydx) {\r
317                                         assertFalse(score.getScores().isEmpty());\r
318                                         assertTrue(score.getRanges().isEmpty());\r
319                                 }\r
320                         }\r
321                         fio.close();\r
322                 } catch (FileNotFoundException e) {\r
323                         e.printStackTrace();\r
324                         fail(e.getLocalizedMessage());\r
325                 } catch (IOException e) {\r
326                         e.printStackTrace();\r
327                         fail(e.getLocalizedMessage());\r
328                 } catch (UnknownFileFormatException e) {\r
329                         e.printStackTrace();\r
330                         fail(e.getLocalizedMessage());\r
331                 }\r
332         }\r
333 \r
334         @Test\r
335         public void ReadIUPredForShortAndLongDisorder() {\r
336                 try {\r
337                         Map<String, Score> scores = SequenceUtil.readIUPred(new File(AllTestSuit.TEST_DATA_PATH, "out.long"));\r
338                         ScoreManager man = ScoreManager.newInstanceSingleScore(scores);\r
339                         assertNotNull(scores);\r
340                         assertEquals(3, scores.size());\r
341 \r
342                         Score score = scores.get("Foobar_dundeefriends");\r
343                         assertNotNull(score);\r
344                         assertEquals(0, score.getRanges().size());\r
345                         assertEquals(568, score.getScores().size());\r
346                         assertEquals("Long", score.getMethod());\r
347 \r
348                         score = scores.get("Foobar");\r
349                         assertNotNull(score);\r
350                         assertEquals(0, score.getRanges().size());\r
351                         assertEquals(481, score.getScores().size());\r
352                         assertEquals("Long", score.getMethod());\r
353 \r
354                         score = scores.get("dundeefriends");\r
355                         assertNotNull(score);\r
356                         assertEquals(0, score.getRanges().size());\r
357                         assertEquals(513, score.getScores().size());\r
358                         assertEquals("Long", score.getMethod());\r
359                 } catch (IOException e) {\r
360                         e.printStackTrace();\r
361                         fail(e.getLocalizedMessage());\r
362                 } catch (UnknownFileFormatException e) {\r
363                         e.printStackTrace();\r
364                         fail(e.getLocalizedMessage());\r
365                 }\r
366         }\r
367 \r
368         @Test\r
369         public void ReadIUPredForGlobDomain() {\r
370                 try {\r
371                         Map<String, Score> scores = SequenceUtil.readIUPred(new File(AllTestSuit.TEST_DATA_PATH, "output.glob"));\r
372                         assertNotNull(scores);\r
373                         assertEquals(2, scores.size());\r
374                         ScoreManager man = ScoreManager.newInstanceSingleScore(scores);\r
375                         assertEquals(2, man.getNumberOfSeq());\r
376                         Score score = scores.get("P53_HUMA");\r
377                         assertNotNull(score);\r
378                         assertEquals(2, score.getRanges().size());\r
379                         assertEquals(0, score.getScores().size());\r
380                         assertEquals("Glob", score.getMethod());\r
381                         score = scores.get("Foobar_dundeefriends");\r
382                         assertEquals(0, score.getRanges().size());\r
383                 } catch (IOException e) {\r
384                         e.printStackTrace();\r
385                         fail(e.getLocalizedMessage());\r
386                 } catch (UnknownFileFormatException e) {\r
387                         e.printStackTrace();\r
388                         fail(e.getLocalizedMessage());\r
389                 }\r
390         }\r
391 \r
392         @Test\r
393         public void ReadAAConResults() {\r
394                 try {\r
395                         InputStream inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "aacon_results.txt");\r
396                         HashSet<Score> result = SequenceUtil.readAAConResults(inStream);\r
397                         inStream.close();\r
398                         assertNotNull(result);\r
399                         assertEquals(result.size(), 18);\r
400 \r
401                         inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "aacon_result_single.out");\r
402                         result = SequenceUtil.readAAConResults(inStream);\r
403                         inStream.close();\r
404                         assertNotNull(result);\r
405                         assertEquals(result.size(), 1);\r
406                         assertEquals(result.iterator().next().getScores().size(), 568);\r
407                 } catch (IOException e) {\r
408                         e.printStackTrace();\r
409                         fail(e.getMessage());\r
410                 }\r
411         }\r
412         @Test\r
413         public void ReadJpredResults() {\r
414                 try {\r
415                         InputStream inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "Jpred.test1.out");\r
416                         List<FastaSequence> result = SequenceUtil.readJpredFile(inStream);\r
417                         inStream.close();\r
418                         assertNotNull(result);\r
419                         assertEquals(result.size(), 19);\r
420                 } catch (IOException e) {\r
421                         e.printStackTrace();\r
422                         fail(e.getMessage());\r
423                 }\r
424         }\r
425 }\r
426 \r