/* Copyright (c) 2009 Peter Troshin
 * Copyright (c) 2013 Alexander Sherstnev
 *
 * JAva Bioinformatics Analysis Web Services (JABAWS)
 *
 * This library is free software; you can redistribute it and/or modify it under
 * the terms of the Apache License version 2 as published
 * by the Apache Software Foundation. This library is distributed in the hope
 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Apache License for more details. A copy of the license is in
 * apache_license.txt. It is also available here:
 *
 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
 *
 * Any republication or derived work distributed in source code form must include
 * this copyright and license notice.
 */
\r
20 package compbio.data.sequence;
\r
22 import static org.testng.AssertJUnit.assertEquals;
\r
23 import static org.testng.AssertJUnit.assertFalse;
\r
24 import static org.testng.AssertJUnit.assertNotNull;
\r
25 import static org.testng.AssertJUnit.assertTrue;
\r
26 import static org.testng.AssertJUnit.fail;
\r
28 import java.io.File;
\r
29 import java.io.FileInputStream;
\r
30 import java.io.FileNotFoundException;
\r
31 import java.io.FileOutputStream;
\r
32 import java.io.IOException;
\r
33 import java.io.InputStream;
\r
34 import java.io.PrintWriter;
\r
35 import java.util.HashMap;
\r
36 import java.util.HashSet;
\r
37 import java.util.List;
\r
38 import java.util.Map;
\r
39 import java.util.Set;
\r
41 import org.testng.annotations.Test;
\r
43 import compbio.metadata.AllTestSuit;
\r
45 public class SequenceUtilTester {
\r
48 public void isNonAmbNucleotideSequence() {
\r
49 String dnaseq = "atgatTGACGCTGCTGatgtcgtgagtgga";
\r
50 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dnaseq));
\r
51 String dirtyDnaseq = "atgAGTggt\taGGTgc\ncgcACTgc gACtcgcGAt cgA ";
\r
52 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dirtyDnaseq));
\r
53 String nonDna = "atgfctgatgcatgcatgatgctga";
\r
54 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
56 nonDna = "atgc1tgatgcatgcatgatgctga";
\r
57 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
59 nonDna = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";
\r
60 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
61 // String ambDna = "AGTCRYMKSWHBVDN"; // see IUPAC Nucleotide Code
\r
62 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
66 public void CleanSequence() {
\r
67 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";
\r
68 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),SequenceUtil.cleanSequence(dirtySeq));
\r
72 public void DeepCleanSequence() {
\r
73 String dirtySeq = "a!t?g.A;GTggt\ta12GGTgc\ncgc23AC\rTgc gAC<>.,?!|\\|/t@cg-c¬GA=_+(0){]}[:£$&^*\"t cgA ";
\r
74 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),SequenceUtil.deepCleanSequence(dirtySeq));
\r
78 public void isProteinSequence() {
\r
79 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";
\r
80 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));
\r
81 String notaSeq = "atgc1tgatgcatgcatgatgctga";
\r
82 assertFalse(SequenceUtil.isProteinSequence(notaSeq));
\r
83 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";
\r
84 assertTrue(SequenceUtil.isProteinSequence(AAseq));
\r
86 assertFalse(SequenceUtil.isProteinSequence(AAseq));
\r
91 public void CleanProteinSequence() {
\r
92 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";
\r
93 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));
\r
94 // This will still be NON protein sequence despite having only correct
\r
95 // letters because the letters match perfectly the nucleotide sequence!
\r
96 assertFalse(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(dirtySeq)));
\r
98 String notaSeq = "atgc1tgatgcatgcatgatgmctga";
\r
99 assertFalse(SequenceUtil.isProteinSequence(notaSeq));
\r
100 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(notaSeq)));
\r
102 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";
\r
103 assertTrue(SequenceUtil.isProteinSequence(AAseq));
\r
104 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(AAseq)));
\r
107 assertFalse(SequenceUtil.isProteinSequence(AAseq));
\r
108 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil.cleanProteinSequence(AAseq)));
\r
112 public void ReadWriteFasta() {
\r
114 FileInputStream fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");
\r
115 assertNotNull(fio);
\r
116 List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);
\r
117 assertNotNull(fseqs);
\r
118 assertEquals(3, fseqs.size());
\r
119 assertEquals(3, fseqs.size());
\r
121 FileOutputStream fou = new FileOutputStream(AllTestSuit.TEST_DATA_PATH + "TO1381.fasta.written");
\r
122 SequenceUtil.writeFasta(fou, fseqs);
\r
124 FileOutputStream fou20 = new FileOutputStream(AllTestSuit.TEST_DATA_PATH + "TO1381.fasta20.written");
\r
125 SequenceUtil.writeFasta(fou20, fseqs, 21);
\r
127 } catch (FileNotFoundException e) {
\r
128 e.printStackTrace();
\r
129 fail(e.getLocalizedMessage());
\r
130 } catch (IOException e) {
\r
131 e.printStackTrace();
\r
132 fail(e.getLocalizedMessage());
\r
136 // Potential Bug :- Sequence names are shortened to 2-3 letters
\r
138 public void testReadFastaWriteClustal() {
\r
141 FileInputStream fio = new FileInputStream(
\r
142 AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");
\r
143 assertNotNull(fio);
\r
144 List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);
\r
145 assertNotNull(fseqs);
\r
148 char gapChar = '-';
\r
149 FileOutputStream fou = new FileOutputStream(
\r
150 AllTestSuit.TEST_DATA_PATH + "TO1381.aln.written");
\r
151 SequenceUtil.writeClustal(fou, fseqs, gapChar);
\r
154 } catch (FileNotFoundException e) {
\r
155 e.printStackTrace();
\r
156 fail(e.getLocalizedMessage());
\r
157 } catch (IOException e) {
\r
158 e.printStackTrace();
\r
159 fail(e.getLocalizedMessage());
\r
164 * This test tests the loading of horizontally formatted Jronn output file
\r
167 public void LoadJronnFile() {
\r
169 FileInputStream fio;
\r
171 fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "jronn.out");
\r
172 Map<String, Score> aseqs = SequenceUtil.readJRonn(fio);
\r
173 assertNotNull(aseqs);
\r
174 assertEquals(aseqs.size(), 3);
\r
175 Score aseq = aseqs.get("Foobar");
\r
176 assertNotNull(aseq);
\r
177 assertNotNull(aseq.getScores());
\r
178 assertEquals(aseq.getScores().size(), aseq.getScores().size());
\r
180 } catch (FileNotFoundException e) {
\r
181 e.printStackTrace();
\r
182 fail(e.getLocalizedMessage());
\r
183 } catch (IOException e) {
\r
184 e.printStackTrace();
\r
185 fail(e.getLocalizedMessage());
\r
186 } catch (UnknownFileFormatException e) {
\r
187 e.printStackTrace();
\r
188 fail(e.getLocalizedMessage());
\r
197 * This test tests the loading of horizontally formatted Jronn output file
\r
201 * M 0.86010 0.88512 0.37094
\r
203 * T 0.79983 0.85864 0.44331
\r
206 @SuppressWarnings("unchecked")
\r
208 public void ReadDisemblResults() {
\r
209 Map<String, Map<String,Set<Range>>> _ranges=new HashMap<String, Map<String,Set<Range>>>();
\r
210 Map<String, Set<Range>> ranges=new HashMap<String,Set<Range>>();
\r
211 Map<String,Map<String, Float>> _values=new HashMap<String, Map<String,Float>>();
\r
212 Map<String, Float> values = new HashMap<String, Float>();
\r
214 rset = new HashSet<Range>();
\r
215 for (String[] se:new String[][] { { "34","41"},{"50","58"},{"83","91"},{"118","127"},{" 160","169"},{" 191","220"},{" 243","252"},{" 287","343"},{" 350","391"},{" 429","485"},{" 497","506"},{"539","547"}}) {
\r
216 rset.add(new Range(se));
\r
218 ranges.put(DisemblResult.COILS.toString(), rset);
\r
219 values.put(DisemblResult.COILS.toString(), Float.valueOf(0.86010f));
\r
220 rset = new HashSet<Range>();
\r
221 for (String[] se:new String[][] { { "355","368"}}) {
\r
222 rset.add(new Range(se));
\r
224 ranges.put(DisemblResult.REM465.toString(), rset);
\r
225 values.put(DisemblResult.REM465.toString(), Float.valueOf(0.88512f));
\r
226 rset = new HashSet<Range>();
\r
227 for (String[] se:new String[][] { { "190","204"}}) {
\r
228 rset.add(new Range(se));
\r
230 ranges.put(DisemblResult.HOTLOOPS.toString(), rset);
\r
231 values.put(DisemblResult.HOTLOOPS.toString(), Float.valueOf(0.37094f));
\r
232 _ranges.put("Foobar_dundeefriends", ranges);
\r
233 _values.put("Foobar_dundeefriends", values);
\r
234 FileInputStream fio;
\r
236 fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "disembl.out");
\r
237 Map<String, Set<Score>> aseqs = SequenceUtil.readDisembl(fio);
\r
238 assertNotNull(aseqs);
\r
239 assertEquals(aseqs.size(), 3);
\r
240 ScoreManager sman = ScoreManager.newInstance(aseqs);
\r
242 for (String fs : aseqs.keySet()) {
\r
243 assertTrue(" Foobar_dundeefriends Foobar dundeefriends ".contains(fs));
\r
244 Set<Score> scores = aseqs.get(fs);
\r
245 assertEquals(scores.size(), 3);
\r
246 for (Score sc:scores) {
\r
247 if (_ranges.containsKey(fs)) {
\r
248 assertEquals("Checking range for Method "+sc.getMethod(),_ranges.get(fs).get(sc.getMethod()), sc.getRanges());
\r
249 assertEquals("Checking first value for Method "+sc.getMethod(), _values.get(fs).get(sc.getMethod()), sc.getScores().get(0));
\r
254 } catch (FileNotFoundException e) {
\r
255 e.printStackTrace();
\r
256 fail(e.getLocalizedMessage());
\r
257 } catch (IOException e) {
\r
258 e.printStackTrace();
\r
259 fail(e.getLocalizedMessage());
\r
260 } catch (UnknownFileFormatException e) {
\r
261 e.printStackTrace();
\r
262 fail(e.getLocalizedMessage());
\r
267 * This method tests the loading of horizontally formatted Jronn output file
\r
271 * >Foobar_dundeefriends
\r
273 * # GlobDoms 2-358, 373-568
\r
275 * # Disorder 1-5, 206-218, 243-250, 288-300, 313-324, 359-372, 475-481
\r
277 * # RESIDUE DYDX RAW SMOOTHED
\r
279 * M 0.0044 -0.2259 -0.2259
\r
281 * T -0.1308 -0.2170 -0.2170
\r
285 * > Second sequence
\r
287 @SuppressWarnings("unchecked")
\r
289 public void ReadGlobPlotResults() {
\r
291 FileInputStream fio;
\r
293 fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "globplot.out");
\r
294 HashMap<String, Set<Score>> aseqs = SequenceUtil.readGlobPlot(fio);
\r
295 assertNotNull(aseqs);
\r
296 assertEquals(aseqs.size(), 3);
\r
298 String fsdf = null;
\r
299 Set<Score> scores = null;
\r
300 for (String fs : aseqs.keySet()) {
\r
301 if ("Foobar_dundeefriends".contains(fs)) {
\r
303 scores = aseqs.get(fs);
\r
305 assertEquals(scores.size(), 5);
\r
308 ScoreManager sm = ScoreManager.newInstanceSingleSequence(scores);
\r
309 sm.writeOut(new PrintWriter(System.out, true));
\r
311 for (Score score : scores) {
\r
312 if (score.getMethod().equals(GlobProtResult.Disorder.toString())) {
\r
313 assertEquals(score.getRanges().size(), 7);
\r
314 assertTrue(score.getScores().isEmpty());
\r
316 if (GlobProtResult.valueOf(score.getMethod()) == GlobProtResult.Dydx) {
\r
317 assertFalse(score.getScores().isEmpty());
\r
318 assertTrue(score.getRanges().isEmpty());
\r
322 } catch (FileNotFoundException e) {
\r
323 e.printStackTrace();
\r
324 fail(e.getLocalizedMessage());
\r
325 } catch (IOException e) {
\r
326 e.printStackTrace();
\r
327 fail(e.getLocalizedMessage());
\r
328 } catch (UnknownFileFormatException e) {
\r
329 e.printStackTrace();
\r
330 fail(e.getLocalizedMessage());
\r
335 public void ReadIUPredForShortAndLongDisorder() {
\r
337 Map<String, Score> scores = SequenceUtil.readIUPred(new File(AllTestSuit.TEST_DATA_PATH, "out.long"));
\r
338 ScoreManager man = ScoreManager.newInstanceSingleScore(scores);
\r
339 assertNotNull(scores);
\r
340 assertEquals(3, scores.size());
\r
342 Score score = scores.get("Foobar_dundeefriends");
\r
343 assertNotNull(score);
\r
344 assertEquals(0, score.getRanges().size());
\r
345 assertEquals(568, score.getScores().size());
\r
346 assertEquals("Long", score.getMethod());
\r
348 score = scores.get("Foobar");
\r
349 assertNotNull(score);
\r
350 assertEquals(0, score.getRanges().size());
\r
351 assertEquals(481, score.getScores().size());
\r
352 assertEquals("Long", score.getMethod());
\r
354 score = scores.get("dundeefriends");
\r
355 assertNotNull(score);
\r
356 assertEquals(0, score.getRanges().size());
\r
357 assertEquals(513, score.getScores().size());
\r
358 assertEquals("Long", score.getMethod());
\r
359 } catch (IOException e) {
\r
360 e.printStackTrace();
\r
361 fail(e.getLocalizedMessage());
\r
362 } catch (UnknownFileFormatException e) {
\r
363 e.printStackTrace();
\r
364 fail(e.getLocalizedMessage());
\r
369 public void ReadIUPredForGlobDomain() {
\r
371 Map<String, Score> scores = SequenceUtil.readIUPred(new File(AllTestSuit.TEST_DATA_PATH, "output.glob"));
\r
372 assertNotNull(scores);
\r
373 assertEquals(2, scores.size());
\r
374 ScoreManager man = ScoreManager.newInstanceSingleScore(scores);
\r
375 assertEquals(2, man.getNumberOfSeq());
\r
376 Score score = scores.get("P53_HUMA");
\r
377 assertNotNull(score);
\r
378 assertEquals(2, score.getRanges().size());
\r
379 assertEquals(0, score.getScores().size());
\r
380 assertEquals("Glob", score.getMethod());
\r
381 score = scores.get("Foobar_dundeefriends");
\r
382 assertEquals(0, score.getRanges().size());
\r
383 } catch (IOException e) {
\r
384 e.printStackTrace();
\r
385 fail(e.getLocalizedMessage());
\r
386 } catch (UnknownFileFormatException e) {
\r
387 e.printStackTrace();
\r
388 fail(e.getLocalizedMessage());
\r
393 public void ReadAAConResults() {
\r
395 InputStream inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "aacon_results.txt");
\r
396 HashSet<Score> result = SequenceUtil.readAAConResults(inStream);
\r
398 assertNotNull(result);
\r
399 assertEquals(result.size(), 18);
\r
401 inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "aacon_result_single.out");
\r
402 result = SequenceUtil.readAAConResults(inStream);
\r
404 assertNotNull(result);
\r
405 assertEquals(result.size(), 1);
\r
406 assertEquals(result.iterator().next().getScores().size(), 568);
\r
407 } catch (IOException e) {
\r
408 e.printStackTrace();
\r
409 fail(e.getMessage());
\r
413 public void ReadJpredResults() {
\r
415 InputStream inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "Jpred.test1.out");
\r
416 List<FastaSequence> result = SequenceUtil.readJpredFile(inStream);
\r
418 assertNotNull(result);
\r
419 assertEquals(result.size(), 19);
\r
420 } catch (IOException e) {
\r
421 e.printStackTrace();
\r
422 fail(e.getMessage());
\r