2 * Copyright (c) 2009 Peter Troshin JAva Bioinformatics Analysis Web Services
\r
3 * (JABAWS) @version: 1.0 This library is free software; you can redistribute it
\r
4 * and/or modify it under the terms of the Apache License version 2 as published
\r
5 * by the Apache Software Foundation This library is distributed in the hope
\r
6 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
\r
7 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
8 * Apache License for more details. A copy of the license is in
\r
9 * apache_license.txt. It is also available here:
\r
10 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or
\r
11 * derived work distributed in source code form must include this copyright and
\r
14 package compbio.data.sequence;
\r
16 import static org.testng.AssertJUnit.assertEquals;
\r
17 import static org.testng.AssertJUnit.assertFalse;
\r
18 import static org.testng.AssertJUnit.assertNotNull;
\r
19 import static org.testng.AssertJUnit.assertTrue;
\r
20 import static org.testng.AssertJUnit.fail;
\r
22 import java.io.File;
\r
23 import java.io.FileInputStream;
\r
24 import java.io.FileNotFoundException;
\r
25 import java.io.FileOutputStream;
\r
26 import java.io.IOException;
\r
27 import java.io.InputStream;
\r
28 import java.io.PrintWriter;
\r
29 import java.util.HashMap;
\r
30 import java.util.HashSet;
\r
31 import java.util.List;
\r
32 import java.util.Map;
\r
33 import java.util.Set;
\r
35 import org.testng.annotations.Test;
\r
37 import compbio.metadata.AllTestSuit;
\r
39 public class SequenceUtilTester {
\r
42 public void testisNonAmbNucleotideSequence() {
\r
43 String dnaseq = "atgatTGACGCTGCTGatgtcgtgagtgga";
\r
44 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dnaseq));
\r
45 String dirtyDnaseq = "atgAGTggt\taGGTgc\ncgcACTgc gACtcgcGAt cgA ";
\r
46 assertTrue(SequenceUtil.isNonAmbNucleotideSequence(dirtyDnaseq));
\r
47 String nonDna = "atgfctgatgcatgcatgatgctga";
\r
48 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
50 nonDna = "atgc1tgatgcatgcatgatgctga";
\r
51 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
53 nonDna = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";
\r
54 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
55 // String ambDna = "AGTCRYMKSWHBVDN"; // see IUPAC Nucleotide Code
\r
56 assertFalse(SequenceUtil.isNonAmbNucleotideSequence(nonDna));
\r
61 public void testCleanSequence() {
\r
62 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";
\r
63 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),
\r
64 SequenceUtil.cleanSequence(dirtySeq));
\r
68 public void testDeepCleanSequence() {
\r
69 String dirtySeq = "a!t?g.A;GTggt\ta12GGTgc\ncgc23AC\rTgc gAC<>.,?!|\\|/t@cg-c¬GA=_+(0){]}[:£$&^*\"t cgA ";
\r
70 assertEquals("atgAGTggtaGGTgccgcACTgcgACtcgcGAtcgA".toUpperCase(),
\r
71 SequenceUtil.deepCleanSequence(dirtySeq));
\r
75 public void testisProteinSequence() {
\r
76 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";
\r
77 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));
\r
78 String notaSeq = "atgc1tgatgcatgcatgatgctga";
\r
79 assertFalse(SequenceUtil.isProteinSequence(notaSeq));
\r
80 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";
\r
81 assertTrue(SequenceUtil.isProteinSequence(AAseq));
\r
83 assertFalse(SequenceUtil.isProteinSequence(AAseq));
\r
88 public void testCleanProteinSequence() {
\r
89 String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA ";
\r
90 assertFalse(SequenceUtil.isProteinSequence(dirtySeq));
\r
91 // This will still be NON protein sequence despite having only correct
\r
92 // letters because the letters match perfectly the nucleotide sequence!
\r
93 assertFalse(SequenceUtil.isProteinSequence(SequenceUtil
\r
94 .cleanProteinSequence(dirtySeq)));
\r
96 String notaSeq = "atgc1tgatgcatgcatgatgmctga";
\r
97 assertFalse(SequenceUtil.isProteinSequence(notaSeq));
\r
98 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil
\r
99 .cleanProteinSequence(notaSeq)));
\r
101 String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL";
\r
102 assertTrue(SequenceUtil.isProteinSequence(AAseq));
\r
103 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil
\r
104 .cleanProteinSequence(AAseq)));
\r
107 assertFalse(SequenceUtil.isProteinSequence(AAseq));
\r
108 assertTrue(SequenceUtil.isProteinSequence(SequenceUtil
\r
109 .cleanProteinSequence(AAseq)));
\r
113 public void testReadWriteFasta() {
\r
116 FileInputStream fio = new FileInputStream(
\r
117 AllTestSuit.TEST_DATA_PATH + "TO1381.fasta");
\r
118 assertNotNull(fio);
\r
119 List<FastaSequence> fseqs = SequenceUtil.readFasta(fio);
\r
120 assertNotNull(fseqs);
\r
121 assertEquals(3, fseqs.size());
\r
122 assertEquals(3, fseqs.size());
\r
124 FileOutputStream fou = new FileOutputStream(
\r
125 AllTestSuit.TEST_DATA_PATH + "TO1381.fasta.written");
\r
126 SequenceUtil.writeFasta(fou, fseqs);
\r
128 FileOutputStream fou20 = new FileOutputStream(
\r
129 AllTestSuit.TEST_DATA_PATH + "TO1381.fasta20.written");
\r
130 SequenceUtil.writeFasta(fou20, fseqs, 21);
\r
133 } catch (FileNotFoundException e) {
\r
134 e.printStackTrace();
\r
135 fail(e.getLocalizedMessage());
\r
136 } catch (IOException e) {
\r
137 e.printStackTrace();
\r
138 fail(e.getLocalizedMessage());
\r
143 * This test tests the loading of horizontally formatted Jronn output file
\r
146 public void loadJronnFile() {
\r
148 FileInputStream fio;
\r
150 fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH + "jronn.out");
\r
151 Map<String, Score> aseqs = SequenceUtil.readJRonn(fio);
\r
152 assertNotNull(aseqs);
\r
153 assertEquals(aseqs.size(), 3);
\r
154 Score aseq = aseqs.get("Foobar");
\r
155 assertNotNull(aseq);
\r
156 assertNotNull(aseq.getScores());
\r
157 // System.out.println(aseq);
\r
158 assertEquals(aseq.getScores().size(), aseq.getScores().size());
\r
160 } catch (FileNotFoundException e) {
\r
161 e.printStackTrace();
\r
162 fail(e.getLocalizedMessage());
\r
163 } catch (IOException e) {
\r
164 e.printStackTrace();
\r
165 fail(e.getLocalizedMessage());
\r
166 } catch (UnknownFileFormatException e) {
\r
167 e.printStackTrace();
\r
168 fail(e.getLocalizedMessage());
\r
178 * This test tests the loading of a Disembl output file
\r
182 * M 0.86010 0.88512 0.37094
\r
184 * T 0.79983 0.85864 0.44331
\r
187 @SuppressWarnings("unchecked")
\r
189 public void testReadDisemblResults() {
\r
191 FileInputStream fio;
\r
193 fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH
\r
195 Map<String, Set<Score>> aseqs = SequenceUtil.readDisembl(fio);
\r
196 assertNotNull(aseqs);
\r
197 assertEquals(aseqs.size(), 3);
\r
198 ScoreManager sman = ScoreManager.newInstance(aseqs);
\r
200 for (String fs : aseqs.keySet()) {
\r
201 assertTrue(" Foobar_dundeefriends Foobar dundeefriends "
\r
203 Set<Score> scores = aseqs.get(fs);
\r
204 assertEquals(scores.size(), 3);
\r
207 } catch (FileNotFoundException e) {
\r
208 e.printStackTrace();
\r
209 fail(e.getLocalizedMessage());
\r
210 } catch (IOException e) {
\r
211 e.printStackTrace();
\r
212 fail(e.getLocalizedMessage());
\r
213 } catch (UnknownFileFormatException e) {
\r
214 e.printStackTrace();
\r
215 fail(e.getLocalizedMessage());
\r
220 * This test tests the loading of a GlobPlot output file
\r
224 * >Foobar_dundeefriends
\r
226 * # GlobDoms 2-358, 373-568
\r
228 * # Disorder 1-5, 206-218, 243-250, 288-300, 313-324, 359-372, 475-481
\r
230 * # RESIDUE DYDX RAW SMOOTHED
\r
232 * M 0.0044 -0.2259 -0.2259
\r
234 * T -0.1308 -0.2170 -0.2170
\r
238 * > Second sequence
\r
240 @SuppressWarnings("unchecked")
\r
242 public void testReadGlobPlotResults() {
\r
244 FileInputStream fio;
\r
246 fio = new FileInputStream(AllTestSuit.TEST_DATA_PATH
\r
248 HashMap<String, Set<Score>> aseqs = SequenceUtil.readGlobPlot(fio);
\r
249 assertNotNull(aseqs);
\r
250 assertEquals(aseqs.size(), 3);
\r
252 String fsdf = null;
\r
253 Set<Score> scores = null;
\r
254 for (String fs : aseqs.keySet()) {
\r
255 if ("Foobar_dundeefriends".contains(fs)) {
\r
257 scores = aseqs.get(fs);
\r
259 assertEquals(scores.size(), 5);
\r
262 ScoreManager sm = ScoreManager.newInstanceSingleSequence(scores);
\r
263 sm.writeOut(new PrintWriter(System.out, true));
\r
265 for (Score score : scores) {
\r
267 if (score.getMethod()
\r
268 .equals(GlobProtResult.Disorder.toString())) {
\r
269 assertEquals(score.getRanges().size(), 7);
\r
270 assertTrue(score.getScores().isEmpty());
\r
272 if (GlobProtResult.valueOf(score.getMethod()) == GlobProtResult.Dydx) {
\r
273 assertFalse(score.getScores().isEmpty());
\r
274 assertTrue(score.getRanges().isEmpty());
\r
278 } catch (FileNotFoundException e) {
\r
279 e.printStackTrace();
\r
280 fail(e.getLocalizedMessage());
\r
281 } catch (IOException e) {
\r
282 e.printStackTrace();
\r
283 fail(e.getLocalizedMessage());
\r
284 } catch (UnknownFileFormatException e) {
\r
285 e.printStackTrace();
\r
286 fail(e.getLocalizedMessage());
\r
291 public void testReadIUPredForShortAndLongDisorder() {
\r
293 Map<String, Score> scores = SequenceUtil.readIUPred(new File(
\r
294 AllTestSuit.TEST_DATA_PATH, "output.long"));
\r
295 ScoreManager man = ScoreManager.newInstanceSingleScore(scores);
\r
296 // man.writeOut(new PrintWriter(System.out, true));
\r
297 assertNotNull(scores);
\r
298 assertEquals(2, scores.size());
\r
300 Score score = scores.get("P50_HUMAN");
\r
301 assertNotNull(score);
\r
302 assertEquals(0, score.getRanges().size());
\r
303 assertEquals(393, score.getScores().size());
\r
304 assertEquals("Long", score.getMethod());
\r
305 } catch (IOException e) {
\r
306 e.printStackTrace();
\r
307 fail(e.getLocalizedMessage());
\r
308 } catch (UnknownFileFormatException e) {
\r
309 e.printStackTrace();
\r
310 fail(e.getLocalizedMessage());
\r
315 public void testReadIUPredForGlobDomain() {
\r
317 Map<String, Score> scores = SequenceUtil.readIUPred(new File(
\r
318 AllTestSuit.TEST_DATA_PATH, "output.glob"));
\r
319 assertNotNull(scores);
\r
320 assertEquals(2, scores.size());
\r
321 ScoreManager man = ScoreManager.newInstanceSingleScore(scores);
\r
322 // man.writeOut(new PrintWriter(System.out, true));
\r
323 assertEquals(2, man.getNumberOfSeq());
\r
324 Score score = scores.get("P53_HUMA");
\r
325 assertNotNull(score);
\r
326 assertEquals(2, score.getRanges().size());
\r
327 assertEquals(0, score.getScores().size());
\r
328 assertEquals("Glob", score.getMethod());
\r
330 score = scores.get("Foobar_dundeefriends");
\r
331 assertEquals(0, score.getRanges().size());
\r
332 } catch (IOException e) {
\r
333 e.printStackTrace();
\r
334 fail(e.getLocalizedMessage());
\r
335 } catch (UnknownFileFormatException e) {
\r
336 e.printStackTrace();
\r
337 fail(e.getLocalizedMessage());
\r
341 public void testReadAAConResults() {
\r
343 InputStream inStream = new FileInputStream(
\r
344 AllTestSuit.TEST_DATA_PATH + "aacon_results.txt");
\r
345 HashSet<Score> result = SequenceUtil.readAAConResults(inStream);
\r
347 assertNotNull(result);
\r
348 assertEquals(result.size(), 18);
\r
350 inStream = new FileInputStream(AllTestSuit.TEST_DATA_PATH
\r
351 + "aacon_result_single.out");
\r
352 result = SequenceUtil.readAAConResults(inStream);
\r
354 assertNotNull(result);
\r
355 assertEquals(result.size(), 1);
\r
356 assertEquals(result.iterator().next().getScores().size(), 568);
\r
357 } catch (IOException e) {
\r
358 e.printStackTrace();
\r
359 fail(e.getMessage());
\r