99a8147fc5cfbee363077c6b86d969641050d37a
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /* \r
2  * @(#)SequenceUtil.java 1.0 September 2009\r
3  * \r
4  * Copyright (c) 2009 Peter Troshin\r
5  *  \r
6  * Jalview Web Services version: 2.0     \r
7  * \r
8  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
9  *  Apache License version 2 as published by the Apache Software Foundation\r
10  * \r
11  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
12  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
13  *  License for more details.\r
14  * \r
15  *  A copy of the license is in apache_license.txt. It is also available here:\r
16  * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
17  * \r
18  * Any republication or derived work distributed in source code form\r
19  * must include this copyright and license notice.\r
20  */\r
21 \r
22 package compbio.data.sequence;\r
23 \r
24 import java.io.BufferedReader;\r
25 import java.io.BufferedWriter;\r
26 import java.io.Closeable;\r
27 import java.io.File;\r
28 import java.io.FileInputStream;\r
29 import java.io.IOException;\r
30 import java.io.InputStream;\r
31 import java.io.InputStreamReader;\r
32 import java.io.OutputStream;\r
33 import java.io.OutputStreamWriter;\r
34 import java.util.ArrayList;\r
35 import java.util.List;\r
36 import java.util.logging.Level;\r
37 import java.util.regex.Matcher;\r
38 import java.util.regex.Pattern;\r
39 \r
40 /**\r
41  * Utility class for operations on sequences\r
42  * \r
43  * @author Petr Troshin\r
44  * @version 1.0\r
45  */\r
46 public final class SequenceUtil {\r
47 \r
48     /**\r
49      * A whitespace character: [\t\n\x0B\f\r]\r
50      */\r
51     public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
52 \r
53     /**\r
54      * A digit\r
55      */\r
56     public static final Pattern DIGIT = Pattern.compile("\\d");\r
57 \r
58     /**\r
59      * Non word\r
60      */\r
61     public static final Pattern NONWORD = Pattern.compile("\\W");\r
62 \r
63     /**\r
64      * Valid Amino acids\r
65      */\r
66     public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
67             Pattern.CASE_INSENSITIVE);\r
68 \r
69     /**\r
70      * inversion of AA pattern\r
71      */\r
72     public static final Pattern NON_AA = Pattern.compile(\r
73             "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
74 \r
75     /**\r
76      * Same as AA pattern but with two additional letters - XU\r
77      */\r
78     public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
79             "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
80 \r
81     /**\r
82      * Nucleotides a, t, g, c, u\r
83      */\r
84     public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
85             Pattern.CASE_INSENSITIVE);\r
86 \r
87     /**\r
88      * Ambiguous nucleotide\r
89      */\r
90     public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
91             "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
92     /**\r
93      * Non nucleotide\r
94      */\r
95     public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
96             Pattern.CASE_INSENSITIVE);\r
97 \r
    /**
     * Private constructor: this class only exposes static members and is not
     * meant to be instantiated.
     */
    private SequenceUtil() {
    } // utility class, no instantiation
100 \r
101     /*\r
102      * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
103      * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
104      * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
105      * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
106      * SysPrefs.newlinechar); pir_out.close(); }\r
107      * \r
108      * public static void write_FastaSeq(OutputStream os, FastaSequence seq)\r
109      * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new\r
110      * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
111      * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
112      * SysPrefs.newlinechar); fasta_out.close(); }\r
113      */\r
114 \r
115     /**\r
116      * @return true is the sequence contains only letters a,c, t, g, u\r
117      */\r
118     public static boolean isNucleotideSequence(final FastaSequence s) {\r
119         return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
120     }\r
121 \r
122     /**\r
123      * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
124      * (!) - B char\r
125      */\r
126     public static boolean isNonAmbNucleotideSequence(String sequence) {\r
127         sequence = SequenceUtil.cleanSequence(sequence);\r
128         if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
129             return false;\r
130         }\r
131         if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
132             return false;\r
133             /*\r
134              * System.out.format("I found the text starting at " +\r
135              * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
136              * nonDNAmatcher.end());\r
137              */\r
138         }\r
139         final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
140         return DNAmatcher.find();\r
141     }\r
142 \r
143     /**\r
144      * Removes all whitespace chars in the sequence string\r
145      * \r
146      * @param sequence\r
147      * @return cleaned up sequence\r
148      */\r
149     public static String cleanSequence(String sequence) {\r
150         assert sequence != null;\r
151         final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
152         sequence = m.replaceAll("").toUpperCase();\r
153         return sequence;\r
154     }\r
155 \r
156     /**\r
157      * Removes all special characters and digits as well as whitespace chars\r
158      * from the sequence\r
159      * \r
160      * @param sequence\r
161      * @return cleaned up sequence\r
162      */\r
163     public static String deepCleanSequence(String sequence) {\r
164         sequence = SequenceUtil.cleanSequence(sequence);\r
165         sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
166         sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
167         final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
168         sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
169         return sequence;\r
170     }\r
171 \r
172     /**\r
173      * \r
174      * @param sequence\r
175      * @return true is the sequence is a protein sequence, false overwise\r
176      */\r
177     public static boolean isProteinSequence(String sequence) {\r
178         sequence = SequenceUtil.cleanSequence(sequence);\r
179         if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
180             return false;\r
181         }\r
182         if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
183             return false;\r
184         }\r
185         if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
186             return false;\r
187         }\r
188         final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
189         return protmatcher.find();\r
190     }\r
191 \r
192     /**\r
193      * Check whether the sequence confirms to amboguous protein sequence\r
194      * \r
195      * @param sequence\r
196      * @return return true only if the sequence if ambiguous protein sequence\r
197      *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
198      *         protein or DNA\r
199      */\r
200     public static boolean isAmbiguosProtein(String sequence) {\r
201         sequence = SequenceUtil.cleanSequence(sequence);\r
202         if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
203             return false;\r
204         }\r
205         if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
206             return false;\r
207         }\r
208         if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
209             return false;\r
210         }\r
211         if (SequenceUtil.AA.matcher(sequence).find()) {\r
212             return false;\r
213         }\r
214         final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
215         return amb_prot.find();\r
216     }\r
217 \r
218     /**\r
219      * Writes list of FastaSequeces into the outstream formatting the sequence\r
220      * so that it contains width chars on each line\r
221      * \r
222      * @param outstream\r
223      * @param sequences\r
224      * @param width\r
225      *            - the maximum number of characters to write in one line\r
226      * @throws IOException\r
227      */\r
228     public static void writeFasta(final OutputStream outstream,\r
229             final List<FastaSequence> sequences, final int width)\r
230             throws IOException {\r
231         final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
232         final BufferedWriter fastawriter = new BufferedWriter(writer);\r
233         for (final FastaSequence fs : sequences) {\r
234             fastawriter.write(fs.getFormatedSequence(width));\r
235         }\r
236         outstream.flush();\r
237         fastawriter.close();\r
238         writer.close();\r
239     }\r
240 \r
241     /**\r
242      * Reads fasta sequences from inStream into the list of FastaSequence\r
243      * objects\r
244      * \r
245      * @param inStream\r
246      *            from\r
247      * @return list of FastaSequence objects\r
248      * @throws IOException\r
249      */\r
250     public static List<FastaSequence> readFasta(final InputStream inStream)\r
251             throws IOException {\r
252         final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
253 \r
254         final BufferedReader infasta = new BufferedReader(\r
255                 new InputStreamReader(inStream, "UTF8"), 16000);\r
256         final Pattern pattern = Pattern.compile("//s+");\r
257 \r
258         String line;\r
259         String sname = "", seqstr = null;\r
260         do {\r
261             line = infasta.readLine();\r
262             if ((line == null) || line.startsWith(">")) {\r
263                 if (seqstr != null) {\r
264                     seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
265                 }\r
266                 sname = line; // remove >\r
267                 seqstr = "";\r
268             } else {\r
269                 final String subseq = pattern.matcher(line).replaceAll("");\r
270                 seqstr += subseq;\r
271             }\r
272         } while (line != null);\r
273 \r
274         infasta.close();\r
275         return seqs;\r
276     }\r
277 \r
278     /**\r
279      * Writes FastaSequence in the file, each sequence will take one line only\r
280      * \r
281      * @param os\r
282      * @param sequences\r
283      * @throws IOException\r
284      */\r
285     public static void writeFasta(final OutputStream os,\r
286             final List<FastaSequence> sequences) throws IOException {\r
287         final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
288         final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
289         for (final FastaSequence fs : sequences) {\r
290             fasta_out.write(fs.getOnelineFasta());\r
291         }\r
292         fasta_out.close();\r
293         outWriter.close();\r
294     }\r
295 \r
296     public static List<AnnotatedSequence> readJRonn(final File result)\r
297             throws IOException, UnknownFileFormatException {\r
298         InputStream input = new FileInputStream(result);\r
299         List<AnnotatedSequence> sequences = readJRonn(input);\r
300         input.close();\r
301         return sequences;\r
302     }\r
303 \r
304     /**\r
305      * Reader for JRonn horizontal file format\r
306      * \r
307      * >Foobar\r
308      * \r
309      * M G D T T A G\r
310      * \r
311      * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
312      * \r
313      * All values are tab delimited\r
314      * \r
315      * @param inStream\r
316      * @return\r
317      * @throws IOException\r
318      * @throws UnknownFileFormatException\r
319      */\r
320     public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
321             throws IOException, UnknownFileFormatException {\r
322         final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
323 \r
324         final BufferedReader infasta = new BufferedReader(\r
325                 new InputStreamReader(inStream, "UTF8"), 16000);\r
326 \r
327         String line;\r
328         String sname = "";\r
329         do {\r
330             line = infasta.readLine();\r
331             if (line == null || line.isEmpty()) {\r
332                 // skip empty lines\r
333                 continue;\r
334             }\r
335             if (line.startsWith(">")) {\r
336                 // read name\r
337                 sname = line.trim().substring(1);\r
338                 // read sequence line\r
339                 line = infasta.readLine();\r
340                 final String sequence = line.replace("\t", "");\r
341                 // read annotation line\r
342                 line = infasta.readLine();\r
343                 String[] annotValues = line.split("\t");\r
344                 float[] annotation = convertToNumber(annotValues);\r
345                 if (annotation.length != sequence.length()) {\r
346                     throw new UnknownFileFormatException(\r
347                             "File does not look like Jronn horizontally formatted output file!\n"\r
348                                     + JRONN_WRONG_FORMAT_MESSAGE);\r
349                 }\r
350                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
351             }\r
352         } while (line != null);\r
353 \r
354         infasta.close();\r
355         return seqs;\r
356     }\r
357 \r
358     private static float[] convertToNumber(String[] annotValues)\r
359             throws UnknownFileFormatException {\r
360         float[] annotation = new float[annotValues.length];\r
361         try {\r
362             for (int i = 0; i < annotation.length; i++) {\r
363                 annotation[i] = Float.parseFloat(annotValues[i]);\r
364             }\r
365         } catch (NumberFormatException e) {\r
366             throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e\r
367                     .getCause());\r
368         }\r
369         return annotation;\r
370     }\r
371 \r
    /**
     * Description of the expected JRonn horizontal format. Used as the
     * message, or appended to the message, of the UnknownFileFormatException
     * thrown when JRonn input cannot be parsed.
     */
    private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
            + ">sequence_name\n "
            + "M        V       S\n"
            + "0.43     0.22    0.65\n"
            + "Where first line is the sequence name,\n"
            + "second line is the tab delimited sequence,\n"
            + "third line contains tab delimited disorder prediction values.\n"
            + "No lines are allowed between these three. Additionally, the number of  "
            + "sequence residues must be equal to the number of the disorder values.";
381 \r
382     /**\r
383      * Closes the Closable and logs the exception if any\r
384      * \r
385      * @param log\r
386      * @param stream\r
387      */\r
388     public final static void closeSilently(java.util.logging.Logger log,\r
389             Closeable stream) {\r
390         if (stream != null) {\r
391             try {\r
392                 stream.close();\r
393             } catch (IOException e) {\r
394                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
395             }\r
396         }\r
397     }\r
398 \r
399     public static List<AnnotatedSequence> readDisembl(final File result)\r
400             throws IOException, UnknownFileFormatException {\r
401         InputStream input = new FileInputStream(result);\r
402         List<AnnotatedSequence> sequences = readJRonn(input);\r
403         input.close();\r
404         return sequences;\r
405     }\r
406 }\r