1a3ce5b5b6bcce51309c5e274f2545371700d906
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /* Copyright (c) 2009 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0\r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 \r
19 package compbio.data.sequence;\r
20 \r
21 import java.io.BufferedReader;\r
22 import java.io.BufferedWriter;\r
23 import java.io.Closeable;\r
24 import java.io.IOException;\r
25 import java.io.InputStream;\r
26 import java.io.InputStreamReader;\r
27 import java.io.OutputStream;\r
28 import java.io.OutputStreamWriter;\r
29 import java.util.ArrayList;\r
30 import java.util.List;\r
31 import java.util.logging.Level;\r
32 import java.util.regex.Matcher;\r
33 import java.util.regex.Pattern;\r
34 \r
35 /**\r
36  * Utility class for operations on sequences\r
37  * \r
38  * @author pvtroshin\r
39  * \r
40  *         Date September 2009\r
41  */\r
42 public final class SequenceUtil {\r
43 \r
44     /**\r
45      * A whitespace character: [\t\n\x0B\f\r]\r
46      */\r
47     public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
48 \r
49     /**\r
50      * A digit\r
51      */\r
52     public static final Pattern DIGIT = Pattern.compile("\\d");\r
53 \r
54     /**\r
55      * Non word\r
56      */\r
57     public static final Pattern NONWORD = Pattern.compile("\\W");\r
58 \r
59     /**\r
60      * Valid Amino acids\r
61      */\r
62     public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
63             Pattern.CASE_INSENSITIVE);\r
64 \r
65     /**\r
66      * inversion of AA pattern\r
67      */\r
68     public static final Pattern NON_AA = Pattern.compile(\r
69             "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
70 \r
71     /**\r
72      * Same as AA pattern but with two additional letters - XU\r
73      */\r
74     public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
75             "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
76 \r
77     /**\r
78      * Nucleotides a, t, g, c, u\r
79      */\r
80     public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
81             Pattern.CASE_INSENSITIVE);\r
82 \r
83     /**\r
84      * Ambiguous nucleotide\r
85      */\r
86     public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
87             "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
88     /**\r
89      * Non nucleotide\r
90      */\r
91     public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
92             Pattern.CASE_INSENSITIVE);\r
93 \r
94     private SequenceUtil() {\r
95     } // utility class, no instantiation\r
96 \r
97     /*\r
98      * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
99      * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
100      * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
101      * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
102      * SysPrefs.newlinechar); pir_out.close(); }\r
103      * \r
104      * public static void write_FastaSeq(OutputStream os, FastaSequence seq)\r
105      * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new\r
106      * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
107      * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
108      * SysPrefs.newlinechar); fasta_out.close(); }\r
109      */\r
110 \r
111     /**\r
112      * @return true is the sequence contains only letters a,c, t, g, u\r
113      */\r
114     public static boolean isNucleotideSequence(FastaSequence s) {\r
115         return isNonAmbNucleotideSequence(s.getSequence());\r
116     }\r
117 \r
118     /**\r
119      * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
120      * (!) - B char\r
121      */\r
122     public static boolean isNonAmbNucleotideSequence(String sequence) {\r
123         sequence = cleanSequence(sequence);\r
124         if (DIGIT.matcher(sequence).find()) {\r
125             return false;\r
126         }\r
127         if (NON_NUCLEOTIDE.matcher(sequence).find()) {\r
128             return false;\r
129             /*\r
130              * System.out.format("I found the text starting at " +\r
131              * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
132              * nonDNAmatcher.end());\r
133              */\r
134         }\r
135         Matcher DNAmatcher = NUCLEOTIDE.matcher(sequence);\r
136         return DNAmatcher.find();\r
137     }\r
138 \r
139     /**\r
140      * Removes all whitespace chars in the sequence string\r
141      * \r
142      * @param sequence\r
143      * @return cleaned up sequence\r
144      */\r
145     public static String cleanSequence(String sequence) {\r
146         assert sequence != null;\r
147         final Matcher m = WHITE_SPACE.matcher(sequence);\r
148         sequence = m.replaceAll("").toUpperCase();\r
149         return sequence;\r
150     }\r
151 \r
152     /**\r
153      * Removes all special characters and digits as well as whitespace chars\r
154      * from the sequence\r
155      * \r
156      * @param sequence\r
157      * @return cleaned up sequence\r
158      */\r
159     public static String deepCleanSequence(String sequence) {\r
160         sequence = cleanSequence(sequence);\r
161         sequence = DIGIT.matcher(sequence).replaceAll("");\r
162         sequence = NONWORD.matcher(sequence).replaceAll("");\r
163         Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
164         sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
165         return sequence;\r
166     }\r
167 \r
168     /**\r
169      * \r
170      * @param sequence\r
171      * @return true is the sequence is a protein sequence, false overwise\r
172      */\r
173     public static boolean isProteinSequence(String sequence) {\r
174         sequence = cleanSequence(sequence);\r
175         if (isNonAmbNucleotideSequence(sequence)) {\r
176             return false;\r
177         }\r
178         if (DIGIT.matcher(sequence).find()) {\r
179             return false;\r
180         }\r
181         if (NON_AA.matcher(sequence).find()) {\r
182             return false;\r
183         }\r
184         Matcher protmatcher = AA.matcher(sequence);\r
185         return protmatcher.find();\r
186     }\r
187 \r
188     /**\r
189      * Check whether the sequence confirms to amboguous protein sequence\r
190      * \r
191      * @param sequence\r
192      * @return return true only if the sequence if ambiguous protein sequence\r
193      *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
194      *         protein or DNA\r
195      */\r
196     public static boolean isAmbiguosProtein(String sequence) {\r
197         sequence = cleanSequence(sequence);\r
198         if (isNonAmbNucleotideSequence(sequence)) {\r
199             return false;\r
200         }\r
201         if (DIGIT.matcher(sequence).find()) {\r
202             return false;\r
203         }\r
204         if (NON_AA.matcher(sequence).find()) {\r
205             return false;\r
206         }\r
207         if (AA.matcher(sequence).find()) {\r
208             return false;\r
209         }\r
210         Matcher amb_prot = AMBIGUOUS_AA.matcher(sequence);\r
211         return amb_prot.find();\r
212     }\r
213 \r
214     /**\r
215      * Writes list of FastaSequeces into the outstream formatting the sequence\r
216      * so that it contains width chars on each line\r
217      * \r
218      * @param outstream\r
219      * @param sequences\r
220      * @param width\r
221      *            - the maximum number of characters to write in one line\r
222      * @throws IOException\r
223      */\r
224     public static void writeFasta(OutputStream outstream,\r
225             List<FastaSequence> sequences, int width) throws IOException {\r
226         OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
227         BufferedWriter fastawriter = new BufferedWriter(writer);\r
228         for (FastaSequence fs : sequences) {\r
229             fastawriter.write(fs.getOnelineFasta());\r
230         }\r
231         outstream.flush();\r
232         fastawriter.close();\r
233         writer.close();\r
234     }\r
235 \r
236     /**\r
237      * Reads fasta sequences from inStream into the list of FastaSequence\r
238      * objects\r
239      * \r
240      * @param inStream\r
241      *            from\r
242      * @return list of FastaSequence objects\r
243      * @throws IOException\r
244      */\r
245     public static List<FastaSequence> readFasta(InputStream inStream)\r
246             throws IOException {\r
247         List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
248         InputStreamReader inReader = new InputStreamReader(inStream);\r
249         BufferedReader infasta = new BufferedReader(inReader);\r
250         Pattern pattern = Pattern.compile("//s+");\r
251 \r
252         String line;\r
253         String sname = "", seqstr = null;\r
254         do {\r
255             line = infasta.readLine();\r
256             if (line == null || line.startsWith(">")) {\r
257                 if (seqstr != null)\r
258                     seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
259                 sname = line; // remove >\r
260                 seqstr = "";\r
261             } else {\r
262                 String subseq = pattern.matcher(line).replaceAll("");\r
263                 seqstr += subseq;\r
264             }\r
265         } while (line != null);\r
266         inReader.close();\r
267         infasta.close();\r
268         return seqs;\r
269     }\r
270 \r
271     /**\r
272      * Writes FastaSequence in the file, each sequence will take one line only\r
273      * \r
274      * @param os\r
275      * @param sequences\r
276      * @throws IOException\r
277      */\r
278     public static void writeFasta(OutputStream os, List<FastaSequence> sequences)\r
279             throws IOException {\r
280         OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
281         BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
282         for (FastaSequence fs : sequences) {\r
283             fasta_out.write(fs.getOnelineFasta());\r
284         }\r
285         fasta_out.close();\r
286         outWriter.close();\r
287     }\r
288 \r
289     /**\r
290      * Closes the Closable and logs the exception if any\r
291      * \r
292      * @param log\r
293      * @param stream\r
294      */\r
295     public final static void closeSilently(java.util.logging.Logger log,\r
296             Closeable stream) {\r
297         if (stream != null) {\r
298             try {\r
299                 stream.close();\r
300             } catch (IOException e) {\r
301                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
302             }\r
303         }\r
304     }\r
305 \r
306 }\r