Adding AAConWS
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.BufferedWriter;\r
19 import java.io.Closeable;\r
20 import java.io.File;\r
21 import java.io.FileInputStream;\r
22 import java.io.IOException;\r
23 import java.io.InputStream;\r
24 import java.io.InputStreamReader;\r
25 import java.io.OutputStream;\r
26 import java.io.OutputStreamWriter;\r
27 import java.util.ArrayList;\r
28 import java.util.List;\r
29 import java.util.Scanner;\r
30 import java.util.logging.Level;\r
31 import java.util.regex.Matcher;\r
32 import java.util.regex.Pattern;\r
33 \r
34 /**\r
35  * Utility class for operations on sequences\r
36  * \r
37  * @author Petr Troshin\r
38  * @version 1.0\r
39  */\r
40 public final class SequenceUtil {\r
41 \r
42         /**\r
43          * A whitespace character: [\t\n\x0B\f\r]\r
44          */\r
45         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
46 \r
47         /**\r
48          * A digit\r
49          */\r
50         public static final Pattern DIGIT = Pattern.compile("\\d");\r
51 \r
52         /**\r
53          * Non word\r
54          */\r
55         public static final Pattern NONWORD = Pattern.compile("\\W");\r
56 \r
57         /**\r
58          * Valid Amino acids\r
59          */\r
60         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
61                         Pattern.CASE_INSENSITIVE);\r
62 \r
63         /**\r
64          * inversion of AA pattern\r
65          */\r
66         public static final Pattern NON_AA = Pattern.compile(\r
67                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
68 \r
69         /**\r
70          * Same as AA pattern but with two additional letters - XU\r
71          */\r
72         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
73                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
74 \r
75         /**\r
76          * Nucleotides a, t, g, c, u\r
77          */\r
78         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
79                         Pattern.CASE_INSENSITIVE);\r
80 \r
81         /**\r
82          * Ambiguous nucleotide\r
83          */\r
84         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
85                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
86         /**\r
87          * Non nucleotide\r
88          */\r
89         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
90                         Pattern.CASE_INSENSITIVE);\r
91 \r
92         private SequenceUtil() {\r
93         } // utility class, no instantiation\r
94 \r
95         /*\r
96          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
97          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
98          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
99          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
100          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
101          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
102          * BufferedWriter fasta_out = new BufferedWriter( new\r
103          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
104          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
105          * SysPrefs.newlinechar); fasta_out.close(); }\r
106          */\r
107 \r
108         /**\r
109          * @return true is the sequence contains only letters a,c, t, g, u\r
110          */\r
111         public static boolean isNucleotideSequence(final FastaSequence s) {\r
112                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
113         }\r
114 \r
115         /**\r
116          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
117          * (!) - B char\r
118          */\r
119         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
120                 sequence = SequenceUtil.cleanSequence(sequence);\r
121                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
122                         return false;\r
123                 }\r
124                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
125                         return false;\r
126                         /*\r
127                          * System.out.format("I found the text starting at " +\r
128                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
129                          * nonDNAmatcher.end());\r
130                          */\r
131                 }\r
132                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
133                 return DNAmatcher.find();\r
134         }\r
135 \r
136         /**\r
137          * Removes all whitespace chars in the sequence string\r
138          * \r
139          * @param sequence\r
140          * @return cleaned up sequence\r
141          */\r
142         public static String cleanSequence(String sequence) {\r
143                 assert sequence != null;\r
144                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
145                 sequence = m.replaceAll("").toUpperCase();\r
146                 return sequence;\r
147         }\r
148 \r
149         /**\r
150          * Removes all special characters and digits as well as whitespace chars\r
151          * from the sequence\r
152          * \r
153          * @param sequence\r
154          * @return cleaned up sequence\r
155          */\r
156         public static String deepCleanSequence(String sequence) {\r
157                 sequence = SequenceUtil.cleanSequence(sequence);\r
158                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
159                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
160                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
161                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
162                 return sequence;\r
163         }\r
164 \r
165         /**\r
166          * @param sequence\r
167          * @return true is the sequence is a protein sequence, false overwise\r
168          */\r
169         public static boolean isProteinSequence(String sequence) {\r
170                 sequence = SequenceUtil.cleanSequence(sequence);\r
171                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
172                         return false;\r
173                 }\r
174                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
175                         return false;\r
176                 }\r
177                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
178                         return false;\r
179                 }\r
180                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
181                 return protmatcher.find();\r
182         }\r
183 \r
184         /**\r
185          * Check whether the sequence confirms to amboguous protein sequence\r
186          * \r
187          * @param sequence\r
188          * @return return true only if the sequence if ambiguous protein sequence\r
189          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
190          *         protein or DNA\r
191          */\r
192         public static boolean isAmbiguosProtein(String sequence) {\r
193                 sequence = SequenceUtil.cleanSequence(sequence);\r
194                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
195                         return false;\r
196                 }\r
197                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
198                         return false;\r
199                 }\r
200                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
201                         return false;\r
202                 }\r
203                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
204                         return false;\r
205                 }\r
206                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
207                 return amb_prot.find();\r
208         }\r
209 \r
210         /**\r
211          * Writes list of FastaSequeces into the outstream formatting the sequence\r
212          * so that it contains width chars on each line\r
213          * \r
214          * @param outstream\r
215          * @param sequences\r
216          * @param width\r
217          *            - the maximum number of characters to write in one line\r
218          * @throws IOException\r
219          */\r
220         public static void writeFasta(final OutputStream outstream,\r
221                         final List<FastaSequence> sequences, final int width)\r
222                         throws IOException {\r
223                 writeFastaKeepTheStream(outstream, sequences, width);\r
224                 outstream.close();\r
225         }\r
226 \r
227         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
228                         final List<FastaSequence> sequences, final int width)\r
229                         throws IOException {\r
230                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
231                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
232                 for (final FastaSequence fs : sequences) {\r
233                         fastawriter.write(">" + fs.getId() + "\n");\r
234                         fastawriter.write(fs.getFormatedSequence(width));\r
235                         fastawriter.write("\n");\r
236                 }\r
237                 fastawriter.flush();\r
238                 writer.flush();\r
239         }\r
240 \r
241         /**\r
242          * Reads fasta sequences from inStream into the list of FastaSequence\r
243          * objects\r
244          * \r
245          * @param inStream\r
246          *            from\r
247          * @return list of FastaSequence objects\r
248          * @throws IOException\r
249          */\r
250         public static List<FastaSequence> readFasta(final InputStream inStream)\r
251                         throws IOException {\r
252                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
253 \r
254                 final BufferedReader infasta = new BufferedReader(\r
255                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
256                 final Pattern pattern = Pattern.compile("//s+");\r
257 \r
258                 String line;\r
259                 String sname = "", seqstr = null;\r
260                 do {\r
261                         line = infasta.readLine();\r
262                         if ((line == null) || line.startsWith(">")) {\r
263                                 if (seqstr != null) {\r
264                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
265                                 }\r
266                                 sname = line; // remove >\r
267                                 seqstr = "";\r
268                         } else {\r
269                                 final String subseq = pattern.matcher(line).replaceAll("");\r
270                                 seqstr += subseq;\r
271                         }\r
272                 } while (line != null);\r
273 \r
274                 infasta.close();\r
275                 return seqs;\r
276         }\r
277 \r
278         /**\r
279          * Writes FastaSequence in the file, each sequence will take one line only\r
280          * \r
281          * @param os\r
282          * @param sequences\r
283          * @throws IOException\r
284          */\r
285         public static void writeFasta(final OutputStream os,\r
286                         final List<FastaSequence> sequences) throws IOException {\r
287                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
288                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
289                 for (final FastaSequence fs : sequences) {\r
290                         fasta_out.write(fs.getOnelineFasta());\r
291                 }\r
292                 fasta_out.close();\r
293                 outWriter.close();\r
294         }\r
295 \r
296         public static List<AnnotatedSequence> readJRonn(final File result)\r
297                         throws IOException, UnknownFileFormatException {\r
298                 InputStream input = new FileInputStream(result);\r
299                 List<AnnotatedSequence> sequences = readJRonn(input);\r
300                 input.close();\r
301                 return sequences;\r
302         }\r
303 \r
304         /**\r
305          * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42\r
306          * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited\r
307          * \r
308          * @param inStream\r
309          * @return\r
310          * @throws IOException\r
311          * @throws UnknownFileFormatException\r
312          */\r
313         public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
314                         throws IOException, UnknownFileFormatException {\r
315                 final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
316 \r
317                 final BufferedReader infasta = new BufferedReader(\r
318                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
319 \r
320                 String line;\r
321                 String sname = "";\r
322                 do {\r
323                         line = infasta.readLine();\r
324                         if (line == null || line.isEmpty()) {\r
325                                 // skip empty lines\r
326                                 continue;\r
327                         }\r
328                         if (line.startsWith(">")) {\r
329                                 // read name\r
330                                 sname = line.trim().substring(1);\r
331                                 // read sequence line\r
332                                 line = infasta.readLine();\r
333                                 final String sequence = line.replace("\t", "");\r
334                                 // read annotation line\r
335                                 line = infasta.readLine();\r
336                                 String[] annotValues = line.split("\t");\r
337                                 float[] annotation = convertToNumber(annotValues);\r
338                                 if (annotation.length != sequence.length()) {\r
339                                         throw new UnknownFileFormatException(\r
340                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
341                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
342                                 }\r
343                                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
344                         }\r
345                 } while (line != null);\r
346 \r
347                 infasta.close();\r
348                 return seqs;\r
349         }\r
350 \r
351         private static float[] convertToNumber(String[] annotValues)\r
352                         throws UnknownFileFormatException {\r
353                 float[] annotation = new float[annotValues.length];\r
354                 try {\r
355                         for (int i = 0; i < annotation.length; i++) {\r
356                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
357                         }\r
358                 } catch (NumberFormatException e) {\r
359                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
360                                         e.getCause());\r
361                 }\r
362                 return annotation;\r
363         }\r
364 \r
365         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
366                         + ">sequence_name\n "\r
367                         + "M    V       S\n"\r
368                         + "0.43 0.22    0.65\n"\r
369                         + "Where first line is the sequence name,\n"\r
370                         + "second line is the tab delimited sequence,\n"\r
371                         + "third line contains tab delimited disorder prediction values.\n"\r
372                         + "No lines are allowed between these three. Additionally, the number of  "\r
373                         + "sequence residues must be equal to the number of the disorder values.";\r
374 \r
375         /**\r
376          * Closes the Closable and logs the exception if any\r
377          * \r
378          * @param log\r
379          * @param stream\r
380          */\r
381         public final static void closeSilently(java.util.logging.Logger log,\r
382                         Closeable stream) {\r
383                 if (stream != null) {\r
384                         try {\r
385                                 stream.close();\r
386                         } catch (IOException e) {\r
387                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
388                         }\r
389                 }\r
390         }\r
391 \r
392         /**\r
393          * \r
394          * TODO complete!\r
395          * \r
396          * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983\r
397          * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512\r
398          * 0.37094\r
399          * \r
400          * @param input\r
401          * @return\r
402          * @throws IOException\r
403          * @throws UnknownFileFormatException\r
404          */\r
405         public static List<MultiAnnotatedSequence<DisemblResultAnnot>> readDisembl(\r
406                         final InputStream input) throws IOException,\r
407                         UnknownFileFormatException {\r
408                 Scanner scan = new Scanner(input);\r
409                 scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");\r
410                 if (!scan.hasNext()) {\r
411                         throw new UnknownFileFormatException(\r
412                                         "In Disembl score format each seqeunce score is expected to start from the line: "\r
413                                                         + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."\r
414                                                         + " No such line was found!");\r
415                 }\r
416 \r
417                 List<MultiAnnotatedSequence<DisemblResultAnnot>> results = new ArrayList<MultiAnnotatedSequence<DisemblResultAnnot>>();\r
418                 int seqCounter = 0;\r
419                 while (scan.hasNext()) {\r
420                         seqCounter++;\r
421                         String singleSeq = scan.next();\r
422                         Scanner scansingle = new Scanner(singleSeq);\r
423                         StringBuffer seqbuffer = new StringBuffer();\r
424                         List<Float> coils = new ArrayList<Float>();\r
425                         List<Float> rem = new ArrayList<Float>();\r
426                         List<Float> hotloops = new ArrayList<Float>();\r
427 \r
428                         MultiAnnotatedSequence<DisemblResultAnnot> disemblRes = new MultiAnnotatedSequence<DisemblResultAnnot>(\r
429                                         DisemblResultAnnot.class);\r
430 \r
431                         while (scansingle.hasNextLine()) {\r
432                                 String valueLine = scansingle.nextLine();\r
433                                 Scanner values = new Scanner(valueLine);\r
434                                 seqbuffer.append(values.next());\r
435                                 coils.add(values.nextFloat());\r
436                                 rem.add(values.nextFloat());\r
437                                 hotloops.add(values.nextFloat());\r
438                                 values.close();\r
439                         }\r
440                         disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);\r
441                         disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);\r
442                         disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);\r
443                         // TODO\r
444                         // disemblRes.sequence = seqbuffer.toString();\r
445                         scansingle.close();\r
446                         results.add(disemblRes);\r
447                 }\r
448 \r
449                 input.close();\r
450                 return results;\r
451         }\r
452 \r
453 }\r