More work on AAConWS not finished yet!
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.BufferedWriter;\r
19 import java.io.Closeable;\r
20 import java.io.File;\r
21 import java.io.FileInputStream;\r
22 import java.io.IOException;\r
23 import java.io.InputStream;\r
24 import java.io.InputStreamReader;\r
25 import java.io.OutputStream;\r
26 import java.io.OutputStreamWriter;\r
27 import java.util.ArrayList;\r
28 import java.util.List;\r
29 import java.util.Scanner;\r
30 import java.util.logging.Level;\r
31 import java.util.regex.Matcher;\r
32 import java.util.regex.Pattern;\r
33 \r
34 import compbio.conservation.Method;\r
35 \r
36 /**\r
37  * Utility class for operations on sequences\r
38  * \r
39  * @author Petr Troshin\r
40  * @version 1.0\r
41  */\r
42 public final class SequenceUtil {\r
43 \r
44         /**\r
45          * A whitespace character: [\t\n\x0B\f\r]\r
46          */\r
47         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
48 \r
49         /**\r
50          * A digit\r
51          */\r
52         public static final Pattern DIGIT = Pattern.compile("\\d");\r
53 \r
54         /**\r
55          * Non word\r
56          */\r
57         public static final Pattern NONWORD = Pattern.compile("\\W");\r
58 \r
59         /**\r
60          * Valid Amino acids\r
61          */\r
62         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
63                         Pattern.CASE_INSENSITIVE);\r
64 \r
65         /**\r
66          * inversion of AA pattern\r
67          */\r
68         public static final Pattern NON_AA = Pattern.compile(\r
69                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
70 \r
71         /**\r
72          * Same as AA pattern but with two additional letters - XU\r
73          */\r
74         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
75                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
76 \r
77         /**\r
78          * Nucleotides a, t, g, c, u\r
79          */\r
80         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
81                         Pattern.CASE_INSENSITIVE);\r
82 \r
83         /**\r
84          * Ambiguous nucleotide\r
85          */\r
86         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
87                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
88         /**\r
89          * Non nucleotide\r
90          */\r
91         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
92                         Pattern.CASE_INSENSITIVE);\r
93 \r
94         private SequenceUtil() {\r
95         } // utility class, no instantiation\r
96 \r
97         /*\r
98          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
99          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
100          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
101          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
102          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
103          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
104          * BufferedWriter fasta_out = new BufferedWriter( new\r
105          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
106          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
107          * SysPrefs.newlinechar); fasta_out.close(); }\r
108          */\r
109 \r
110         /**\r
111          * @return true is the sequence contains only letters a,c, t, g, u\r
112          */\r
113         public static boolean isNucleotideSequence(final FastaSequence s) {\r
114                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
115         }\r
116 \r
117         /**\r
118          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
119          * (!) - B char\r
120          */\r
121         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
122                 sequence = SequenceUtil.cleanSequence(sequence);\r
123                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
124                         return false;\r
125                 }\r
126                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
127                         return false;\r
128                         /*\r
129                          * System.out.format("I found the text starting at " +\r
130                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
131                          * nonDNAmatcher.end());\r
132                          */\r
133                 }\r
134                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
135                 return DNAmatcher.find();\r
136         }\r
137 \r
138         /**\r
139          * Removes all whitespace chars in the sequence string\r
140          * \r
141          * @param sequence\r
142          * @return cleaned up sequence\r
143          */\r
144         public static String cleanSequence(String sequence) {\r
145                 assert sequence != null;\r
146                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
147                 sequence = m.replaceAll("").toUpperCase();\r
148                 return sequence;\r
149         }\r
150 \r
151         /**\r
152          * Removes all special characters and digits as well as whitespace chars\r
153          * from the sequence\r
154          * \r
155          * @param sequence\r
156          * @return cleaned up sequence\r
157          */\r
158         public static String deepCleanSequence(String sequence) {\r
159                 sequence = SequenceUtil.cleanSequence(sequence);\r
160                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
161                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
162                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
163                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
164                 return sequence;\r
165         }\r
166 \r
167         /**\r
168          * @param sequence\r
169          * @return true is the sequence is a protein sequence, false overwise\r
170          */\r
171         public static boolean isProteinSequence(String sequence) {\r
172                 sequence = SequenceUtil.cleanSequence(sequence);\r
173                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
174                         return false;\r
175                 }\r
176                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
177                         return false;\r
178                 }\r
179                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
180                         return false;\r
181                 }\r
182                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
183                 return protmatcher.find();\r
184         }\r
185 \r
186         /**\r
187          * Check whether the sequence confirms to amboguous protein sequence\r
188          * \r
189          * @param sequence\r
190          * @return return true only if the sequence if ambiguous protein sequence\r
191          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
192          *         protein or DNA\r
193          */\r
194         public static boolean isAmbiguosProtein(String sequence) {\r
195                 sequence = SequenceUtil.cleanSequence(sequence);\r
196                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
197                         return false;\r
198                 }\r
199                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
200                         return false;\r
201                 }\r
202                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
203                         return false;\r
204                 }\r
205                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
206                         return false;\r
207                 }\r
208                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
209                 return amb_prot.find();\r
210         }\r
211 \r
212         /**\r
213          * Writes list of FastaSequeces into the outstream formatting the sequence\r
214          * so that it contains width chars on each line\r
215          * \r
216          * @param outstream\r
217          * @param sequences\r
218          * @param width\r
219          *            - the maximum number of characters to write in one line\r
220          * @throws IOException\r
221          */\r
222         public static void writeFasta(final OutputStream outstream,\r
223                         final List<FastaSequence> sequences, final int width)\r
224                         throws IOException {\r
225                 writeFastaKeepTheStream(outstream, sequences, width);\r
226                 outstream.close();\r
227         }\r
228 \r
229         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
230                         final List<FastaSequence> sequences, final int width)\r
231                         throws IOException {\r
232                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
233                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
234                 for (final FastaSequence fs : sequences) {\r
235                         fastawriter.write(">" + fs.getId() + "\n");\r
236                         fastawriter.write(fs.getFormatedSequence(width));\r
237                         fastawriter.write("\n");\r
238                 }\r
239                 fastawriter.flush();\r
240                 writer.flush();\r
241         }\r
242 \r
243         /**\r
244          * Reads fasta sequences from inStream into the list of FastaSequence\r
245          * objects\r
246          * \r
247          * @param inStream\r
248          *            from\r
249          * @return list of FastaSequence objects\r
250          * @throws IOException\r
251          */\r
252         public static List<FastaSequence> readFasta(final InputStream inStream)\r
253                         throws IOException {\r
254                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
255 \r
256                 final BufferedReader infasta = new BufferedReader(\r
257                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
258                 final Pattern pattern = Pattern.compile("//s+");\r
259 \r
260                 String line;\r
261                 String sname = "", seqstr = null;\r
262                 do {\r
263                         line = infasta.readLine();\r
264                         if ((line == null) || line.startsWith(">")) {\r
265                                 if (seqstr != null) {\r
266                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
267                                 }\r
268                                 sname = line; // remove >\r
269                                 seqstr = "";\r
270                         } else {\r
271                                 final String subseq = pattern.matcher(line).replaceAll("");\r
272                                 seqstr += subseq;\r
273                         }\r
274                 } while (line != null);\r
275 \r
276                 infasta.close();\r
277                 return seqs;\r
278         }\r
279 \r
280         /**\r
281          * Writes FastaSequence in the file, each sequence will take one line only\r
282          * \r
283          * @param os\r
284          * @param sequences\r
285          * @throws IOException\r
286          */\r
287         public static void writeFasta(final OutputStream os,\r
288                         final List<FastaSequence> sequences) throws IOException {\r
289                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
290                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
291                 for (final FastaSequence fs : sequences) {\r
292                         fasta_out.write(fs.getOnelineFasta());\r
293                 }\r
294                 fasta_out.close();\r
295                 outWriter.close();\r
296         }\r
297 \r
298         public static List<AnnotatedSequence> readJRonn(final File result)\r
299                         throws IOException, UnknownFileFormatException {\r
300                 InputStream input = new FileInputStream(result);\r
301                 List<AnnotatedSequence> sequences = readJRonn(input);\r
302                 input.close();\r
303                 return sequences;\r
304         }\r
305 \r
306         /**\r
307          * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42\r
308          * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited\r
309          * \r
310          * @param inStream\r
311          * @return\r
312          * @throws IOException\r
313          * @throws UnknownFileFormatException\r
314          */\r
315         public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
316                         throws IOException, UnknownFileFormatException {\r
317                 final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
318 \r
319                 final BufferedReader infasta = new BufferedReader(\r
320                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
321 \r
322                 String line;\r
323                 String sname = "";\r
324                 do {\r
325                         line = infasta.readLine();\r
326                         if (line == null || line.isEmpty()) {\r
327                                 // skip empty lines\r
328                                 continue;\r
329                         }\r
330                         if (line.startsWith(">")) {\r
331                                 // read name\r
332                                 sname = line.trim().substring(1);\r
333                                 // read sequence line\r
334                                 line = infasta.readLine();\r
335                                 final String sequence = line.replace("\t", "");\r
336                                 // read annotation line\r
337                                 line = infasta.readLine();\r
338                                 String[] annotValues = line.split("\t");\r
339                                 float[] annotation = convertToNumber(annotValues);\r
340                                 if (annotation.length != sequence.length()) {\r
341                                         throw new UnknownFileFormatException(\r
342                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
343                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
344                                 }\r
345                                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
346                         }\r
347                 } while (line != null);\r
348 \r
349                 infasta.close();\r
350                 return seqs;\r
351         }\r
352 \r
353         private static float[] convertToNumber(String[] annotValues)\r
354                         throws UnknownFileFormatException {\r
355                 float[] annotation = new float[annotValues.length];\r
356                 try {\r
357                         for (int i = 0; i < annotation.length; i++) {\r
358                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
359                         }\r
360                 } catch (NumberFormatException e) {\r
361                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
362                                         e.getCause());\r
363                 }\r
364                 return annotation;\r
365         }\r
366 \r
367         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
368                         + ">sequence_name\n "\r
369                         + "M    V       S\n"\r
370                         + "0.43 0.22    0.65\n"\r
371                         + "Where first line is the sequence name,\n"\r
372                         + "second line is the tab delimited sequence,\n"\r
373                         + "third line contains tab delimited disorder prediction values.\n"\r
374                         + "No lines are allowed between these three. Additionally, the number of  "\r
375                         + "sequence residues must be equal to the number of the disorder values.";\r
376 \r
377         /**\r
378          * Closes the Closable and logs the exception if any\r
379          * \r
380          * @param log\r
381          * @param stream\r
382          */\r
383         public final static void closeSilently(java.util.logging.Logger log,\r
384                         Closeable stream) {\r
385                 if (stream != null) {\r
386                         try {\r
387                                 stream.close();\r
388                         } catch (IOException e) {\r
389                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
390                         }\r
391                 }\r
392         }\r
393 \r
394         /**\r
395          * \r
396          * TODO complete!\r
397          * \r
398          * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983\r
399          * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512\r
400          * 0.37094\r
401          * \r
402          * @param input\r
403          * @return\r
404          * @throws IOException\r
405          * @throws UnknownFileFormatException\r
406          */\r
407         public static List<MultiAnnotatedSequence<DisemblResultAnnot>> readDisembl(\r
408                         final InputStream input) throws IOException,\r
409                         UnknownFileFormatException {\r
410                 Scanner scan = new Scanner(input);\r
411                 scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");\r
412                 if (!scan.hasNext()) {\r
413                         throw new UnknownFileFormatException(\r
414                                         "In Disembl score format each seqeunce score is expected to start from the line: "\r
415                                                         + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."\r
416                                                         + " No such line was found!");\r
417                 }\r
418 \r
419                 List<MultiAnnotatedSequence<DisemblResultAnnot>> results = new ArrayList<MultiAnnotatedSequence<DisemblResultAnnot>>();\r
420                 int seqCounter = 0;\r
421                 while (scan.hasNext()) {\r
422                         seqCounter++;\r
423                         String singleSeq = scan.next();\r
424                         Scanner scansingle = new Scanner(singleSeq);\r
425                         StringBuffer seqbuffer = new StringBuffer();\r
426                         ArrayList<Float> coils = new ArrayList<Float>();\r
427                         ArrayList<Float> rem = new ArrayList<Float>();\r
428                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
429 \r
430                         MultiAnnotatedSequence<DisemblResultAnnot> disemblRes = new MultiAnnotatedSequence<DisemblResultAnnot>(\r
431                                         DisemblResultAnnot.class);\r
432 \r
433                         while (scansingle.hasNextLine()) {\r
434                                 String valueLine = scansingle.nextLine();\r
435                                 Scanner values = new Scanner(valueLine);\r
436                                 seqbuffer.append(values.next());\r
437                                 coils.add(values.nextFloat());\r
438                                 rem.add(values.nextFloat());\r
439                                 hotloops.add(values.nextFloat());\r
440                                 values.close();\r
441                         }\r
442                         disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);\r
443                         disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);\r
444                         disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);\r
445                         // TODO\r
446                         // disemblRes.sequence = seqbuffer.toString();\r
447                         scansingle.close();\r
448                         results.add(disemblRes);\r
449                 }\r
450 \r
451                 input.close();\r
452                 return results;\r
453         }\r
454 \r
455         /**\r
456          * Read AACon result with no alignment files. This method leaves incoming\r
457          * the InputStream results open!\r
458          * \r
459          * @param results\r
460          *            output file of AAConservation\r
461          * @return {@link MultiAnnotatedSequence}\r
462          */\r
463         public static MultiAnnotatedSequence<Method> readResults(InputStream results) {\r
464                 if (results == null) {\r
465                         throw new NullPointerException(\r
466                                         "InputStream with results must be provided");\r
467                 }\r
468                 MultiAnnotatedSequence<Method> annotations = new MultiAnnotatedSequence<Method>(\r
469                                 Method.class);\r
470                 Scanner sc = new Scanner(results);\r
471                 sc.useDelimiter("#");\r
472                 while (sc.hasNext()) {\r
473                         String line = sc.next();\r
474                         int spacePos = line.indexOf(" ");\r
475                         assert spacePos > 0 : "Space is expected as delimited between method "\r
476                                         + "name and values!";\r
477                         String methodLine = line.substring(0, spacePos);\r
478                         Method method = Method.getMethod(methodLine);\r
479                         assert method != null : "Method " + methodLine\r
480                                         + " is not recognized! ";\r
481                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
482                         ArrayList<Float> values = new ArrayList<Float>();\r
483                         while (valuesScanner.hasNextDouble()) {\r
484                                 Double value = valuesScanner.nextDouble();\r
485                                 values.add(value.floatValue());\r
486                         }\r
487                         annotations.addAnnotation(method, values);\r
488                 }\r
489                 return annotations;\r
490         }\r
491 \r
492 }\r