More javadocs
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.BufferedWriter;\r
19 import java.io.Closeable;\r
20 import java.io.File;\r
21 import java.io.FileInputStream;\r
22 import java.io.IOException;\r
23 import java.io.InputStream;\r
24 import java.io.InputStreamReader;\r
25 import java.io.OutputStream;\r
26 import java.io.OutputStreamWriter;\r
27 import java.util.ArrayList;\r
28 import java.util.HashSet;\r
29 import java.util.List;\r
30 import java.util.Scanner;\r
31 import java.util.logging.Level;\r
32 import java.util.regex.Matcher;\r
33 import java.util.regex.Pattern;\r
34 \r
35 /**\r
36  * Utility class for operations on sequences\r
37  * \r
38  * @author Petr Troshin\r
39  * @version 1.0\r
40  */\r
41 public final class SequenceUtil {\r
42 \r
43         /**\r
44          * A whitespace character: [\t\n\x0B\f\r]\r
45          */\r
46         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
47 \r
48         /**\r
49          * A digit\r
50          */\r
51         public static final Pattern DIGIT = Pattern.compile("\\d");\r
52 \r
53         /**\r
54          * Non word\r
55          */\r
56         public static final Pattern NONWORD = Pattern.compile("\\W");\r
57 \r
58         /**\r
59          * Valid Amino acids\r
60          */\r
61         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
62                         Pattern.CASE_INSENSITIVE);\r
63 \r
64         /**\r
65          * inversion of AA pattern\r
66          */\r
67         public static final Pattern NON_AA = Pattern.compile(\r
68                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
69 \r
70         /**\r
71          * Same as AA pattern but with two additional letters - XU\r
72          */\r
73         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
74                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
75 \r
76         /**\r
77          * Nucleotides a, t, g, c, u\r
78          */\r
79         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
80                         Pattern.CASE_INSENSITIVE);\r
81 \r
82         /**\r
83          * Ambiguous nucleotide\r
84          */\r
85         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
86                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
87         /**\r
88          * Non nucleotide\r
89          */\r
90         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
91                         Pattern.CASE_INSENSITIVE);\r
92 \r
93         private SequenceUtil() {\r
94         } // utility class, no instantiation\r
95 \r
96         /*\r
97          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
98          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
99          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
100          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
101          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
102          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
103          * BufferedWriter fasta_out = new BufferedWriter( new\r
104          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
105          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
106          * SysPrefs.newlinechar); fasta_out.close(); }\r
107          */\r
108 \r
109         /**\r
110          * @return true is the sequence contains only letters a,c, t, g, u\r
111          */\r
112         public static boolean isNucleotideSequence(final FastaSequence s) {\r
113                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
114         }\r
115 \r
116         /**\r
117          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
118          * (!) - B char\r
119          */\r
120         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
121                 sequence = SequenceUtil.cleanSequence(sequence);\r
122                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
123                         return false;\r
124                 }\r
125                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
126                         return false;\r
127                         /*\r
128                          * System.out.format("I found the text starting at " +\r
129                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
130                          * nonDNAmatcher.end());\r
131                          */\r
132                 }\r
133                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
134                 return DNAmatcher.find();\r
135         }\r
136 \r
137         /**\r
138          * Removes all whitespace chars in the sequence string\r
139          * \r
140          * @param sequence\r
141          * @return cleaned up sequence\r
142          */\r
143         public static String cleanSequence(String sequence) {\r
144                 assert sequence != null;\r
145                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
146                 sequence = m.replaceAll("").toUpperCase();\r
147                 return sequence;\r
148         }\r
149 \r
150         /**\r
151          * Removes all special characters and digits as well as whitespace chars\r
152          * from the sequence\r
153          * \r
154          * @param sequence\r
155          * @return cleaned up sequence\r
156          */\r
157         public static String deepCleanSequence(String sequence) {\r
158                 sequence = SequenceUtil.cleanSequence(sequence);\r
159                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
160                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
161                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
162                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
163                 return sequence;\r
164         }\r
165 \r
166         /**\r
167          * @param sequence\r
168          * @return true is the sequence is a protein sequence, false overwise\r
169          */\r
170         public static boolean isProteinSequence(String sequence) {\r
171                 sequence = SequenceUtil.cleanSequence(sequence);\r
172                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
173                         return false;\r
174                 }\r
175                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
176                         return false;\r
177                 }\r
178                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
179                         return false;\r
180                 }\r
181                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
182                 return protmatcher.find();\r
183         }\r
184 \r
185         /**\r
186          * Check whether the sequence confirms to amboguous protein sequence\r
187          * \r
188          * @param sequence\r
189          * @return return true only if the sequence if ambiguous protein sequence\r
190          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
191          *         protein or DNA\r
192          */\r
193         public static boolean isAmbiguosProtein(String sequence) {\r
194                 sequence = SequenceUtil.cleanSequence(sequence);\r
195                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
196                         return false;\r
197                 }\r
198                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
199                         return false;\r
200                 }\r
201                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
202                         return false;\r
203                 }\r
204                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
205                         return false;\r
206                 }\r
207                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
208                 return amb_prot.find();\r
209         }\r
210 \r
211         /**\r
212          * Writes list of FastaSequeces into the outstream formatting the sequence\r
213          * so that it contains width chars on each line\r
214          * \r
215          * @param outstream\r
216          * @param sequences\r
217          * @param width\r
218          *            - the maximum number of characters to write in one line\r
219          * @throws IOException\r
220          */\r
221         public static void writeFasta(final OutputStream outstream,\r
222                         final List<FastaSequence> sequences, final int width)\r
223                         throws IOException {\r
224                 writeFastaKeepTheStream(outstream, sequences, width);\r
225                 outstream.close();\r
226         }\r
227 \r
228         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
229                         final List<FastaSequence> sequences, final int width)\r
230                         throws IOException {\r
231                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
232                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
233                 for (final FastaSequence fs : sequences) {\r
234                         fastawriter.write(">" + fs.getId() + "\n");\r
235                         fastawriter.write(fs.getFormatedSequence(width));\r
236                         fastawriter.write("\n");\r
237                 }\r
238                 fastawriter.flush();\r
239                 writer.flush();\r
240         }\r
241 \r
242         /**\r
243          * Reads fasta sequences from inStream into the list of FastaSequence\r
244          * objects\r
245          * \r
246          * @param inStream\r
247          *            from\r
248          * @return list of FastaSequence objects\r
249          * @throws IOException\r
250          */\r
251         public static List<FastaSequence> readFasta(final InputStream inStream)\r
252                         throws IOException {\r
253                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
254 \r
255                 final BufferedReader infasta = new BufferedReader(\r
256                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
257                 final Pattern pattern = Pattern.compile("//s+");\r
258 \r
259                 String line;\r
260                 String sname = "", seqstr = null;\r
261                 do {\r
262                         line = infasta.readLine();\r
263                         if ((line == null) || line.startsWith(">")) {\r
264                                 if (seqstr != null) {\r
265                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
266                                 }\r
267                                 sname = line; // remove >\r
268                                 seqstr = "";\r
269                         } else {\r
270                                 final String subseq = pattern.matcher(line).replaceAll("");\r
271                                 seqstr += subseq;\r
272                         }\r
273                 } while (line != null);\r
274 \r
275                 infasta.close();\r
276                 return seqs;\r
277         }\r
278 \r
279         /**\r
280          * Writes FastaSequence in the file, each sequence will take one line only\r
281          * \r
282          * @param os\r
283          * @param sequences\r
284          * @throws IOException\r
285          */\r
286         public static void writeFasta(final OutputStream os,\r
287                         final List<FastaSequence> sequences) throws IOException {\r
288                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
289                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
290                 for (final FastaSequence fs : sequences) {\r
291                         fasta_out.write(fs.getOnelineFasta());\r
292                 }\r
293                 fasta_out.close();\r
294                 outWriter.close();\r
295         }\r
296 \r
297         public static List<AnnotatedSequence> readJRonn(final File result)\r
298                         throws IOException, UnknownFileFormatException {\r
299                 InputStream input = new FileInputStream(result);\r
300                 List<AnnotatedSequence> sequences = readJRonn(input);\r
301                 input.close();\r
302                 return sequences;\r
303         }\r
304 \r
305         /**\r
306          * Reader for JRonn horizontal file format\r
307          * \r
308          * <pre>\r
309          * &gtFoobar M G D T T A G 0.48 0.42\r
310          * 0.42 0.48 0.52 0.53 0.54\r
311          * \r
312          * <pre>\r
313          * Where all values are tab delimited\r
314          * \r
315          * @param inStream\r
316          *            the InputStream connected to the JRonn output file\r
317          * @return List of {@link AnnotatedSequence} objects\r
318          * @throws IOException\r
319          *             is thrown if the inStream has problems accessing the data\r
320          * @throws UnknownFileFormatException\r
321          *             is thrown if the inStream represents an unknown source of\r
322          * data, i.e. not a JRonn output\r
323          */\r
324         public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
325                         throws IOException, UnknownFileFormatException {\r
326                 final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
327 \r
328                 final BufferedReader infasta = new BufferedReader(\r
329                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
330 \r
331                 String line;\r
332                 String sname = "";\r
333                 do {\r
334                         line = infasta.readLine();\r
335                         if (line == null || line.isEmpty()) {\r
336                                 // skip empty lines\r
337                                 continue;\r
338                         }\r
339                         if (line.startsWith(">")) {\r
340                                 // read name\r
341                                 sname = line.trim().substring(1);\r
342                                 // read sequence line\r
343                                 line = infasta.readLine();\r
344                                 final String sequence = line.replace("\t", "");\r
345                                 // read annotation line\r
346                                 line = infasta.readLine();\r
347                                 String[] annotValues = line.split("\t");\r
348                                 float[] annotation = convertToNumber(annotValues);\r
349                                 if (annotation.length != sequence.length()) {\r
350                                         throw new UnknownFileFormatException(\r
351                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
352                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
353                                 }\r
354                                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
355                         }\r
356                 } while (line != null);\r
357 \r
358                 infasta.close();\r
359                 return seqs;\r
360         }\r
361 \r
362         private static float[] convertToNumber(String[] annotValues)\r
363                         throws UnknownFileFormatException {\r
364                 float[] annotation = new float[annotValues.length];\r
365                 try {\r
366                         for (int i = 0; i < annotation.length; i++) {\r
367                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
368                         }\r
369                 } catch (NumberFormatException e) {\r
370                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
371                                         e.getCause());\r
372                 }\r
373                 return annotation;\r
374         }\r
375 \r
376         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
377                         + ">sequence_name\n "\r
378                         + "M    V       S\n"\r
379                         + "0.43 0.22    0.65\n"\r
380                         + "Where first line is the sequence name,\n"\r
381                         + "second line is the tab delimited sequence,\n"\r
382                         + "third line contains tab delimited disorder prediction values.\n"\r
383                         + "No lines are allowed between these three. Additionally, the number of  "\r
384                         + "sequence residues must be equal to the number of the disorder values.";\r
385 \r
386         /**\r
387          * Closes the Closable and logs the exception if any\r
388          * \r
389          * @param log\r
390          * @param stream\r
391          */\r
392         public final static void closeSilently(java.util.logging.Logger log,\r
393                         Closeable stream) {\r
394                 if (stream != null) {\r
395                         try {\r
396                                 stream.close();\r
397                         } catch (IOException e) {\r
398                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
399                         }\r
400                 }\r
401         }\r
402 \r
403         /**\r
404          * \r
405          * TODO complete!\r
406          * \r
407          * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983\r
408          * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512\r
409          * 0.37094\r
410          * \r
411          * @param input\r
412          * @return\r
413          * @throws IOException\r
414          * @throws UnknownFileFormatException\r
415          */\r
416         static List<MultiAnnotatedSequence<DisemblResultAnnot>> readDisembl(\r
417                         final InputStream input) throws IOException,\r
418                         UnknownFileFormatException {\r
419                 Scanner scan = new Scanner(input);\r
420                 scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");\r
421                 if (!scan.hasNext()) {\r
422                         throw new UnknownFileFormatException(\r
423                                         "In Disembl score format each seqeunce score is expected to start from the line: "\r
424                                                         + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."\r
425                                                         + " No such line was found!");\r
426                 }\r
427 \r
428                 List<MultiAnnotatedSequence<DisemblResultAnnot>> results = new ArrayList<MultiAnnotatedSequence<DisemblResultAnnot>>();\r
429                 int seqCounter = 0;\r
430                 while (scan.hasNext()) {\r
431                         seqCounter++;\r
432                         String singleSeq = scan.next();\r
433                         Scanner scansingle = new Scanner(singleSeq);\r
434                         StringBuffer seqbuffer = new StringBuffer();\r
435                         ArrayList<Float> coils = new ArrayList<Float>();\r
436                         ArrayList<Float> rem = new ArrayList<Float>();\r
437                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
438 \r
439                         MultiAnnotatedSequence<DisemblResultAnnot> disemblRes = new MultiAnnotatedSequence<DisemblResultAnnot>(\r
440                                         DisemblResultAnnot.class);\r
441 \r
442                         while (scansingle.hasNextLine()) {\r
443                                 String valueLine = scansingle.nextLine();\r
444                                 Scanner values = new Scanner(valueLine);\r
445                                 seqbuffer.append(values.next());\r
446                                 coils.add(values.nextFloat());\r
447                                 rem.add(values.nextFloat());\r
448                                 hotloops.add(values.nextFloat());\r
449                                 values.close();\r
450                         }\r
451                         disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);\r
452                         disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);\r
453                         disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);\r
454                         // TODO\r
455                         // disemblRes.sequence = seqbuffer.toString();\r
456                         scansingle.close();\r
457                         results.add(disemblRes);\r
458                 }\r
459 \r
460                 input.close();\r
461                 return results;\r
462         }\r
463 \r
464         /**\r
465          * Read AACon result with no alignment files. This method leaves incoming\r
466          * the InputStream results open!\r
467          * \r
468          * @param results\r
469          *            output file of AAConservation\r
470          * @return Map with keys {@link Method} -> float[]\r
471          */\r
472         public static HashSet<Score> readAAConResults(InputStream results) {\r
473                 if (results == null) {\r
474                         throw new NullPointerException(\r
475                                         "InputStream with results must be provided");\r
476                 }\r
477                 HashSet<Score> annotations = new HashSet<Score>();\r
478                 Scanner sc = new Scanner(results);\r
479                 sc.useDelimiter("#");\r
480                 while (sc.hasNext()) {\r
481                         String line = sc.next();\r
482                         int spacePos = line.indexOf(" ");\r
483                         assert spacePos > 0 : "Space is expected as delimited between method "\r
484                                         + "name and values!";\r
485                         String methodLine = line.substring(0, spacePos);\r
486                         Method method = Method.getMethod(methodLine);\r
487                         assert method != null : "Method " + methodLine\r
488                                         + " is not recognized! ";\r
489                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
490                         ArrayList<Float> values = new ArrayList<Float>();\r
491                         while (valuesScanner.hasNextDouble()) {\r
492                                 Double value = valuesScanner.nextDouble();\r
493                                 values.add(value.floatValue());\r
494                         }\r
495                         annotations.add(new Score(method, values));\r
496                 }\r
497                 return annotations;\r
498         }\r
499 \r
500 }\r