Parser for DisEMBL results and finished runner, few test cases. DisemblTester has...
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.BufferedWriter;\r
19 import java.io.Closeable;\r
20 import java.io.File;\r
21 import java.io.FileInputStream;\r
22 import java.io.IOException;\r
23 import java.io.InputStream;\r
24 import java.io.InputStreamReader;\r
25 import java.io.OutputStream;\r
26 import java.io.OutputStreamWriter;\r
27 import java.util.ArrayList;\r
28 import java.util.HashMap;\r
29 import java.util.HashSet;\r
30 import java.util.List;\r
31 import java.util.Map;\r
32 import java.util.Scanner;\r
33 import java.util.Set;\r
34 import java.util.logging.Level;\r
35 import java.util.regex.Matcher;\r
36 import java.util.regex.Pattern;\r
37 \r
38 /**\r
39  * Utility class for operations on sequences\r
40  * \r
41  * @author Petr Troshin\r
42  * @version 1.0\r
43  */\r
44 public final class SequenceUtil {\r
45 \r
46         /**\r
47          * A whitespace character: [\t\n\x0B\f\r]\r
48          */\r
49         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
50 \r
51         /**\r
52          * A digit\r
53          */\r
54         public static final Pattern DIGIT = Pattern.compile("\\d");\r
55 \r
56         /**\r
57          * Non word\r
58          */\r
59         public static final Pattern NONWORD = Pattern.compile("\\W");\r
60 \r
61         /**\r
62          * Valid Amino acids\r
63          */\r
64         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
65                         Pattern.CASE_INSENSITIVE);\r
66 \r
67         /**\r
68          * inversion of AA pattern\r
69          */\r
70         public static final Pattern NON_AA = Pattern.compile(\r
71                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
72 \r
73         /**\r
74          * Same as AA pattern but with two additional letters - XU\r
75          */\r
76         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
77                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
78 \r
79         /**\r
80          * Nucleotides a, t, g, c, u\r
81          */\r
82         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
83                         Pattern.CASE_INSENSITIVE);\r
84 \r
85         /**\r
86          * Ambiguous nucleotide\r
87          */\r
88         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
89                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
90         /**\r
91          * Non nucleotide\r
92          */\r
93         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
94                         Pattern.CASE_INSENSITIVE);\r
95 \r
96         private SequenceUtil() {\r
97         } // utility class, no instantiation\r
98 \r
99         /*\r
100          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
101          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
102          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
103          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
104          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
105          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
106          * BufferedWriter fasta_out = new BufferedWriter( new\r
107          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
108          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
109          * SysPrefs.newlinechar); fasta_out.close(); }\r
110          */\r
111 \r
112         /**\r
113          * @return true is the sequence contains only letters a,c, t, g, u\r
114          */\r
115         public static boolean isNucleotideSequence(final FastaSequence s) {\r
116                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
117         }\r
118 \r
119         /**\r
120          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
121          * (!) - B char\r
122          */\r
123         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
124                 sequence = SequenceUtil.cleanSequence(sequence);\r
125                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
126                         return false;\r
127                 }\r
128                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
129                         return false;\r
130                         /*\r
131                          * System.out.format("I found the text starting at " +\r
132                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
133                          * nonDNAmatcher.end());\r
134                          */\r
135                 }\r
136                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
137                 return DNAmatcher.find();\r
138         }\r
139 \r
140         /**\r
141          * Removes all whitespace chars in the sequence string\r
142          * \r
143          * @param sequence\r
144          * @return cleaned up sequence\r
145          */\r
146         public static String cleanSequence(String sequence) {\r
147                 assert sequence != null;\r
148                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
149                 sequence = m.replaceAll("").toUpperCase();\r
150                 return sequence;\r
151         }\r
152 \r
153         /**\r
154          * Removes all special characters and digits as well as whitespace chars\r
155          * from the sequence\r
156          * \r
157          * @param sequence\r
158          * @return cleaned up sequence\r
159          */\r
160         public static String deepCleanSequence(String sequence) {\r
161                 sequence = SequenceUtil.cleanSequence(sequence);\r
162                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
163                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
164                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
165                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
166                 return sequence;\r
167         }\r
168 \r
169         /**\r
170          * @param sequence\r
171          * @return true is the sequence is a protein sequence, false overwise\r
172          */\r
173         public static boolean isProteinSequence(String sequence) {\r
174                 sequence = SequenceUtil.cleanSequence(sequence);\r
175                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
176                         return false;\r
177                 }\r
178                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
179                         return false;\r
180                 }\r
181                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
182                         return false;\r
183                 }\r
184                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
185                 return protmatcher.find();\r
186         }\r
187 \r
188         /**\r
189          * Check whether the sequence confirms to amboguous protein sequence\r
190          * \r
191          * @param sequence\r
192          * @return return true only if the sequence if ambiguous protein sequence\r
193          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
194          *         protein or DNA\r
195          */\r
196         public static boolean isAmbiguosProtein(String sequence) {\r
197                 sequence = SequenceUtil.cleanSequence(sequence);\r
198                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
199                         return false;\r
200                 }\r
201                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
202                         return false;\r
203                 }\r
204                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
205                         return false;\r
206                 }\r
207                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
208                         return false;\r
209                 }\r
210                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
211                 return amb_prot.find();\r
212         }\r
213 \r
214         /**\r
215          * Writes list of FastaSequeces into the outstream formatting the sequence\r
216          * so that it contains width chars on each line\r
217          * \r
218          * @param outstream\r
219          * @param sequences\r
220          * @param width\r
221          *            - the maximum number of characters to write in one line\r
222          * @throws IOException\r
223          */\r
224         public static void writeFasta(final OutputStream outstream,\r
225                         final List<FastaSequence> sequences, final int width)\r
226                         throws IOException {\r
227                 writeFastaKeepTheStream(outstream, sequences, width);\r
228                 outstream.close();\r
229         }\r
230 \r
231         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
232                         final List<FastaSequence> sequences, final int width)\r
233                         throws IOException {\r
234                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
235                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
236                 for (final FastaSequence fs : sequences) {\r
237                         fastawriter.write(">" + fs.getId() + "\n");\r
238                         fastawriter.write(fs.getFormatedSequence(width));\r
239                         fastawriter.write("\n");\r
240                 }\r
241                 fastawriter.flush();\r
242                 writer.flush();\r
243         }\r
244 \r
245         /**\r
246          * Reads fasta sequences from inStream into the list of FastaSequence\r
247          * objects\r
248          * \r
249          * @param inStream\r
250          *            from\r
251          * @return list of FastaSequence objects\r
252          * @throws IOException\r
253          */\r
254         public static List<FastaSequence> readFasta(final InputStream inStream)\r
255                         throws IOException {\r
256                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
257 \r
258                 final BufferedReader infasta = new BufferedReader(\r
259                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
260                 final Pattern pattern = Pattern.compile("//s+");\r
261 \r
262                 String line;\r
263                 String sname = "", seqstr = null;\r
264                 do {\r
265                         line = infasta.readLine();\r
266                         if ((line == null) || line.startsWith(">")) {\r
267                                 if (seqstr != null) {\r
268                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
269                                 }\r
270                                 sname = line; // remove >\r
271                                 seqstr = "";\r
272                         } else {\r
273                                 final String subseq = pattern.matcher(line).replaceAll("");\r
274                                 seqstr += subseq;\r
275                         }\r
276                 } while (line != null);\r
277 \r
278                 infasta.close();\r
279                 return seqs;\r
280         }\r
281 \r
282         /**\r
283          * Writes FastaSequence in the file, each sequence will take one line only\r
284          * \r
285          * @param os\r
286          * @param sequences\r
287          * @throws IOException\r
288          */\r
289         public static void writeFasta(final OutputStream os,\r
290                         final List<FastaSequence> sequences) throws IOException {\r
291                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
292                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
293                 for (final FastaSequence fs : sequences) {\r
294                         fasta_out.write(fs.getOnelineFasta());\r
295                 }\r
296                 fasta_out.close();\r
297                 outWriter.close();\r
298         }\r
299 \r
300         public static Map<String, Score> readJRonn(final File result)\r
301                         throws IOException, UnknownFileFormatException {\r
302                 InputStream input = new FileInputStream(result);\r
303                 Map<String, Score> sequences = readJRonn(input);\r
304                 input.close();\r
305                 return sequences;\r
306         }\r
307 \r
308         /**\r
309          * Reader for JRonn horizontal file format\r
310          * \r
311          * <pre>\r
312          * &gtFoobar M G D T T A G 0.48 0.42\r
313          * 0.42 0.48 0.52 0.53 0.54\r
314          * \r
315          * <pre>\r
316          * Where all values are tab delimited\r
317          * \r
318          * @param inStream\r
319          *            the InputStream connected to the JRonn output file\r
320          * @return List of {@link AnnotatedSequence} objects\r
321          * @throws IOException\r
322          *             is thrown if the inStream has problems accessing the data\r
323          * @throws UnknownFileFormatException\r
324          *             is thrown if the inStream represents an unknown source of\r
325          * data, i.e. not a JRonn output\r
326          */\r
327         public static Map<String, Score> readJRonn(final InputStream inStream)\r
328                         throws IOException, UnknownFileFormatException {\r
329                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
330 \r
331                 final BufferedReader infasta = new BufferedReader(\r
332                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
333 \r
334                 String line;\r
335                 String sname = "";\r
336                 do {\r
337                         line = infasta.readLine();\r
338                         if (line == null || line.isEmpty()) {\r
339                                 // skip empty lines\r
340                                 continue;\r
341                         }\r
342                         if (line.startsWith(">")) {\r
343                                 // read name\r
344                                 sname = line.trim().substring(1);\r
345                                 // read sequence line\r
346                                 line = infasta.readLine();\r
347                                 final String sequence = line.replace("\t", "");\r
348                                 // read annotation line\r
349                                 line = infasta.readLine();\r
350                                 String[] annotValues = line.split("\t");\r
351                                 float[] annotation = convertToNumber(annotValues);\r
352                                 if (annotation.length != sequence.length()) {\r
353                                         throw new UnknownFileFormatException(\r
354                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
355                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
356                                 }\r
357                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
358                         }\r
359                 } while (line != null);\r
360 \r
361                 infasta.close();\r
362                 return seqs;\r
363         }\r
364         private static float[] convertToNumber(String[] annotValues)\r
365                         throws UnknownFileFormatException {\r
366                 float[] annotation = new float[annotValues.length];\r
367                 try {\r
368                         for (int i = 0; i < annotation.length; i++) {\r
369                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
370                         }\r
371                 } catch (NumberFormatException e) {\r
372                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
373                                         e.getCause());\r
374                 }\r
375                 return annotation;\r
376         }\r
377 \r
378         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
379                         + ">sequence_name\n "\r
380                         + "M    V       S\n"\r
381                         + "0.43 0.22    0.65\n"\r
382                         + "Where first line is the sequence name,\n"\r
383                         + "second line is the tab delimited sequence,\n"\r
384                         + "third line contains tab delimited disorder prediction values.\n"\r
385                         + "No lines are allowed between these three. Additionally, the number of  "\r
386                         + "sequence residues must be equal to the number of the disorder values.";\r
387 \r
388         /**\r
389          * Closes the Closable and logs the exception if any\r
390          * \r
391          * @param log\r
392          * @param stream\r
393          */\r
394         public final static void closeSilently(java.util.logging.Logger log,\r
395                         Closeable stream) {\r
396                 if (stream != null) {\r
397                         try {\r
398                                 stream.close();\r
399                         } catch (IOException e) {\r
400                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
401                         }\r
402                 }\r
403         }\r
404 \r
405         /**\r
406          * \r
407          * TODO complete!\r
408          * \r
409          * >Sequence name\r
410          * \r
411          * RESIDUE COILS REM465 HOTLOOPS\r
412          * \r
413          * M 0.86010 0.88512 0.37094\r
414          * \r
415          * T 0.79983 0.85864 0.44331 ....\r
416 \r
417          * >Next Sequence name \r
418          * RESIDUE COILS REM465 HOTLOOPS\r
419          * \r
420          * M 0.86010 0.88512 0.37094\r
421          * \r
422          * \r
423          * @param input\r
424          * @return\r
425          * @throws IOException\r
426          * @throws UnknownFileFormatException\r
427          */\r
428         public static Map<FastaSequence, Set<Score>> readDisembl(final InputStream input)\r
429                         throws IOException, UnknownFileFormatException {\r
430                 Scanner scan = new Scanner(input);\r
431                 scan.useDelimiter(">");\r
432                 if (!scan.hasNext()) {\r
433                         throw new UnknownFileFormatException(\r
434                                         "In Disembl score format each sequence score is expected " +\r
435                                         "to start from the line: >Sequence name "\r
436                                                         + " No such line was found!");\r
437                 }\r
438 \r
439                 Map<FastaSequence, Set<Score>> results = new HashMap<FastaSequence, Set<Score>>();\r
440                 int seqCounter = 0;\r
441                 while (scan.hasNext()) {\r
442                         seqCounter++;\r
443                         String singleSeq = scan.next();\r
444                         Scanner scansingle = new Scanner(singleSeq);\r
445                         if(!scansingle.hasNextLine()) {\r
446                                 throw new RuntimeException("The input looks like an incomplete disembl file - cannot parse!");\r
447                         }\r
448                         \r
449                         StringBuffer seqbuffer = new StringBuffer();\r
450                         ArrayList<Float> coils = new ArrayList<Float>();\r
451                         ArrayList<Float> rem = new ArrayList<Float>();\r
452                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
453 \r
454                         String sequenceName = scansingle.nextLine().trim();\r
455                         String title =  scansingle.nextLine();\r
456                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
457                         \r
458                         while (scansingle.hasNext()) {\r
459                                 seqbuffer.append(scansingle.next());\r
460                                 coils.add(scansingle.nextFloat());\r
461                                 rem.add(scansingle.nextFloat());\r
462                                 hotloops.add(scansingle.nextFloat());\r
463                         }\r
464                         FastaSequence fs = new FastaSequence(sequenceName,seqbuffer.toString());\r
465                         Set<Score> scores = new HashSet<Score>();\r
466                         scores.add(new Score(DisemblResultAnnot.COILS, coils));\r
467                         scores.add(new Score(DisemblResultAnnot.HOTLOOPS, hotloops));\r
468                         scores.add(new Score(DisemblResultAnnot.REM465, rem));\r
469                         results.put(fs, scores);\r
470 \r
471                         scansingle.close();\r
472                 }\r
473 \r
474                 input.close();\r
475                 return results;\r
476         }\r
477         \r
478         public static  Map<String, Set<Score>> removeSequences(Map<FastaSequence, Set<Score>> disemblResults) { \r
479                 Map<String, Set<Score>> seqNameScores = new HashMap<String, Set<Score>>();\r
480                 for(Map.Entry<FastaSequence,Set<Score>> dres: disemblResults.entrySet()) {\r
481                         seqNameScores.put(dres.getKey().getId(),dres.getValue()); \r
482                 }\r
483                 return seqNameScores;\r
484         }\r
485         \r
486         /**\r
487          * Read AACon result with no alignment files. This method leaves incoming\r
488          * the InputStream results open!\r
489          * \r
490          * @param results\r
491          *            output file of AAConservation\r
492          * @return Map with keys {@link ConservationMethod} -> float[]\r
493          */\r
494         public static HashSet<Score> readAAConResults(InputStream results) {\r
495                 if (results == null) {\r
496                         throw new NullPointerException(\r
497                                         "InputStream with results must be provided");\r
498                 }\r
499                 HashSet<Score> annotations = new HashSet<Score>();\r
500                 Scanner sc = new Scanner(results);\r
501                 sc.useDelimiter("#");\r
502                 while (sc.hasNext()) {\r
503                         String line = sc.next();\r
504                         int spacePos = line.indexOf(" ");\r
505                         assert spacePos > 0 : "Space is expected as delimited between method "\r
506                                         + "name and values!";\r
507                         String methodLine = line.substring(0, spacePos);\r
508                         ConservationMethod method = ConservationMethod\r
509                                         .getMethod(methodLine);\r
510                         assert method != null : "Method " + methodLine\r
511                                         + " is not recognized! ";\r
512                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
513                         ArrayList<Float> values = new ArrayList<Float>();\r
514                         while (valuesScanner.hasNextDouble()) {\r
515                                 Double value = valuesScanner.nextDouble();\r
516                                 values.add(value.floatValue());\r
517                         }\r
518                         annotations.add(new Score(method, values));\r
519                 }\r
520                 return annotations;\r
521         }\r
522 \r
523         /**\r
524          * Reads and parses Fasta or Clustal formatted file into a list of\r
525          * FastaSequence objects\r
526          * \r
527          * @param inFilePath\r
528          *            the path to the input file\r
529          * @throws IOException\r
530          *             if the file denoted by inFilePath cannot be read\r
531          * @throws UnknownFileFormatException\r
532          *             if the inFilePath points to the file which format cannot be\r
533          *             recognised\r
534          * @return the List of FastaSequence objects\r
535          * \r
536          */\r
537         public static List<FastaSequence> openInputStream(String inFilePath)\r
538                         throws IOException, UnknownFileFormatException {\r
539 \r
540                 // This stream gets closed in isValidClustalFile method\r
541                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
542                 // This stream is closed in the calling methods\r
543                 InputStream inStr = new FileInputStream(inFilePath);\r
544                 List<FastaSequence> fastaSeqs = null;\r
545                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
546                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
547                         // alignment cannot be null see\r
548                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
549                         fastaSeqs = al.getSequences();\r
550                 } else {\r
551                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
552                 }\r
553                 return fastaSeqs;\r
554         }\r
555 \r
556 }\r