Javadoc fixes
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0     \r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 \r
19 package compbio.data.sequence;\r
20 \r
21 import java.io.BufferedReader;\r
22 import java.io.BufferedWriter;\r
23 import java.io.Closeable;\r
24 import java.io.File;\r
25 import java.io.FileInputStream;\r
26 import java.io.IOException;\r
27 import java.io.InputStream;\r
28 import java.io.InputStreamReader;\r
29 import java.io.OutputStream;\r
30 import java.io.OutputStreamWriter;\r
31 import java.util.ArrayList;\r
32 import java.util.HashMap;\r
33 import java.util.HashSet;\r
34 import java.util.List;\r
35 import java.util.Map;\r
36 import java.util.Scanner;\r
37 import java.util.Set;\r
38 import java.util.TreeSet;\r
39 import java.util.logging.Level;\r
40 import java.util.regex.Matcher;\r
41 import java.util.regex.Pattern;\r
42 \r
43 import compbio.util.Util;\r
44 \r
45 /**\r
46  * Utility class for operations on sequences\r
47  * \r
48  * @author Peter Troshin\r
49  * @since 1.0\r
50  * @version 2.0 June 2011\r
51  */\r
52 public final class SequenceUtil {\r
53 \r
54         /**\r
55          * A whitespace character: [\t\n\x0B\f\r]\r
56          */\r
57         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
58 \r
59         /**\r
60          * A digit\r
61          */\r
62         public static final Pattern DIGIT = Pattern.compile("\\d");\r
63 \r
64         /**\r
65          * Non word\r
66          */\r
67         public static final Pattern NONWORD = Pattern.compile("\\W");\r
68 \r
69         /**\r
70          * Valid Amino acids\r
71          */\r
72         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
73                         Pattern.CASE_INSENSITIVE);\r
74 \r
75         /**\r
76          * inversion of AA pattern\r
77          */\r
78         public static final Pattern NON_AA = Pattern.compile(\r
79                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
80 \r
81         /**\r
82          * Same as AA pattern but with two additional letters - XU\r
83          */\r
84         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
85                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
86 \r
87         /**\r
88          * Nucleotides a, t, g, c, u\r
89          */\r
90         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
91                         Pattern.CASE_INSENSITIVE);\r
92 \r
93         /**\r
94          * Ambiguous nucleotide\r
95          */\r
96         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
97                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
98         /**\r
99          * Non nucleotide\r
100          */\r
101         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
102                         Pattern.CASE_INSENSITIVE);\r
103 \r
104         private SequenceUtil() {\r
105         } // utility class, no instantiation\r
106 \r
107         /**\r
108          * @return true is the sequence contains only letters a,c, t, g, u\r
109          */\r
110         public static boolean isNucleotideSequence(final FastaSequence s) {\r
111                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
112         }\r
113 \r
114         /**\r
115          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
116          * (!) - B char\r
117          */\r
118         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
119                 sequence = SequenceUtil.cleanSequence(sequence);\r
120                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
121                         return false;\r
122                 }\r
123                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
124                         return false;\r
125                         /*\r
126                          * System.out.format("I found the text starting at " +\r
127                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
128                          * nonDNAmatcher.end());\r
129                          */\r
130                 }\r
131                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
132                 return DNAmatcher.find();\r
133         }\r
134 \r
135         /**\r
136          * Removes all whitespace chars in the sequence string\r
137          * \r
138          * @param sequence\r
139          * @return cleaned up sequence\r
140          */\r
141         public static String cleanSequence(String sequence) {\r
142                 assert sequence != null;\r
143                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
144                 sequence = m.replaceAll("").toUpperCase();\r
145                 return sequence;\r
146         }\r
147 \r
148         /**\r
149          * Removes all special characters and digits as well as whitespace chars\r
150          * from the sequence\r
151          * \r
152          * @param sequence\r
153          * @return cleaned up sequence\r
154          */\r
155         public static String deepCleanSequence(String sequence) {\r
156                 sequence = SequenceUtil.cleanSequence(sequence);\r
157                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
158                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
159                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
160                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
161                 return sequence;\r
162         }\r
163 \r
164         /**\r
165          * Remove all non AA chars from the sequence\r
166          * \r
167          * @param sequence\r
168          *            the sequence to clean\r
169          * @return cleaned sequence\r
170          */\r
171         public static String cleanProteinSequence(String sequence) {\r
172                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
173         }\r
174 \r
175         /**\r
176          * @param sequence\r
177          * @return true is the sequence is a protein sequence, false overwise\r
178          */\r
179         public static boolean isProteinSequence(String sequence) {\r
180                 sequence = SequenceUtil.cleanSequence(sequence);\r
181                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
182                         return false;\r
183                 }\r
184                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
185                         return false;\r
186                 }\r
187                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
188                         return false;\r
189                 }\r
190                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
191                 return protmatcher.find();\r
192         }\r
193 \r
194         /**\r
195          * Check whether the sequence confirms to amboguous protein sequence\r
196          * \r
197          * @param sequence\r
198          * @return return true only if the sequence if ambiguous protein sequence\r
199          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
200          *         protein or DNA\r
201          */\r
202         public static boolean isAmbiguosProtein(String sequence) {\r
203                 sequence = SequenceUtil.cleanSequence(sequence);\r
204                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
205                         return false;\r
206                 }\r
207                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
208                         return false;\r
209                 }\r
210                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
211                         return false;\r
212                 }\r
213                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
214                         return false;\r
215                 }\r
216                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
217                 return amb_prot.find();\r
218         }\r
219 \r
220         /**\r
221          * Writes list of FastaSequeces into the outstream formatting the sequence\r
222          * so that it contains width chars on each line\r
223          * \r
224          * @param outstream\r
225          * @param sequences\r
226          * @param width\r
227          *            - the maximum number of characters to write in one line\r
228          * @throws IOException\r
229          */\r
230         public static void writeFasta(final OutputStream outstream,\r
231                         final List<FastaSequence> sequences, final int width)\r
232                         throws IOException {\r
233                 writeFastaKeepTheStream(outstream, sequences, width);\r
234                 outstream.close();\r
235         }\r
236 \r
237         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
238                         final List<FastaSequence> sequences, final int width)\r
239                         throws IOException {\r
240                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
241                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
242                 for (final FastaSequence fs : sequences) {\r
243                         fastawriter.write(">" + fs.getId() + "\n");\r
244                         fastawriter.write(fs.getFormatedSequence(width));\r
245                         fastawriter.write("\n");\r
246                 }\r
247                 fastawriter.flush();\r
248                 writer.flush();\r
249         }\r
250 \r
251         /**\r
252          * Reads fasta sequences from inStream into the list of FastaSequence\r
253          * objects\r
254          * \r
255          * @param inStream\r
256          *            from\r
257          * @return list of FastaSequence objects\r
258          * @throws IOException\r
259          */\r
260         public static List<FastaSequence> readFasta(final InputStream inStream)\r
261                         throws IOException {\r
262                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
263                 FastaReader reader = new FastaReader(inStream);\r
264                 while (reader.hasNext()) {\r
265                         seqs.add(reader.next());\r
266                 }\r
267                 inStream.close();\r
268                 return seqs;\r
269         }\r
270 \r
271         /**\r
272          * Writes FastaSequence in the file, each sequence will take one line only\r
273          * \r
274          * @param os\r
275          * @param sequences\r
276          * @throws IOException\r
277          */\r
278         public static void writeFasta(final OutputStream os,\r
279                         final List<FastaSequence> sequences) throws IOException {\r
280                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
281                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
282                 for (final FastaSequence fs : sequences) {\r
283                         fasta_out.write(fs.getOnelineFasta());\r
284                 }\r
285                 fasta_out.close();\r
286                 outWriter.close();\r
287         }\r
288 \r
289         /**\r
290          * Read IUPred output\r
291          * \r
292          * @param result\r
293          * @return Map key->sequence name, value->Score\r
294          * @throws IOException\r
295          * @throws UnknownFileFormatException\r
296          */\r
297         public static Map<String, Score> readIUPred(final File result)\r
298                         throws IOException, UnknownFileFormatException {\r
299                 InputStream input = new FileInputStream(result);\r
300                 Map<String, Score> sequences = readIUPred(input,\r
301                                 IUPredResult.getType(result));\r
302                 input.close();\r
303                 return sequences;\r
304         }\r
305 \r
306         // Check the type of the file e.g. long| short or domain\r
307         // and read\r
308         /**\r
309          * ## Long Disorder\r
310          * \r
311          * # P53_HUMAN\r
312          * \r
313          * 1 M 0.9943\r
314          * \r
315          * 2 E 0.9917\r
316          * \r
317          * 3 E 0.9879\r
318          * \r
319          * (every line)\r
320          * \r
321          * @throws IOException\r
322          * @throws UnknownFileFormatException\r
323          * \r
324          * \r
325          */\r
326         private static Map<String, Score> readIUPred(InputStream input,\r
327                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
328 \r
329                 Score score = null;\r
330                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
331                 Scanner scan = new Scanner(input);\r
332                 scan.useDelimiter("#");\r
333                 while (scan.hasNext()) {\r
334                         String nextEntry = scan.next();\r
335                         Scanner entry = new Scanner(nextEntry);\r
336                         String name = entry.nextLine().trim();\r
337                         // inside entry:\r
338                         if (IUPredResult.Glob == type) {\r
339                                 // parse domains\r
340                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
341                                 score = new Score(type, ranges);\r
342                         } else {\r
343                                 // parse short | long\r
344                                 float[] scores = parseIUPredScores(entry);\r
345                                 score = new Score(type, scores);\r
346                         }\r
347                         entry.close();\r
348                         seqs.put(name, score);\r
349                 }\r
350 \r
351                 scan.close();\r
352                 return seqs;\r
353         }\r
354 \r
355         /**\r
356          * # P53_HUMA\r
357          * \r
358          * Number of globular domains: 2\r
359          * \r
360          * globular domain 1. 98 - 269\r
361          * \r
362          * globular domain 2. 431 - 482\r
363          * \r
364          * >P53_HUMA\r
365          * \r
366          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
367          * \r
368          * @param scan\r
369          */\r
370         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
371                 String header = "Number of globular domains:";\r
372                 String domainPref = "globular domain";\r
373                 TreeSet<Range> ranges = new TreeSet<Range>();\r
374                 String line = scan.nextLine().trim();\r
375                 assert line.startsWith(header);\r
376                 line = line.substring(header.length()).trim();\r
377                 int domainNum = Integer.parseInt(line);\r
378                 if (domainNum == 0) {\r
379                         return ranges;\r
380                 }\r
381 \r
382                 for (int i = 0; i < domainNum; i++) {\r
383                         assert scan.hasNextLine();\r
384                         line = scan.nextLine();\r
385                         assert line.trim().startsWith(domainPref);\r
386                         line = line.substring(line.indexOf(".") + 1).trim();\r
387                         Range r = new Range(line.split("-"));\r
388                         ranges.add(r);\r
389                 }\r
390 \r
391                 return ranges;\r
392         }\r
393         /*\r
394          * 1 M 0.9943\r
395          * \r
396          * 2 E 0.9917\r
397          */\r
398         private static float[] parseIUPredScores(Scanner scan)\r
399                         throws UnknownFileFormatException {\r
400                 List<String> annotation = new ArrayList<String>();\r
401                 while (scan.hasNextLine()) {\r
402                         String line = scan.nextLine().trim();\r
403                         String[] val = line.split("\\s+");\r
404                         annotation.add(val[2]);\r
405                 }\r
406                 return convertToNumber(annotation\r
407                                 .toArray(new String[annotation.size()]));\r
408         }\r
409 \r
410         public static Map<String, Score> readJRonn(final File result)\r
411                         throws IOException, UnknownFileFormatException {\r
412                 InputStream input = new FileInputStream(result);\r
413                 Map<String, Score> sequences = readJRonn(input);\r
414                 input.close();\r
415                 return sequences;\r
416         }\r
417 \r
418         /**\r
419          * Reader for JRonn horizontal file format\r
420          * \r
421          * <pre>\r
422          * &gtFoobar M G D T T A G 0.48 0.42\r
423          * 0.42 0.48 0.52 0.53 0.54\r
424          * \r
425          * <pre>\r
426          * Where all values are tab delimited\r
427          * \r
428          * @param inStream\r
429          *            the InputStream connected to the JRonn output file\r
430          * @return Map key=sequence name value=Score\r
431          * @throws IOException\r
432          *             is thrown if the inStream has problems accessing the data\r
433          * @throws UnknownFileFormatException\r
434          *             is thrown if the inStream represents an unknown source of\r
435          * data, i.e. not a JRonn output\r
436          */\r
437         public static Map<String, Score> readJRonn(final InputStream inStream)\r
438                         throws IOException, UnknownFileFormatException {\r
439                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
440 \r
441                 final BufferedReader infasta = new BufferedReader(\r
442                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
443 \r
444                 String line;\r
445                 String sname = "";\r
446                 do {\r
447                         line = infasta.readLine();\r
448                         if (line == null || line.isEmpty()) {\r
449                                 // skip empty lines\r
450                                 continue;\r
451                         }\r
452                         if (line.startsWith(">")) {\r
453                                 // read name\r
454                                 sname = line.trim().substring(1);\r
455                                 // read sequence line\r
456                                 line = infasta.readLine();\r
457                                 final String sequence = line.replace("\t", "");\r
458                                 // read annotation line\r
459                                 line = infasta.readLine();\r
460                                 String[] annotValues = line.split("\t");\r
461                                 float[] annotation = convertToNumber(annotValues);\r
462                                 if (annotation.length != sequence.length()) {\r
463                                         throw new UnknownFileFormatException(\r
464                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
465                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
466                                 }\r
467                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
468                         }\r
469                 } while (line != null);\r
470 \r
471                 infasta.close();\r
472                 return seqs;\r
473         }\r
474 \r
475         private static float[] convertToNumber(String[] annotValues)\r
476                         throws UnknownFileFormatException {\r
477                 float[] annotation = new float[annotValues.length];\r
478                 try {\r
479                         for (int i = 0; i < annotation.length; i++) {\r
480                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
481                         }\r
482                 } catch (NumberFormatException e) {\r
483                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
484                                         e.getCause());\r
485                 }\r
486                 return annotation;\r
487         }\r
488 \r
489         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
490                         + ">sequence_name\n "\r
491                         + "M    V       S\n"\r
492                         + "0.43 0.22    0.65\n"\r
493                         + "Where first line is the sequence name,\n"\r
494                         + "second line is the tab delimited sequence,\n"\r
495                         + "third line contains tab delimited disorder prediction values.\n"\r
496                         + "No lines are allowed between these three. Additionally, the number of  "\r
497                         + "sequence residues must be equal to the number of the disorder values.";\r
498 \r
499         /**\r
500          * Closes the Closable and logs the exception if any\r
501          * \r
502          * @param log\r
503          * @param stream\r
504          */\r
505         public final static void closeSilently(java.util.logging.Logger log,\r
506                         Closeable stream) {\r
507                 if (stream != null) {\r
508                         try {\r
509                                 stream.close();\r
510                         } catch (IOException e) {\r
511                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
512                         }\r
513                 }\r
514         }\r
515 \r
516         /**\r
517          * \r
518          > Foobar_dundeefriends\r
519          * \r
520          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
521          * \r
522          * # REM465 355-368\r
523          * \r
524          * # HOTLOOPS 190-204\r
525          * \r
526          * # RESIDUE COILS REM465 HOTLOOPS\r
527          * \r
528          * M 0.86010 0.88512 0.37094\r
529          * \r
530          * T 0.79983 0.85864 0.44331\r
531          * \r
532          * >Next Sequence name\r
533          * \r
534          * \r
535          * @param input\r
536          *            the InputStream\r
537          * @return Map key=sequence name, value=set of score\r
538          * @throws IOException\r
539          * @throws UnknownFileFormatException\r
540          */\r
541         public static HashMap<String, Set<Score>> readDisembl(\r
542                         final InputStream input) throws IOException,\r
543                         UnknownFileFormatException {\r
544                 Scanner scan = new Scanner(input);\r
545                 scan.useDelimiter(">");\r
546                 if (!scan.hasNext()) {\r
547                         throw new UnknownFileFormatException(\r
548                                         "In Disembl score format each sequence score is expected "\r
549                                                         + "to start from the line: >Sequence name "\r
550                                                         + " No such line was found!");\r
551                 }\r
552 \r
553                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
554                 int seqCounter = 0;\r
555                 while (scan.hasNext()) {\r
556                         seqCounter++;\r
557                         String singleSeq = scan.next();\r
558                         Scanner scansingle = new Scanner(singleSeq);\r
559                         if (!scansingle.hasNextLine()) {\r
560                                 throw new RuntimeException(\r
561                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
562                         }\r
563 \r
564                         StringBuffer seqbuffer = new StringBuffer();\r
565                         ArrayList<Float> coils = new ArrayList<Float>();\r
566                         ArrayList<Float> rem = new ArrayList<Float>();\r
567                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
568 \r
569                         String sequenceName = scansingle.nextLine().trim();\r
570                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
571                                         scansingle.nextLine());\r
572                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
573                                         scansingle.nextLine());\r
574                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
575                                         scansingle.nextLine());\r
576 \r
577                         String title = scansingle.nextLine();\r
578                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
579 \r
580                         while (scansingle.hasNext()) {\r
581                                 seqbuffer.append(scansingle.next());\r
582                                 coils.add(scansingle.nextFloat());\r
583                                 rem.add(scansingle.nextFloat());\r
584                                 hotloops.add(scansingle.nextFloat());\r
585                         }\r
586                         /*\r
587                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
588                          * seqbuffer.toString());\r
589                          */\r
590                         HashSet<Score> scores = new HashSet<Score>();\r
591                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
592                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
593                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
594                         results.put(sequenceName, scores);\r
595 \r
596                         scansingle.close();\r
597                 }\r
598                 scan.close();\r
599                 input.close();\r
600                 return results;\r
601         }\r
602 \r
603         /**\r
604          * Parsing:\r
605          * \r
606          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
607          * 350-391, 429-485, 497-506, 539-547\r
608          * \r
609          * # REM465 355-368\r
610          * \r
611          * # HOTLOOPS 190-204\r
612          * \r
613          * @param lines\r
614          * @return\r
615          */\r
616         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
617                 TreeSet<Range> ranges = new TreeSet<Range>();\r
618 \r
619                 Scanner scan = new Scanner(lines);\r
620 \r
621                 assert scan.hasNext();\r
622                 String del = scan.next();\r
623                 assert "#".equals(del); // pass delimiter #\r
624                 String type = scan.next(); // pass enum name e.g. COILS\r
625                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
626                                 + resultType.toString();\r
627 \r
628                 // beginning of the ranges\r
629                 scan.useDelimiter(",");\r
630                 while (scan.hasNext()) {\r
631                         String range = scan.next();\r
632                         if (!Util.isEmpty(range)) {\r
633                                 ranges.add(new Range(range.split("-")));\r
634                         }\r
635                 }\r
636                 return ranges;\r
637         }\r
638 \r
639         /**\r
640          * \r
641          > Foobar_dundeefriends\r
642          * \r
643          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
644          * \r
645          * # REM465 355-368\r
646          * \r
647          * # HOTLOOPS 190-204\r
648          * \r
649          * # RESIDUE COILS REM465 HOTLOOPS\r
650          * \r
651          * M 0.86010 0.88512 0.37094\r
652          * \r
653          * T 0.79983 0.85864 0.44331\r
654          * \r
655          * >Next Sequence name\r
656          * \r
657          * \r
658          * @param input\r
659          * @return Map key=sequence name, value=set of score\r
660          * @throws IOException\r
661          * @throws UnknownFileFormatException\r
662          */\r
663         public static HashMap<String, Set<Score>> readGlobPlot(\r
664                         final InputStream input) throws IOException,\r
665                         UnknownFileFormatException {\r
666                 Scanner scan = new Scanner(input);\r
667                 scan.useDelimiter(">");\r
668                 if (!scan.hasNext()) {\r
669                         throw new UnknownFileFormatException(\r
670                                         "In GlobPlot score format each sequence score is expected "\r
671                                                         + "to start from the line: >Sequence name "\r
672                                                         + " No such line was found!");\r
673                 }\r
674 \r
675                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
676                 int seqCounter = 0;\r
677                 while (scan.hasNext()) {\r
678                         seqCounter++;\r
679                         String singleSeq = scan.next();\r
680                         Scanner scansingle = new Scanner(singleSeq);\r
681                         if (!scansingle.hasNextLine()) {\r
682                                 throw new RuntimeException(\r
683                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
684                         }\r
685 \r
686                         StringBuffer seqbuffer = new StringBuffer();\r
687                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
688                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
689                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
690 \r
691                         String sequenceName = scansingle.nextLine().trim();\r
692                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
693                                         scansingle.nextLine());\r
694                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
695                                         scansingle.nextLine());\r
696 \r
697                         String title = scansingle.nextLine();\r
698                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
699 \r
700                         while (scansingle.hasNext()) {\r
701                                 seqbuffer.append(scansingle.next());\r
702                                 dydxScore.add(scansingle.nextFloat());\r
703                                 rawScore.add(scansingle.nextFloat());\r
704                                 smoothedScore.add(scansingle.nextFloat());\r
705                         }\r
706                         /*\r
707                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
708                          * seqbuffer.toString());\r
709                          */\r
710                         Set<Score> scores = new TreeSet<Score>();\r
711                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
712                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
713                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
714                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
715                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
716                         results.put(sequenceName, scores);\r
717 \r
718                         scansingle.close();\r
719                 }\r
720                 scan.close();\r
721                 input.close();\r
722                 return results;\r
723         }\r
724         /**\r
725          * Read AACon result with no alignment files. This method leaves incoming\r
726          * InputStream open!\r
727          * \r
728          * @param results\r
729          *            output file of AAConservation\r
730          * @return Map with keys {@link ConservationMethod} -> float[]\r
731          */\r
732         public static HashSet<Score> readAAConResults(InputStream results) {\r
733                 if (results == null) {\r
734                         throw new NullPointerException(\r
735                                         "InputStream with results must be provided");\r
736                 }\r
737                 HashSet<Score> annotations = new HashSet<Score>();\r
738                 Scanner sc = new Scanner(results);\r
739                 sc.useDelimiter("#");\r
740                 while (sc.hasNext()) {\r
741                         String line = sc.next();\r
742                         int spacePos = line.indexOf(" ");\r
743                         assert spacePos > 0 : "Space is expected as delimited between method "\r
744                                         + "name and values!";\r
745                         String methodLine = line.substring(0, spacePos);\r
746                         ConservationMethod method = ConservationMethod\r
747                                         .getMethod(methodLine);\r
748                         assert method != null : "Method " + methodLine\r
749                                         + " is not recognized! ";\r
750                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
751                         ArrayList<Float> values = new ArrayList<Float>();\r
752                         while (valuesScanner.hasNextDouble()) {\r
753                                 Double value = valuesScanner.nextDouble();\r
754                                 values.add(value.floatValue());\r
755                         }\r
756                         annotations.add(new Score(method, values));\r
757                 }\r
758                 return annotations;\r
759         }\r
760 \r
761         /**\r
762          * Reads and parses Fasta or Clustal formatted file into a list of\r
763          * FastaSequence objects\r
764          * \r
765          * @param inFilePath\r
766          *            the path to the input file\r
767          * @throws IOException\r
768          *             if the file denoted by inFilePath cannot be read\r
769          * @throws UnknownFileFormatException\r
770          *             if the inFilePath points to the file which format cannot be\r
771          *             recognised\r
772          * @return the List of FastaSequence objects\r
773          * \r
774          */\r
775         public static List<FastaSequence> openInputStream(String inFilePath)\r
776                         throws IOException, UnknownFileFormatException {\r
777 \r
778                 // This stream gets closed in isValidClustalFile method\r
779                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
780                 // This stream is closed in the calling methods\r
781                 InputStream inStr = new FileInputStream(inFilePath);\r
782                 List<FastaSequence> fastaSeqs = null;\r
783                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
784                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
785                         // alignment cannot be null see\r
786                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
787                         fastaSeqs = al.getSequences();\r
788                 } else {\r
789                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
790                 }\r
791                 return fastaSeqs;\r
792         }\r
793 \r
794 }\r
795 \r
/**
 * Disembl result categories. As noted by the original author, each of these
 * contains both ranges and scores.
 */
enum DisemblResult {
        /** Contains ranges and scores */
        COILS,
        /** Contains ranges and scores */
        REM465,
        /** Contains ranges and scores */
        HOTLOOPS;
}
/**
 * GlobProt result categories. Two of them are ranges which carry no scores,
 * the remaining three are scores which carry no range.
 */
enum GlobProtResult {

        /** A range with no scores */
        GlobDoms,

        /** A range with no scores */
        Disorder,

        /** A score with no range */
        Dydx,

        /** A score with no range */
        SmoothedScore,

        /** A score with no range */
        RawScore;
}
812 \r
/**
 * IUPred result categories. The category of a result file is recognised by
 * the ending of its name, see {@link #getType(File)}.
 */
enum IUPredResult {
        /** Short disorder */
        Short,
        /** Long disorder */
        Long,
        /** Globular domains */
        Glob;

        /**
         * Determines the result type from the name of the file. The name (the
         * last element of the path) must end with the lower cased name of one of
         * the constants, that is with "long", "short" or "glob". The comparison is
         * case sensitive.
         * 
         * @param file
         *            the IUPred result file, must not be null (verified with an
         *            assertion)
         * @return the type which lower cased name the file name ends with
         * @throws AssertionError
         *             if the file name ends with none of the recognised suffixes
         */
        static IUPredResult getType(File file) {
                assert file != null;
                final String fileName = file.getName();

                // The suffixes are probed in this order: long, short, glob
                final IUPredResult[] probeOrder = { Long, Short, Glob };
                for (IUPredResult candidate : probeOrder) {
                        final String suffix = candidate.toString().toLowerCase();
                        if (fileName.endsWith(suffix)) {
                                return candidate;
                        }
                }

                throw new AssertionError(
                                "IUPred result file type cannot be recognised! "
                                                + "\nFile must ends with one of [glob, long or short]"
                                                + "\n but given file name was: " + file.getName());
        }
}