d65494e7fcca85ecdb655d4b5210974d2573d310
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.BufferedWriter;\r
19 import java.io.Closeable;\r
20 import java.io.File;\r
21 import java.io.FileInputStream;\r
22 import java.io.IOException;\r
23 import java.io.InputStream;\r
24 import java.io.InputStreamReader;\r
25 import java.io.OutputStream;\r
26 import java.io.OutputStreamWriter;\r
27 import java.util.ArrayList;\r
28 import java.util.HashMap;\r
29 import java.util.HashSet;\r
30 import java.util.List;\r
31 import java.util.Map;\r
32 import java.util.Scanner;\r
33 import java.util.Set;\r
34 import java.util.TreeSet;\r
35 import java.util.logging.Level;\r
36 import java.util.regex.Matcher;\r
37 import java.util.regex.Pattern;\r
38 \r
39 import compbio.util.Util;\r
40 \r
41 /**\r
42  * Utility class for operations on sequences\r
43  * \r
44  * @author Peter Troshin\r
45  * @since 1.0\r
46  * @version 2.0 June 2011\r
47  */\r
48 public final class SequenceUtil {\r
49 \r
50         /**\r
51          * A whitespace character: [\t\n\x0B\f\r]\r
52          */\r
53         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
54 \r
55         /**\r
56          * A digit\r
57          */\r
58         public static final Pattern DIGIT = Pattern.compile("\\d");\r
59 \r
60         /**\r
61          * Non word\r
62          */\r
63         public static final Pattern NONWORD = Pattern.compile("\\W");\r
64 \r
65         /**\r
66          * Valid Amino acids\r
67          */\r
68         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
69                         Pattern.CASE_INSENSITIVE);\r
70 \r
71         /**\r
72          * inversion of AA pattern\r
73          */\r
74         public static final Pattern NON_AA = Pattern.compile(\r
75                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
76 \r
77         /**\r
78          * Same as AA pattern but with two additional letters - XU\r
79          */\r
80         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
81                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
82 \r
83         /**\r
84          * Nucleotides a, t, g, c, u\r
85          */\r
86         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
87                         Pattern.CASE_INSENSITIVE);\r
88 \r
89         /**\r
90          * Ambiguous nucleotide\r
91          */\r
92         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
93                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
94         /**\r
95          * Non nucleotide\r
96          */\r
97         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
98                         Pattern.CASE_INSENSITIVE);\r
99 \r
100         private SequenceUtil() {\r
101         } // utility class, no instantiation\r
102 \r
103         /**\r
104          * @return true is the sequence contains only letters a,c, t, g, u\r
105          */\r
106         public static boolean isNucleotideSequence(final FastaSequence s) {\r
107                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
108         }\r
109 \r
110         /**\r
111          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
112          * (!) - B char\r
113          */\r
114         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
115                 sequence = SequenceUtil.cleanSequence(sequence);\r
116                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
117                         return false;\r
118                 }\r
119                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
120                         return false;\r
121                         /*\r
122                          * System.out.format("I found the text starting at " +\r
123                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
124                          * nonDNAmatcher.end());\r
125                          */\r
126                 }\r
127                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
128                 return DNAmatcher.find();\r
129         }\r
130 \r
131         /**\r
132          * Removes all whitespace chars in the sequence string\r
133          * \r
134          * @param sequence\r
135          * @return cleaned up sequence\r
136          */\r
137         public static String cleanSequence(String sequence) {\r
138                 assert sequence != null;\r
139                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
140                 sequence = m.replaceAll("").toUpperCase();\r
141                 return sequence;\r
142         }\r
143 \r
144         /**\r
145          * Removes all special characters and digits as well as whitespace chars\r
146          * from the sequence\r
147          * \r
148          * @param sequence\r
149          * @return cleaned up sequence\r
150          */\r
151         public static String deepCleanSequence(String sequence) {\r
152                 sequence = SequenceUtil.cleanSequence(sequence);\r
153                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
154                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
155                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
156                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
157                 return sequence;\r
158         }\r
159 \r
160         /**\r
161          * Remove all non AA chars from the sequence\r
162          * \r
163          * @param sequence\r
164          *            the sequence to clean\r
165          * @return cleaned sequence\r
166          */\r
167         public static String cleanProteinSequence(String sequence) {\r
168                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
169         }\r
170 \r
171         /**\r
172          * @param sequence\r
173          * @return true is the sequence is a protein sequence, false overwise\r
174          */\r
175         public static boolean isProteinSequence(String sequence) {\r
176                 sequence = SequenceUtil.cleanSequence(sequence);\r
177                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
178                         return false;\r
179                 }\r
180                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
181                         return false;\r
182                 }\r
183                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
184                         return false;\r
185                 }\r
186                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
187                 return protmatcher.find();\r
188         }\r
189 \r
190         /**\r
191          * Check whether the sequence confirms to amboguous protein sequence\r
192          * \r
193          * @param sequence\r
194          * @return return true only if the sequence if ambiguous protein sequence\r
195          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
196          *         protein or DNA\r
197          */\r
198         public static boolean isAmbiguosProtein(String sequence) {\r
199                 sequence = SequenceUtil.cleanSequence(sequence);\r
200                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
201                         return false;\r
202                 }\r
203                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
204                         return false;\r
205                 }\r
206                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
207                         return false;\r
208                 }\r
209                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
210                         return false;\r
211                 }\r
212                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
213                 return amb_prot.find();\r
214         }\r
215 \r
216         /**\r
217          * Writes list of FastaSequeces into the outstream formatting the sequence\r
218          * so that it contains width chars on each line\r
219          * \r
220          * @param outstream\r
221          * @param sequences\r
222          * @param width\r
223          *            - the maximum number of characters to write in one line\r
224          * @throws IOException\r
225          */\r
226         public static void writeFasta(final OutputStream outstream,\r
227                         final List<FastaSequence> sequences, final int width)\r
228                         throws IOException {\r
229                 writeFastaKeepTheStream(outstream, sequences, width);\r
230                 outstream.close();\r
231         }\r
232 \r
233         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
234                         final List<FastaSequence> sequences, final int width)\r
235                         throws IOException {\r
236                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
237                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
238                 for (final FastaSequence fs : sequences) {\r
239                         fastawriter.write(">" + fs.getId() + "\n");\r
240                         fastawriter.write(fs.getFormatedSequence(width));\r
241                         fastawriter.write("\n");\r
242                 }\r
243                 fastawriter.flush();\r
244                 writer.flush();\r
245         }\r
246 \r
247         /**\r
248          * Reads fasta sequences from inStream into the list of FastaSequence\r
249          * objects\r
250          * \r
251          * @param inStream\r
252          *            from\r
253          * @return list of FastaSequence objects\r
254          * @throws IOException\r
255          */\r
256         public static List<FastaSequence> readFasta(final InputStream inStream)\r
257                         throws IOException {\r
258                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
259                 FastaReader reader = new FastaReader(inStream);\r
260                 while (reader.hasNext()) {\r
261                         seqs.add(reader.next());\r
262                 }\r
263                 inStream.close();\r
264                 return seqs;\r
265         }\r
266 \r
267         /**\r
268          * Writes FastaSequence in the file, each sequence will take one line only\r
269          * \r
270          * @param os\r
271          * @param sequences\r
272          * @throws IOException\r
273          */\r
274         public static void writeFasta(final OutputStream os,\r
275                         final List<FastaSequence> sequences) throws IOException {\r
276                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
277                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
278                 for (final FastaSequence fs : sequences) {\r
279                         fasta_out.write(fs.getOnelineFasta());\r
280                 }\r
281                 fasta_out.close();\r
282                 outWriter.close();\r
283         }\r
284 \r
285         /**\r
286          * Read IUPred output\r
287          * \r
288          * @param result\r
289          * @return\r
290          * @throws IOException\r
291          * @throws UnknownFileFormatException\r
292          */\r
293         public static Map<String, Score> readIUPred(final File result)\r
294                         throws IOException, UnknownFileFormatException {\r
295                 InputStream input = new FileInputStream(result);\r
296                 Map<String, Score> sequences = readIUPred(input,\r
297                                 IUPredResult.getType(result));\r
298                 input.close();\r
299                 return sequences;\r
300         }\r
301 \r
302         // Check the type of the file e.g. long| short or domain\r
303         // and read\r
304         /**\r
305          * ## Long Disorder\r
306          * \r
307          * # P53_HUMAN\r
308          * \r
309          * 1 M 0.9943\r
310          * \r
311          * 2 E 0.9917\r
312          * \r
313          * 3 E 0.9879\r
314          * \r
315          * (every line)\r
316          * \r
317          * @throws IOException\r
318          * @throws UnknownFileFormatException\r
319          * \r
320          * \r
321          */\r
322         private static Map<String, Score> readIUPred(InputStream input,\r
323                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
324 \r
325                 Score score = null;\r
326                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
327                 Scanner scan = new Scanner(input);\r
328                 scan.useDelimiter("#");\r
329                 while (scan.hasNext()) {\r
330                         String nextEntry = scan.next();\r
331                         Scanner entry = new Scanner(nextEntry);\r
332                         String name = entry.nextLine().trim();\r
333                         // inside entry:\r
334                         if (IUPredResult.Glob == type) {\r
335                                 // parse domains\r
336                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
337                                 score = new Score(type, ranges);\r
338                         } else {\r
339                                 // parse short | long\r
340                                 float[] scores = parseIUPredScores(entry);\r
341                                 score = new Score(type, scores);\r
342                         }\r
343                         entry.close();\r
344                         seqs.put(name, score);\r
345                 }\r
346 \r
347                 scan.close();\r
348                 return seqs;\r
349         }\r
350 \r
351         /**\r
352          * # P53_HUMA\r
353          * \r
354          * Number of globular domains: 2\r
355          * \r
356          * globular domain 1. 98 - 269\r
357          * \r
358          * globular domain 2. 431 - 482\r
359          * \r
360          * >P53_HUMA\r
361          * \r
362          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
363          * \r
364          * @param scan\r
365          */\r
366         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
367                 String header = "Number of globular domains:";\r
368                 String domainPref = "globular domain";\r
369                 TreeSet<Range> ranges = new TreeSet<Range>();\r
370                 String line = scan.nextLine().trim();\r
371                 assert line.startsWith(header);\r
372                 line = line.substring(header.length()).trim();\r
373                 int domainNum = Integer.parseInt(line);\r
374                 if (domainNum == 0) {\r
375                         return ranges;\r
376                 }\r
377 \r
378                 for (int i = 0; i < domainNum; i++) {\r
379                         assert scan.hasNextLine();\r
380                         line = scan.nextLine();\r
381                         assert line.trim().startsWith(domainPref);\r
382                         line = line.substring(line.indexOf(".") + 1).trim();\r
383                         Range r = new Range(line.split("-"));\r
384                         ranges.add(r);\r
385                 }\r
386 \r
387                 return ranges;\r
388         }\r
389         /*\r
390          * 1 M 0.9943\r
391          * \r
392          * 2 E 0.9917\r
393          */\r
394         private static float[] parseIUPredScores(Scanner scan)\r
395                         throws UnknownFileFormatException {\r
396                 List<String> annotation = new ArrayList<String>();\r
397                 while (scan.hasNextLine()) {\r
398                         String line = scan.nextLine().trim();\r
399                         String[] val = line.split("\\s+");\r
400                         annotation.add(val[2]);\r
401                 }\r
402                 return convertToNumber(annotation\r
403                                 .toArray(new String[annotation.size()]));\r
404         }\r
405 \r
406         public static Map<String, Score> readJRonn(final File result)\r
407                         throws IOException, UnknownFileFormatException {\r
408                 InputStream input = new FileInputStream(result);\r
409                 Map<String, Score> sequences = readJRonn(input);\r
410                 input.close();\r
411                 return sequences;\r
412         }\r
413 \r
414         /**\r
415          * Reader for JRonn horizontal file format\r
416          * \r
417          * <pre>\r
418          * &gtFoobar M G D T T A G 0.48 0.42\r
419          * 0.42 0.48 0.52 0.53 0.54\r
420          * \r
421          * <pre>\r
422          * Where all values are tab delimited\r
423          * \r
424          * @param inStream\r
425          *            the InputStream connected to the JRonn output file\r
426          * @return List of {@link AnnotatedSequence} objects\r
427          * @throws IOException\r
428          *             is thrown if the inStream has problems accessing the data\r
429          * @throws UnknownFileFormatException\r
430          *             is thrown if the inStream represents an unknown source of\r
431          * data, i.e. not a JRonn output\r
432          */\r
433         public static Map<String, Score> readJRonn(final InputStream inStream)\r
434                         throws IOException, UnknownFileFormatException {\r
435                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
436 \r
437                 final BufferedReader infasta = new BufferedReader(\r
438                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
439 \r
440                 String line;\r
441                 String sname = "";\r
442                 do {\r
443                         line = infasta.readLine();\r
444                         if (line == null || line.isEmpty()) {\r
445                                 // skip empty lines\r
446                                 continue;\r
447                         }\r
448                         if (line.startsWith(">")) {\r
449                                 // read name\r
450                                 sname = line.trim().substring(1);\r
451                                 // read sequence line\r
452                                 line = infasta.readLine();\r
453                                 final String sequence = line.replace("\t", "");\r
454                                 // read annotation line\r
455                                 line = infasta.readLine();\r
456                                 String[] annotValues = line.split("\t");\r
457                                 float[] annotation = convertToNumber(annotValues);\r
458                                 if (annotation.length != sequence.length()) {\r
459                                         throw new UnknownFileFormatException(\r
460                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
461                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
462                                 }\r
463                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
464                         }\r
465                 } while (line != null);\r
466 \r
467                 infasta.close();\r
468                 return seqs;\r
469         }\r
470 \r
471         private static float[] convertToNumber(String[] annotValues)\r
472                         throws UnknownFileFormatException {\r
473                 float[] annotation = new float[annotValues.length];\r
474                 try {\r
475                         for (int i = 0; i < annotation.length; i++) {\r
476                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
477                         }\r
478                 } catch (NumberFormatException e) {\r
479                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
480                                         e.getCause());\r
481                 }\r
482                 return annotation;\r
483         }\r
484 \r
485         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
486                         + ">sequence_name\n "\r
487                         + "M    V       S\n"\r
488                         + "0.43 0.22    0.65\n"\r
489                         + "Where first line is the sequence name,\n"\r
490                         + "second line is the tab delimited sequence,\n"\r
491                         + "third line contains tab delimited disorder prediction values.\n"\r
492                         + "No lines are allowed between these three. Additionally, the number of  "\r
493                         + "sequence residues must be equal to the number of the disorder values.";\r
494 \r
495         /**\r
496          * Closes the Closable and logs the exception if any\r
497          * \r
498          * @param log\r
499          * @param stream\r
500          */\r
501         public final static void closeSilently(java.util.logging.Logger log,\r
502                         Closeable stream) {\r
503                 if (stream != null) {\r
504                         try {\r
505                                 stream.close();\r
506                         } catch (IOException e) {\r
507                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
508                         }\r
509                 }\r
510         }\r
511 \r
512         /**\r
513          * \r
514          > Foobar_dundeefriends\r
515          * \r
516          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
517          * \r
518          * # REM465 355-368\r
519          * \r
520          * # HOTLOOPS 190-204\r
521          * \r
522          * # RESIDUE COILS REM465 HOTLOOPS\r
523          * \r
524          * M 0.86010 0.88512 0.37094\r
525          * \r
526          * T 0.79983 0.85864 0.44331\r
527          * \r
528          * >Next Sequence name\r
529          * \r
530          * \r
531          * @param input\r
532          * @return\r
533          * @throws IOException\r
534          * @throws UnknownFileFormatException\r
535          */\r
536         public static HashMap<String, Set<Score>> readDisembl(\r
537                         final InputStream input) throws IOException,\r
538                         UnknownFileFormatException {\r
539                 Scanner scan = new Scanner(input);\r
540                 scan.useDelimiter(">");\r
541                 if (!scan.hasNext()) {\r
542                         throw new UnknownFileFormatException(\r
543                                         "In Disembl score format each sequence score is expected "\r
544                                                         + "to start from the line: >Sequence name "\r
545                                                         + " No such line was found!");\r
546                 }\r
547 \r
548                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
549                 int seqCounter = 0;\r
550                 while (scan.hasNext()) {\r
551                         seqCounter++;\r
552                         String singleSeq = scan.next();\r
553                         Scanner scansingle = new Scanner(singleSeq);\r
554                         if (!scansingle.hasNextLine()) {\r
555                                 throw new RuntimeException(\r
556                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
557                         }\r
558 \r
559                         StringBuffer seqbuffer = new StringBuffer();\r
560                         ArrayList<Float> coils = new ArrayList<Float>();\r
561                         ArrayList<Float> rem = new ArrayList<Float>();\r
562                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
563 \r
564                         String sequenceName = scansingle.nextLine().trim();\r
565                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
566                                         scansingle.nextLine());\r
567                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
568                                         scansingle.nextLine());\r
569                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
570                                         scansingle.nextLine());\r
571 \r
572                         String title = scansingle.nextLine();\r
573                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
574 \r
575                         while (scansingle.hasNext()) {\r
576                                 seqbuffer.append(scansingle.next());\r
577                                 coils.add(scansingle.nextFloat());\r
578                                 rem.add(scansingle.nextFloat());\r
579                                 hotloops.add(scansingle.nextFloat());\r
580                         }\r
581                         /*\r
582                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
583                          * seqbuffer.toString());\r
584                          */\r
585                         HashSet<Score> scores = new HashSet<Score>();\r
586                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
587                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
588                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
589                         results.put(sequenceName, scores);\r
590 \r
591                         scansingle.close();\r
592                 }\r
593                 scan.close();\r
594                 input.close();\r
595                 return results;\r
596         }\r
597 \r
598         /**\r
599          * Parsing:\r
600          * \r
601          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
602          * 350-391, 429-485, 497-506, 539-547\r
603          * \r
604          * # REM465 355-368\r
605          * \r
606          * # HOTLOOPS 190-204\r
607          * \r
608          * @param lines\r
609          * @return\r
610          */\r
611         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
612                 TreeSet<Range> ranges = new TreeSet<Range>();\r
613 \r
614                 Scanner scan = new Scanner(lines);\r
615 \r
616                 assert scan.hasNext();\r
617                 String del = scan.next();\r
618                 assert "#".equals(del); // pass delimiter #\r
619                 String type = scan.next(); // pass enum name e.g. COILS\r
620                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
621                                 + resultType.toString();\r
622 \r
623                 // beginning of the ranges\r
624                 scan.useDelimiter(",");\r
625                 while (scan.hasNext()) {\r
626                         String range = scan.next();\r
627                         if (!Util.isEmpty(range)) {\r
628                                 ranges.add(new Range(range.split("-")));\r
629                         }\r
630                 }\r
631                 return ranges;\r
632         }\r
633 \r
634         /**\r
635          * \r
636          > Foobar_dundeefriends\r
637          * \r
638          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
639          * \r
640          * # REM465 355-368\r
641          * \r
642          * # HOTLOOPS 190-204\r
643          * \r
644          * # RESIDUE COILS REM465 HOTLOOPS\r
645          * \r
646          * M 0.86010 0.88512 0.37094\r
647          * \r
648          * T 0.79983 0.85864 0.44331\r
649          * \r
650          * >Next Sequence name\r
651          * \r
652          * \r
653          * @param input\r
654          * @return\r
655          * @throws IOException\r
656          * @throws UnknownFileFormatException\r
657          */\r
658         public static HashMap<String, Set<Score>> readGlobPlot(\r
659                         final InputStream input) throws IOException,\r
660                         UnknownFileFormatException {\r
661                 Scanner scan = new Scanner(input);\r
662                 scan.useDelimiter(">");\r
663                 if (!scan.hasNext()) {\r
664                         throw new UnknownFileFormatException(\r
665                                         "In GlobPlot score format each sequence score is expected "\r
666                                                         + "to start from the line: >Sequence name "\r
667                                                         + " No such line was found!");\r
668                 }\r
669 \r
670                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
671                 int seqCounter = 0;\r
672                 while (scan.hasNext()) {\r
673                         seqCounter++;\r
674                         String singleSeq = scan.next();\r
675                         Scanner scansingle = new Scanner(singleSeq);\r
676                         if (!scansingle.hasNextLine()) {\r
677                                 throw new RuntimeException(\r
678                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
679                         }\r
680 \r
681                         StringBuffer seqbuffer = new StringBuffer();\r
682                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
683                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
684                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
685 \r
686                         String sequenceName = scansingle.nextLine().trim();\r
687                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
688                                         scansingle.nextLine());\r
689                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
690                                         scansingle.nextLine());\r
691 \r
692                         String title = scansingle.nextLine();\r
693                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
694 \r
695                         while (scansingle.hasNext()) {\r
696                                 seqbuffer.append(scansingle.next());\r
697                                 dydxScore.add(scansingle.nextFloat());\r
698                                 rawScore.add(scansingle.nextFloat());\r
699                                 smoothedScore.add(scansingle.nextFloat());\r
700                         }\r
701                         /*\r
702                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
703                          * seqbuffer.toString());\r
704                          */\r
705                         Set<Score> scores = new TreeSet<Score>();\r
706                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
707                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
708                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
709                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
710                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
711                         results.put(sequenceName, scores);\r
712 \r
713                         scansingle.close();\r
714                 }\r
715                 scan.close();\r
716                 input.close();\r
717                 return results;\r
718         }\r
719         /**\r
720          * Read AACon result with no alignment files. This method leaves incoming\r
721          * InputStream open!\r
722          * \r
723          * @param results\r
724          *            output file of AAConservation\r
725          * @return Map with keys {@link ConservationMethod} -> float[]\r
726          */\r
727         public static HashSet<Score> readAAConResults(InputStream results) {\r
728                 if (results == null) {\r
729                         throw new NullPointerException(\r
730                                         "InputStream with results must be provided");\r
731                 }\r
732                 HashSet<Score> annotations = new HashSet<Score>();\r
733                 Scanner sc = new Scanner(results);\r
734                 sc.useDelimiter("#");\r
735                 while (sc.hasNext()) {\r
736                         String line = sc.next();\r
737                         int spacePos = line.indexOf(" ");\r
738                         assert spacePos > 0 : "Space is expected as delimited between method "\r
739                                         + "name and values!";\r
740                         String methodLine = line.substring(0, spacePos);\r
741                         ConservationMethod method = ConservationMethod\r
742                                         .getMethod(methodLine);\r
743                         assert method != null : "Method " + methodLine\r
744                                         + " is not recognized! ";\r
745                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
746                         ArrayList<Float> values = new ArrayList<Float>();\r
747                         while (valuesScanner.hasNextDouble()) {\r
748                                 Double value = valuesScanner.nextDouble();\r
749                                 values.add(value.floatValue());\r
750                         }\r
751                         annotations.add(new Score(method, values));\r
752                 }\r
753                 return annotations;\r
754         }\r
755 \r
756         /**\r
757          * Reads and parses Fasta or Clustal formatted file into a list of\r
758          * FastaSequence objects\r
759          * \r
760          * @param inFilePath\r
761          *            the path to the input file\r
762          * @throws IOException\r
763          *             if the file denoted by inFilePath cannot be read\r
764          * @throws UnknownFileFormatException\r
765          *             if the inFilePath points to the file which format cannot be\r
766          *             recognised\r
767          * @return the List of FastaSequence objects\r
768          * \r
769          */\r
770         public static List<FastaSequence> openInputStream(String inFilePath)\r
771                         throws IOException, UnknownFileFormatException {\r
772 \r
773                 // This stream gets closed in isValidClustalFile method\r
774                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
775                 // This stream is closed in the calling methods\r
776                 InputStream inStr = new FileInputStream(inFilePath);\r
777                 List<FastaSequence> fastaSeqs = null;\r
778                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
779                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
780                         // alignment cannot be null see\r
781                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
782                         fastaSeqs = al.getSequences();\r
783                 } else {\r
784                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
785                 }\r
786                 return fastaSeqs;\r
787         }\r
788 \r
789 }\r
790 \r
/**
 * Types of the Disembl results. All of them contain ranges and scores.
 */
enum DisemblResult {
        COILS,
        REM465,
        HOTLOOPS
}
/**
 * Types of the GlobPlot results. A type is either a range without scores
 * or a score without a range.
 */
enum GlobProtResult {
        /** A range, carries no scores */
        GlobDoms,

        /** A range, carries no scores */
        Disorder,

        /** A score, carries no range */
        Dydx,

        /** A score, carries no range */
        SmoothedScore,

        /** A score, carries no range */
        RawScore
}
807 \r
/**
 * Types of the IUPred results. The type of a result file is recognised by
 * the ending of its name, see {@link #getType(File)}.
 */
enum IUPredResult {
        /** Short disorder */
        Short,

        /** Long disorder */
        Long,

        /** Globular domains */
        Glob;

        /**
         * Determines the result type from the name of the file: the name must
         * end with the lower case name of one of the constants of this enum.
         * 
         * @param file
         *            the IUPred result file, must not be null
         * @return the type matching the ending of the file name
         * @throws AssertionError
         *             if the file name ends with none of the type names
         */
        static IUPredResult getType(File file) {
                assert file != null;
                final String fileName = file.getName();
                // The endings are mutually exclusive, thus the order of the
                // comparisons does not influence the outcome
                for (IUPredResult type : values()) {
                        String ending = type.toString().toLowerCase();
                        if (fileName.endsWith(ending)) {
                                return type;
                        }
                }
                throw new AssertionError(
                                "IUPred result file type cannot be recognised! "
                                                + "\nFile must ends with one of [glob, long or short]"
                                                + "\n but given file name was: " + fileName);
        }
}