d0a6cd7039f50d43707ce9054070bc06e9f8bce3
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.BufferedWriter;\r
19 import java.io.Closeable;\r
20 import java.io.File;\r
21 import java.io.FileInputStream;\r
22 import java.io.IOException;\r
23 import java.io.InputStream;\r
24 import java.io.InputStreamReader;\r
25 import java.io.OutputStream;\r
26 import java.io.OutputStreamWriter;\r
27 import java.util.ArrayList;\r
28 import java.util.HashMap;\r
29 import java.util.HashSet;\r
30 import java.util.List;\r
31 import java.util.Map;\r
32 import java.util.Scanner;\r
33 import java.util.Set;\r
34 import java.util.TreeSet;\r
35 import java.util.logging.Level;\r
36 import java.util.regex.Matcher;\r
37 import java.util.regex.Pattern;\r
38 \r
39 import compbio.util.Util;\r
40 \r
41 /**\r
42  * Utility class for operations on sequences\r
43  * \r
44  * @author Petr Troshin\r
45  * @version 1.0\r
46  */\r
47 public final class SequenceUtil {\r
48 \r
49         /**\r
50          * A whitespace character: [\t\n\x0B\f\r]\r
51          */\r
52         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
53 \r
54         /**\r
55          * A digit\r
56          */\r
57         public static final Pattern DIGIT = Pattern.compile("\\d");\r
58 \r
59         /**\r
60          * Non word\r
61          */\r
62         public static final Pattern NONWORD = Pattern.compile("\\W");\r
63 \r
64         /**\r
65          * Valid Amino acids\r
66          */\r
67         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
68                         Pattern.CASE_INSENSITIVE);\r
69 \r
70         /**\r
71          * inversion of AA pattern\r
72          */\r
73         public static final Pattern NON_AA = Pattern.compile(\r
74                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
75 \r
76         /**\r
77          * Same as AA pattern but with two additional letters - XU\r
78          */\r
79         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
80                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
81 \r
82         /**\r
83          * Nucleotides a, t, g, c, u\r
84          */\r
85         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
86                         Pattern.CASE_INSENSITIVE);\r
87 \r
88         /**\r
89          * Ambiguous nucleotide\r
90          */\r
91         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
92                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
93         /**\r
94          * Non nucleotide\r
95          */\r
96         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
97                         Pattern.CASE_INSENSITIVE);\r
98 \r
99         private SequenceUtil() {\r
100         } // utility class, no instantiation\r
101 \r
102         /*\r
103          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
104          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
105          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
106          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
107          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
108          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
109          * BufferedWriter fasta_out = new BufferedWriter( new\r
110          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
111          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
112          * SysPrefs.newlinechar); fasta_out.close(); }\r
113          */\r
114 \r
115         /**\r
116          * @return true is the sequence contains only letters a,c, t, g, u\r
117          */\r
118         public static boolean isNucleotideSequence(final FastaSequence s) {\r
119                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
120         }\r
121 \r
122         /**\r
123          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
124          * (!) - B char\r
125          */\r
126         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
127                 sequence = SequenceUtil.cleanSequence(sequence);\r
128                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
129                         return false;\r
130                 }\r
131                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
132                         return false;\r
133                         /*\r
134                          * System.out.format("I found the text starting at " +\r
135                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
136                          * nonDNAmatcher.end());\r
137                          */\r
138                 }\r
139                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
140                 return DNAmatcher.find();\r
141         }\r
142 \r
143         /**\r
144          * Removes all whitespace chars in the sequence string\r
145          * \r
146          * @param sequence\r
147          * @return cleaned up sequence\r
148          */\r
149         public static String cleanSequence(String sequence) {\r
150                 assert sequence != null;\r
151                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
152                 sequence = m.replaceAll("").toUpperCase();\r
153                 return sequence;\r
154         }\r
155 \r
156         /**\r
157          * Removes all special characters and digits as well as whitespace chars\r
158          * from the sequence\r
159          * \r
160          * @param sequence\r
161          * @return cleaned up sequence\r
162          */\r
163         public static String deepCleanSequence(String sequence) {\r
164                 sequence = SequenceUtil.cleanSequence(sequence);\r
165                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
166                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
167                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
168                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
169                 return sequence;\r
170         }\r
171 \r
172         /**\r
173          * @param sequence\r
174          * @return true is the sequence is a protein sequence, false overwise\r
175          */\r
176         public static boolean isProteinSequence(String sequence) {\r
177                 sequence = SequenceUtil.cleanSequence(sequence);\r
178                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
179                         return false;\r
180                 }\r
181                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
182                         return false;\r
183                 }\r
184                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
185                         return false;\r
186                 }\r
187                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
188                 return protmatcher.find();\r
189         }\r
190 \r
191         /**\r
192          * Check whether the sequence confirms to amboguous protein sequence\r
193          * \r
194          * @param sequence\r
195          * @return return true only if the sequence if ambiguous protein sequence\r
196          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
197          *         protein or DNA\r
198          */\r
199         public static boolean isAmbiguosProtein(String sequence) {\r
200                 sequence = SequenceUtil.cleanSequence(sequence);\r
201                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
202                         return false;\r
203                 }\r
204                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
205                         return false;\r
206                 }\r
207                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
208                         return false;\r
209                 }\r
210                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
211                         return false;\r
212                 }\r
213                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
214                 return amb_prot.find();\r
215         }\r
216 \r
217         /**\r
218          * Writes list of FastaSequeces into the outstream formatting the sequence\r
219          * so that it contains width chars on each line\r
220          * \r
221          * @param outstream\r
222          * @param sequences\r
223          * @param width\r
224          *            - the maximum number of characters to write in one line\r
225          * @throws IOException\r
226          */\r
227         public static void writeFasta(final OutputStream outstream,\r
228                         final List<FastaSequence> sequences, final int width)\r
229                         throws IOException {\r
230                 writeFastaKeepTheStream(outstream, sequences, width);\r
231                 outstream.close();\r
232         }\r
233 \r
234         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
235                         final List<FastaSequence> sequences, final int width)\r
236                         throws IOException {\r
237                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
238                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
239                 for (final FastaSequence fs : sequences) {\r
240                         fastawriter.write(">" + fs.getId() + "\n");\r
241                         fastawriter.write(fs.getFormatedSequence(width));\r
242                         fastawriter.write("\n");\r
243                 }\r
244                 fastawriter.flush();\r
245                 writer.flush();\r
246         }\r
247 \r
248         /**\r
249          * Reads fasta sequences from inStream into the list of FastaSequence\r
250          * objects\r
251          * \r
252          * @param inStream\r
253          *            from\r
254          * @return list of FastaSequence objects\r
255          * @throws IOException\r
256          */\r
257         public static List<FastaSequence> readFasta(final InputStream inStream)\r
258                         throws IOException {\r
259                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
260 \r
261                 final BufferedReader infasta = new BufferedReader(\r
262                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
263                 final Pattern pattern = Pattern.compile("//s+");\r
264 \r
265                 String line;\r
266                 String sname = "", seqstr = null;\r
267                 do {\r
268                         line = infasta.readLine();\r
269                         if ((line == null) || line.startsWith(">")) {\r
270                                 if (seqstr != null) {\r
271                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
272                                 }\r
273                                 sname = line; // remove >\r
274                                 seqstr = "";\r
275                         } else {\r
276                                 final String subseq = pattern.matcher(line).replaceAll("");\r
277                                 seqstr += subseq;\r
278                         }\r
279                 } while (line != null);\r
280 \r
281                 infasta.close();\r
282                 return seqs;\r
283         }\r
284 \r
285         /**\r
286          * Writes FastaSequence in the file, each sequence will take one line only\r
287          * \r
288          * @param os\r
289          * @param sequences\r
290          * @throws IOException\r
291          */\r
292         public static void writeFasta(final OutputStream os,\r
293                         final List<FastaSequence> sequences) throws IOException {\r
294                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
295                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
296                 for (final FastaSequence fs : sequences) {\r
297                         fasta_out.write(fs.getOnelineFasta());\r
298                 }\r
299                 fasta_out.close();\r
300                 outWriter.close();\r
301         }\r
302 \r
303         public static Map<String, Score> readJRonn(final File result)\r
304                         throws IOException, UnknownFileFormatException {\r
305                 InputStream input = new FileInputStream(result);\r
306                 Map<String, Score> sequences = readJRonn(input);\r
307                 input.close();\r
308                 return sequences;\r
309         }\r
310 \r
311         /**\r
312          * Reader for JRonn horizontal file format\r
313          * \r
314          * <pre>\r
315          * &gtFoobar M G D T T A G 0.48 0.42\r
316          * 0.42 0.48 0.52 0.53 0.54\r
317          * \r
318          * <pre>\r
319          * Where all values are tab delimited\r
320          * \r
321          * @param inStream\r
322          *            the InputStream connected to the JRonn output file\r
323          * @return List of {@link AnnotatedSequence} objects\r
324          * @throws IOException\r
325          *             is thrown if the inStream has problems accessing the data\r
326          * @throws UnknownFileFormatException\r
327          *             is thrown if the inStream represents an unknown source of\r
328          * data, i.e. not a JRonn output\r
329          */\r
330         public static Map<String, Score> readJRonn(final InputStream inStream)\r
331                         throws IOException, UnknownFileFormatException {\r
332                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
333 \r
334                 final BufferedReader infasta = new BufferedReader(\r
335                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
336 \r
337                 String line;\r
338                 String sname = "";\r
339                 do {\r
340                         line = infasta.readLine();\r
341                         if (line == null || line.isEmpty()) {\r
342                                 // skip empty lines\r
343                                 continue;\r
344                         }\r
345                         if (line.startsWith(">")) {\r
346                                 // read name\r
347                                 sname = line.trim().substring(1);\r
348                                 // read sequence line\r
349                                 line = infasta.readLine();\r
350                                 final String sequence = line.replace("\t", "");\r
351                                 // read annotation line\r
352                                 line = infasta.readLine();\r
353                                 String[] annotValues = line.split("\t");\r
354                                 float[] annotation = convertToNumber(annotValues);\r
355                                 if (annotation.length != sequence.length()) {\r
356                                         throw new UnknownFileFormatException(\r
357                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
358                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
359                                 }\r
360                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
361                         }\r
362                 } while (line != null);\r
363 \r
364                 infasta.close();\r
365                 return seqs;\r
366         }\r
367 \r
368         private static float[] convertToNumber(String[] annotValues)\r
369                         throws UnknownFileFormatException {\r
370                 float[] annotation = new float[annotValues.length];\r
371                 try {\r
372                         for (int i = 0; i < annotation.length; i++) {\r
373                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
374                         }\r
375                 } catch (NumberFormatException e) {\r
376                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
377                                         e.getCause());\r
378                 }\r
379                 return annotation;\r
380         }\r
381 \r
382         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
383                         + ">sequence_name\n "\r
384                         + "M    V       S\n"\r
385                         + "0.43 0.22    0.65\n"\r
386                         + "Where first line is the sequence name,\n"\r
387                         + "second line is the tab delimited sequence,\n"\r
388                         + "third line contains tab delimited disorder prediction values.\n"\r
389                         + "No lines are allowed between these three. Additionally, the number of  "\r
390                         + "sequence residues must be equal to the number of the disorder values.";\r
391 \r
392         /**\r
393          * Closes the Closable and logs the exception if any\r
394          * \r
395          * @param log\r
396          * @param stream\r
397          */\r
398         public final static void closeSilently(java.util.logging.Logger log,\r
399                         Closeable stream) {\r
400                 if (stream != null) {\r
401                         try {\r
402                                 stream.close();\r
403                         } catch (IOException e) {\r
404                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
405                         }\r
406                 }\r
407         }\r
408 \r
409         /**\r
410          * \r
411          > Foobar_dundeefriends\r
412          * \r
413          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
414          * \r
415          * # REM465 355-368\r
416          * \r
417          * # HOTLOOPS 190-204\r
418          * \r
419          * # RESIDUE COILS REM465 HOTLOOPS\r
420          * \r
421          * M 0.86010 0.88512 0.37094\r
422          * \r
423          * T 0.79983 0.85864 0.44331\r
424          * \r
425          * >Next Sequence name\r
426          * \r
427          * \r
428          * @param input\r
429          * @return\r
430          * @throws IOException\r
431          * @throws UnknownFileFormatException\r
432          */\r
433         public static HashMap<String, Set<Score>> readDisembl(\r
434                         final InputStream input) throws IOException,\r
435                         UnknownFileFormatException {\r
436                 Scanner scan = new Scanner(input);\r
437                 scan.useDelimiter(">");\r
438                 if (!scan.hasNext()) {\r
439                         throw new UnknownFileFormatException(\r
440                                         "In Disembl score format each sequence score is expected "\r
441                                                         + "to start from the line: >Sequence name "\r
442                                                         + " No such line was found!");\r
443                 }\r
444 \r
445                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
446                 int seqCounter = 0;\r
447                 while (scan.hasNext()) {\r
448                         seqCounter++;\r
449                         String singleSeq = scan.next();\r
450                         Scanner scansingle = new Scanner(singleSeq);\r
451                         if (!scansingle.hasNextLine()) {\r
452                                 throw new RuntimeException(\r
453                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
454                         }\r
455 \r
456                         StringBuffer seqbuffer = new StringBuffer();\r
457                         ArrayList<Float> coils = new ArrayList<Float>();\r
458                         ArrayList<Float> rem = new ArrayList<Float>();\r
459                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
460 \r
461                         String sequenceName = scansingle.nextLine().trim();\r
462                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
463                                         scansingle.nextLine());\r
464                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
465                                         scansingle.nextLine());\r
466                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
467                                         scansingle.nextLine());\r
468 \r
469                         String title = scansingle.nextLine();\r
470                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
471 \r
472                         while (scansingle.hasNext()) {\r
473                                 seqbuffer.append(scansingle.next());\r
474                                 coils.add(scansingle.nextFloat());\r
475                                 rem.add(scansingle.nextFloat());\r
476                                 hotloops.add(scansingle.nextFloat());\r
477                         }\r
478                         /*\r
479                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
480                          * seqbuffer.toString());\r
481                          */\r
482                         HashSet<Score> scores = new HashSet<Score>();\r
483                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
484                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
485                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
486                         results.put(sequenceName, scores);\r
487 \r
488                         scansingle.close();\r
489                 }\r
490                 scan.close();\r
491                 input.close();\r
492                 return results;\r
493         }\r
494 \r
495         /**\r
496          * Parsing:\r
497          * \r
498          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
499          * 350-391, 429-485, 497-506, 539-547\r
500          * \r
501          * # REM465 355-368\r
502          * \r
503          * # HOTLOOPS 190-204\r
504          * \r
505          * @param lines\r
506          * @return\r
507          */\r
508         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
509                 TreeSet<Range> ranges = new TreeSet<Range>();\r
510 \r
511                 Scanner scan = new Scanner(lines);\r
512 \r
513                 assert scan.hasNext();\r
514                 String del = scan.next();\r
515                 assert "#".equals(del); // pass delimiter #\r
516                 String type = scan.next(); // pass enum name e.g. COILS\r
517                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
518                                 + resultType.toString();\r
519 \r
520                 // beginning of the ranges\r
521                 scan.useDelimiter(",");\r
522                 while (scan.hasNext()) {\r
523                         String range = scan.next();\r
524                         if (!Util.isEmpty(range)) {\r
525                                 ranges.add(new Range(range.split("-")));\r
526                         }\r
527                 }\r
528                 return ranges;\r
529         }\r
530 \r
531         /**\r
532          * \r
533          > Foobar_dundeefriends\r
534          * \r
535          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
536          * \r
537          * # REM465 355-368\r
538          * \r
539          * # HOTLOOPS 190-204\r
540          * \r
541          * # RESIDUE COILS REM465 HOTLOOPS\r
542          * \r
543          * M 0.86010 0.88512 0.37094\r
544          * \r
545          * T 0.79983 0.85864 0.44331\r
546          * \r
547          * >Next Sequence name\r
548          * \r
549          * \r
550          * @param input\r
551          * @return\r
552          * @throws IOException\r
553          * @throws UnknownFileFormatException\r
554          */\r
555         public static HashMap<String, Set<Score>> readGlobPlot(\r
556                         final InputStream input) throws IOException,\r
557                         UnknownFileFormatException {\r
558                 Scanner scan = new Scanner(input);\r
559                 scan.useDelimiter(">");\r
560                 if (!scan.hasNext()) {\r
561                         throw new UnknownFileFormatException(\r
562                                         "In GlobPlot score format each sequence score is expected "\r
563                                                         + "to start from the line: >Sequence name "\r
564                                                         + " No such line was found!");\r
565                 }\r
566 \r
567                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
568                 int seqCounter = 0;\r
569                 while (scan.hasNext()) {\r
570                         seqCounter++;\r
571                         String singleSeq = scan.next();\r
572                         Scanner scansingle = new Scanner(singleSeq);\r
573                         if (!scansingle.hasNextLine()) {\r
574                                 throw new RuntimeException(\r
575                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
576                         }\r
577 \r
578                         StringBuffer seqbuffer = new StringBuffer();\r
579                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
580                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
581                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
582 \r
583                         String sequenceName = scansingle.nextLine().trim();\r
584                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
585                                         scansingle.nextLine());\r
586                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
587                                         scansingle.nextLine());\r
588 \r
589                         String title = scansingle.nextLine();\r
590                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
591 \r
592                         while (scansingle.hasNext()) {\r
593                                 seqbuffer.append(scansingle.next());\r
594                                 dydxScore.add(scansingle.nextFloat());\r
595                                 rawScore.add(scansingle.nextFloat());\r
596                                 smoothedScore.add(scansingle.nextFloat());\r
597                         }\r
598                         /*\r
599                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
600                          * seqbuffer.toString());\r
601                          */\r
602                         HashSet<Score> scores = new HashSet<Score>();\r
603                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
604                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
605                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
606                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
607                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
608                         results.put(sequenceName, scores);\r
609 \r
610                         scansingle.close();\r
611                 }\r
612                 scan.close();\r
613                 input.close();\r
614                 return results;\r
615         }\r
616         /**\r
617          * Read AACon result with no alignment files. This method leaves incoming\r
618          * InputStream open!\r
619          * \r
620          * @param results\r
621          *            output file of AAConservation\r
622          * @return Map with keys {@link ConservationMethod} -> float[]\r
623          */\r
624         public static HashSet<Score> readAAConResults(InputStream results) {\r
625                 if (results == null) {\r
626                         throw new NullPointerException(\r
627                                         "InputStream with results must be provided");\r
628                 }\r
629                 HashSet<Score> annotations = new HashSet<Score>();\r
630                 Scanner sc = new Scanner(results);\r
631                 sc.useDelimiter("#");\r
632                 while (sc.hasNext()) {\r
633                         String line = sc.next();\r
634                         int spacePos = line.indexOf(" ");\r
635                         assert spacePos > 0 : "Space is expected as delimited between method "\r
636                                         + "name and values!";\r
637                         String methodLine = line.substring(0, spacePos);\r
638                         ConservationMethod method = ConservationMethod\r
639                                         .getMethod(methodLine);\r
640                         assert method != null : "Method " + methodLine\r
641                                         + " is not recognized! ";\r
642                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
643                         ArrayList<Float> values = new ArrayList<Float>();\r
644                         while (valuesScanner.hasNextDouble()) {\r
645                                 Double value = valuesScanner.nextDouble();\r
646                                 values.add(value.floatValue());\r
647                         }\r
648                         annotations.add(new Score(method, values));\r
649                 }\r
650                 return annotations;\r
651         }\r
652 \r
653         /**\r
654          * Reads and parses Fasta or Clustal formatted file into a list of\r
655          * FastaSequence objects\r
656          * \r
657          * @param inFilePath\r
658          *            the path to the input file\r
659          * @throws IOException\r
660          *             if the file denoted by inFilePath cannot be read\r
661          * @throws UnknownFileFormatException\r
662          *             if the inFilePath points to the file which format cannot be\r
663          *             recognised\r
664          * @return the List of FastaSequence objects\r
665          * \r
666          */\r
667         public static List<FastaSequence> openInputStream(String inFilePath)\r
668                         throws IOException, UnknownFileFormatException {\r
669 \r
670                 // This stream gets closed in isValidClustalFile method\r
671                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
672                 // This stream is closed in the calling methods\r
673                 InputStream inStr = new FileInputStream(inFilePath);\r
674                 List<FastaSequence> fastaSeqs = null;\r
675                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
676                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
677                         // alignment cannot be null see\r
678                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
679                         fastaSeqs = al.getSequences();\r
680                 } else {\r
681                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
682                 }\r
683                 return fastaSeqs;\r
684         }\r
685 \r
686 }\r
687 \r
/** Result types of Disembl; each of them carries ranges and scores. */
enum DisemblResult {
        COILS,
        REM465,
        HOTLOOPS
}
/** Result types of GlobPlot. */
enum GlobProtResult {
        /** Ranges only, no scores */
        GlobDoms,

        /** Ranges only, no scores */
        Disorder,

        /** Scores only, no ranges */
        Dydx,

        /** Scores only, no ranges */
        SmoothedScore,

        /** Scores only, no ranges */
        RawScore
}