7e6c672efa86b5270109223f3cea8ada642f7b8a
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import compbio.util.Util;
41 \r
42 /**\r
43  * Utility class for operations on sequences\r
44  * \r
45  * @author Petr Troshin\r
46  * @version 1.0\r
47  */\r
48 public final class SequenceUtil {\r
49 \r
50         /**\r
51          * A whitespace character: [\t\n\x0B\f\r]\r
52          */\r
53         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
54 \r
55         /**\r
56          * A digit\r
57          */\r
58         public static final Pattern DIGIT = Pattern.compile("\\d");\r
59 \r
60         /**\r
61          * Non word\r
62          */\r
63         public static final Pattern NONWORD = Pattern.compile("\\W");\r
64 \r
65         /**\r
66          * Valid Amino acids\r
67          */\r
68         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
69                         Pattern.CASE_INSENSITIVE);\r
70 \r
71         /**\r
72          * inversion of AA pattern\r
73          */\r
74         public static final Pattern NON_AA = Pattern.compile(\r
75                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
76 \r
77         /**\r
78          * Same as AA pattern but with two additional letters - XU\r
79          */\r
80         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
81                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
82 \r
83         /**\r
84          * Nucleotides a, t, g, c, u\r
85          */\r
86         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
87                         Pattern.CASE_INSENSITIVE);\r
88 \r
89         /**\r
90          * Ambiguous nucleotide\r
91          */\r
92         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
93                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
94         /**\r
95          * Non nucleotide\r
96          */\r
97         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
98                         Pattern.CASE_INSENSITIVE);\r
99 \r
100         private SequenceUtil() {\r
101         } // utility class, no instantiation\r
102 \r
103         /*\r
104          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
105          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
106          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
107          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
108          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
109          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
110          * BufferedWriter fasta_out = new BufferedWriter( new\r
111          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
112          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
113          * SysPrefs.newlinechar); fasta_out.close(); }\r
114          */\r
115 \r
116         /**\r
117          * @return true is the sequence contains only letters a,c, t, g, u\r
118          */\r
119         public static boolean isNucleotideSequence(final FastaSequence s) {\r
120                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
121         }\r
122 \r
123         /**\r
124          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
125          * (!) - B char\r
126          */\r
127         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
128                 sequence = SequenceUtil.cleanSequence(sequence);\r
129                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
130                         return false;\r
131                 }\r
132                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
133                         return false;\r
134                         /*\r
135                          * System.out.format("I found the text starting at " +\r
136                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
137                          * nonDNAmatcher.end());\r
138                          */\r
139                 }\r
140                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
141                 return DNAmatcher.find();\r
142         }\r
143 \r
144         /**\r
145          * Removes all whitespace chars in the sequence string\r
146          * \r
147          * @param sequence\r
148          * @return cleaned up sequence\r
149          */\r
150         public static String cleanSequence(String sequence) {\r
151                 assert sequence != null;\r
152                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
153                 sequence = m.replaceAll("").toUpperCase();\r
154                 return sequence;\r
155         }\r
156 \r
157         /**\r
158          * Removes all special characters and digits as well as whitespace chars\r
159          * from the sequence\r
160          * \r
161          * @param sequence\r
162          * @return cleaned up sequence\r
163          */\r
164         public static String deepCleanSequence(String sequence) {\r
165                 sequence = SequenceUtil.cleanSequence(sequence);\r
166                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
167                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
168                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
169                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
170                 return sequence;\r
171         }\r
172 \r
173         /**\r
174          * @param sequence\r
175          * @return true is the sequence is a protein sequence, false overwise\r
176          */\r
177         public static boolean isProteinSequence(String sequence) {\r
178                 sequence = SequenceUtil.cleanSequence(sequence);\r
179                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
180                         return false;\r
181                 }\r
182                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
183                         return false;\r
184                 }\r
185                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
186                         return false;\r
187                 }\r
188                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
189                 return protmatcher.find();\r
190         }\r
191 \r
192         /**\r
193          * Check whether the sequence confirms to amboguous protein sequence\r
194          * \r
195          * @param sequence\r
196          * @return return true only if the sequence if ambiguous protein sequence\r
197          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
198          *         protein or DNA\r
199          */\r
200         public static boolean isAmbiguosProtein(String sequence) {\r
201                 sequence = SequenceUtil.cleanSequence(sequence);\r
202                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
203                         return false;\r
204                 }\r
205                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
206                         return false;\r
207                 }\r
208                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
209                         return false;\r
210                 }\r
211                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
212                         return false;\r
213                 }\r
214                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
215                 return amb_prot.find();\r
216         }\r
217 \r
218         /**\r
219          * Writes list of FastaSequeces into the outstream formatting the sequence\r
220          * so that it contains width chars on each line\r
221          * \r
222          * @param outstream\r
223          * @param sequences\r
224          * @param width\r
225          *            - the maximum number of characters to write in one line\r
226          * @throws IOException\r
227          */\r
228         public static void writeFasta(final OutputStream outstream,\r
229                         final List<FastaSequence> sequences, final int width)\r
230                         throws IOException {\r
231                 writeFastaKeepTheStream(outstream, sequences, width);\r
232                 outstream.close();\r
233         }\r
234 \r
235         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
236                         final List<FastaSequence> sequences, final int width)\r
237                         throws IOException {\r
238                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
239                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
240                 for (final FastaSequence fs : sequences) {\r
241                         fastawriter.write(">" + fs.getId() + "\n");\r
242                         fastawriter.write(fs.getFormatedSequence(width));\r
243                         fastawriter.write("\n");\r
244                 }\r
245                 fastawriter.flush();\r
246                 writer.flush();\r
247         }\r
248 \r
249         /**\r
250          * Reads fasta sequences from inStream into the list of FastaSequence\r
251          * objects\r
252          * \r
253          * @param inStream\r
254          *            from\r
255          * @return list of FastaSequence objects\r
256          * @throws IOException\r
257          */\r
258         public static List<FastaSequence> readFasta(final InputStream inStream)\r
259                         throws IOException {\r
260                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
261 \r
262                 final BufferedReader infasta = new BufferedReader(\r
263                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
264                 final Pattern pattern = Pattern.compile("//s+");\r
265 \r
266                 String line;\r
267                 String sname = "", seqstr = null;\r
268                 do {\r
269                         line = infasta.readLine();\r
270                         if ((line == null) || line.startsWith(">")) {\r
271                                 if (seqstr != null) {\r
272                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
273                                 }\r
274                                 sname = line; // remove >\r
275                                 seqstr = "";\r
276                         } else {\r
277                                 final String subseq = pattern.matcher(line).replaceAll("");\r
278                                 seqstr += subseq;\r
279                         }\r
280                 } while (line != null);\r
281 \r
282                 infasta.close();\r
283                 return seqs;\r
284         }\r
285 \r
286         /**\r
287          * Writes FastaSequence in the file, each sequence will take one line only\r
288          * \r
289          * @param os\r
290          * @param sequences\r
291          * @throws IOException\r
292          */\r
293         public static void writeFasta(final OutputStream os,\r
294                         final List<FastaSequence> sequences) throws IOException {\r
295                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
296                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
297                 for (final FastaSequence fs : sequences) {\r
298                         fasta_out.write(fs.getOnelineFasta());\r
299                 }\r
300                 fasta_out.close();\r
301                 outWriter.close();\r
302         }\r
303 \r
304         public static Map<String, Score> readJRonn(final File result)\r
305                         throws IOException, UnknownFileFormatException {\r
306                 InputStream input = new FileInputStream(result);\r
307                 Map<String, Score> sequences = readJRonn(input);\r
308                 input.close();\r
309                 return sequences;\r
310         }\r
311 \r
312         /**\r
313          * Reader for JRonn horizontal file format\r
314          * \r
315          * <pre>\r
316          * &gtFoobar M G D T T A G 0.48 0.42\r
317          * 0.42 0.48 0.52 0.53 0.54\r
318          * \r
319          * <pre>\r
320          * Where all values are tab delimited\r
321          * \r
322          * @param inStream\r
323          *            the InputStream connected to the JRonn output file\r
324          * @return List of {@link AnnotatedSequence} objects\r
325          * @throws IOException\r
326          *             is thrown if the inStream has problems accessing the data\r
327          * @throws UnknownFileFormatException\r
328          *             is thrown if the inStream represents an unknown source of\r
329          * data, i.e. not a JRonn output\r
330          */\r
331         public static Map<String, Score> readJRonn(final InputStream inStream)\r
332                         throws IOException, UnknownFileFormatException {\r
333                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
334 \r
335                 final BufferedReader infasta = new BufferedReader(\r
336                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
337 \r
338                 String line;\r
339                 String sname = "";\r
340                 do {\r
341                         line = infasta.readLine();\r
342                         if (line == null || line.isEmpty()) {\r
343                                 // skip empty lines\r
344                                 continue;\r
345                         }\r
346                         if (line.startsWith(">")) {\r
347                                 // read name\r
348                                 sname = line.trim().substring(1);\r
349                                 // read sequence line\r
350                                 line = infasta.readLine();\r
351                                 final String sequence = line.replace("\t", "");\r
352                                 // read annotation line\r
353                                 line = infasta.readLine();\r
354                                 String[] annotValues = line.split("\t");\r
355                                 float[] annotation = convertToNumber(annotValues);\r
356                                 if (annotation.length != sequence.length()) {\r
357                                         throw new UnknownFileFormatException(\r
358                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
359                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
360                                 }\r
361                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
362                         }\r
363                 } while (line != null);\r
364 \r
365                 infasta.close();\r
366                 return seqs;\r
367         }\r
368         private static float[] convertToNumber(String[] annotValues)\r
369                         throws UnknownFileFormatException {\r
370                 float[] annotation = new float[annotValues.length];\r
371                 try {\r
372                         for (int i = 0; i < annotation.length; i++) {\r
373                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
374                         }\r
375                 } catch (NumberFormatException e) {\r
376                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
377                                         e.getCause());\r
378                 }\r
379                 return annotation;\r
380         }\r
381 \r
382         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
383                         + ">sequence_name\n "\r
384                         + "M    V       S\n"\r
385                         + "0.43 0.22    0.65\n"\r
386                         + "Where first line is the sequence name,\n"\r
387                         + "second line is the tab delimited sequence,\n"\r
388                         + "third line contains tab delimited disorder prediction values.\n"\r
389                         + "No lines are allowed between these three. Additionally, the number of  "\r
390                         + "sequence residues must be equal to the number of the disorder values.";\r
391 \r
392         /**\r
393          * Closes the Closable and logs the exception if any\r
394          * \r
395          * @param log\r
396          * @param stream\r
397          */\r
398         public final static void closeSilently(java.util.logging.Logger log,\r
399                         Closeable stream) {\r
400                 if (stream != null) {\r
401                         try {\r
402                                 stream.close();\r
403                         } catch (IOException e) {\r
404                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
405                         }\r
406                 }\r
407         }\r
408 \r
409         /**\r
410          * \r
411          > Foobar_dundeefriends\r
412          * \r
413          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
414          * \r
415          * # REM465 355-368\r
416          * \r
417          * # HOTLOOPS 190-204\r
418          * \r
419          * # RESIDUE COILS REM465 HOTLOOPS\r
420          * \r
421          * M 0.86010 0.88512 0.37094\r
422          * \r
423          * T 0.79983 0.85864 0.44331\r
424          * \r
425          * >Next Sequence name\r
426          * \r
427          * \r
428          * @param input\r
429          * @return\r
430          * @throws IOException\r
431          * @throws UnknownFileFormatException\r
432          */\r
433         public static Map<FastaSequence, Set<Score>> readDisembl(\r
434                         final InputStream input) throws IOException,\r
435                         UnknownFileFormatException {\r
436                 Scanner scan = new Scanner(input);\r
437                 scan.useDelimiter(">");\r
438                 if (!scan.hasNext()) {\r
439                         throw new UnknownFileFormatException(\r
440                                         "In Disembl score format each sequence score is expected "\r
441                                                         + "to start from the line: >Sequence name "\r
442                                                         + " No such line was found!");\r
443                 }\r
444 \r
445                 Map<FastaSequence, Set<Score>> results = new HashMap<FastaSequence, Set<Score>>();\r
446                 int seqCounter = 0;\r
447                 while (scan.hasNext()) {\r
448                         seqCounter++;\r
449                         String singleSeq = scan.next();\r
450                         Scanner scansingle = new Scanner(singleSeq);\r
451                         if (!scansingle.hasNextLine()) {\r
452                                 throw new RuntimeException(\r
453                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
454                         }\r
455 \r
456                         StringBuffer seqbuffer = new StringBuffer();\r
457                         ArrayList<Float> coils = new ArrayList<Float>();\r
458                         ArrayList<Float> rem = new ArrayList<Float>();\r
459                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
460 \r
461                         String sequenceName = scansingle.nextLine().trim();\r
462                         SortedSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
463                                         scansingle.nextLine());\r
464                         SortedSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
465                                         scansingle.nextLine());\r
466                         SortedSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
467                                         scansingle.nextLine());\r
468 \r
469                         String title = scansingle.nextLine();\r
470                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
471 \r
472                         while (scansingle.hasNext()) {\r
473                                 seqbuffer.append(scansingle.next());\r
474                                 coils.add(scansingle.nextFloat());\r
475                                 rem.add(scansingle.nextFloat());\r
476                                 hotloops.add(scansingle.nextFloat());\r
477                         }\r
478                         FastaSequence fs = new FastaSequence(sequenceName,\r
479                                         seqbuffer.toString());\r
480                         Set<Score> scores = new HashSet<Score>();\r
481                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
482                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
483                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
484                         results.put(fs, scores);\r
485 \r
486                         scansingle.close();\r
487                 }\r
488 \r
489                 input.close();\r
490                 return results;\r
491         }\r
492 \r
493         /**\r
494          * Parsing:\r
495          * \r
496          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
497          * 350-391, 429-485, 497-506, 539-547\r
498          * \r
499          * # REM465 355-368\r
500          * \r
501          * # HOTLOOPS 190-204\r
502          * \r
503          * @param lines\r
504          * @return\r
505          */\r
506         private static SortedSet<Range> parseRanges(Enum resultType, String lines) {\r
507                 SortedSet<Range> ranges = new TreeSet<Range>();\r
508                 Scanner scan = new Scanner(lines);\r
509 \r
510                 assert scan.hasNext();\r
511                 assert "#".equals(scan.next()); // pass delimiter #\r
512                 String type = scan.next(); // pass enum name e.g. COILS\r
513                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
514                                 + resultType.toString();\r
515 \r
516                 // beginning of the ranges\r
517                 scan.useDelimiter(",");\r
518                 while (scan.hasNext()) {\r
519                         String range = scan.next();\r
520                         if (!Util.isEmpty(range)) {\r
521                                 ranges.add(new Range(range.split("-")));\r
522                         }\r
523                 }\r
524                 return ranges;\r
525         }\r
526 \r
527         public static Map<String, Set<Score>> removeSequences(\r
528                         Map<FastaSequence, Set<Score>> disemblResults) {\r
529                 Map<String, Set<Score>> seqNameScores = new HashMap<String, Set<Score>>();\r
530                 for (Map.Entry<FastaSequence, Set<Score>> dres : disemblResults\r
531                                 .entrySet()) {\r
532                         seqNameScores.put(dres.getKey().getId(), dres.getValue());\r
533                 }\r
534                 return seqNameScores;\r
535         }\r
536 \r
537         /**\r
538          * \r
539          > Foobar_dundeefriends\r
540          * \r
541          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
542          * \r
543          * # REM465 355-368\r
544          * \r
545          * # HOTLOOPS 190-204\r
546          * \r
547          * # RESIDUE COILS REM465 HOTLOOPS\r
548          * \r
549          * M 0.86010 0.88512 0.37094\r
550          * \r
551          * T 0.79983 0.85864 0.44331\r
552          * \r
553          * >Next Sequence name\r
554          * \r
555          * \r
556          * @param input\r
557          * @return\r
558          * @throws IOException\r
559          * @throws UnknownFileFormatException\r
560          */\r
561         public static Map<FastaSequence, Set<Score>> readGlobPlot(\r
562                         final InputStream input) throws IOException,\r
563                         UnknownFileFormatException {\r
564                 Scanner scan = new Scanner(input);\r
565                 scan.useDelimiter(">");\r
566                 if (!scan.hasNext()) {\r
567                         throw new UnknownFileFormatException(\r
568                                         "In GlobPlot score format each sequence score is expected "\r
569                                                         + "to start from the line: >Sequence name "\r
570                                                         + " No such line was found!");\r
571                 }\r
572 \r
573                 Map<FastaSequence, Set<Score>> results = new HashMap<FastaSequence, Set<Score>>();\r
574                 int seqCounter = 0;\r
575                 while (scan.hasNext()) {\r
576                         seqCounter++;\r
577                         String singleSeq = scan.next();\r
578                         Scanner scansingle = new Scanner(singleSeq);\r
579                         if (!scansingle.hasNextLine()) {\r
580                                 throw new RuntimeException(\r
581                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
582                         }\r
583 \r
584                         StringBuffer seqbuffer = new StringBuffer();\r
585                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
586                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
587                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
588 \r
589                         String sequenceName = scansingle.nextLine().trim();\r
590                         SortedSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
591                                         scansingle.nextLine());\r
592                         SortedSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
593                                         scansingle.nextLine());\r
594 \r
595                         String title = scansingle.nextLine();\r
596                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
597 \r
598                         while (scansingle.hasNext()) {\r
599                                 seqbuffer.append(scansingle.next());\r
600                                 dydxScore.add(scansingle.nextFloat());\r
601                                 rawScore.add(scansingle.nextFloat());\r
602                                 smoothedScore.add(scansingle.nextFloat());\r
603                         }\r
604                         FastaSequence fs = new FastaSequence(sequenceName,\r
605                                         seqbuffer.toString());\r
606                         Set<Score> scores = new HashSet<Score>();\r
607                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
608                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
609                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
610                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
611                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
612                         results.put(fs, scores);\r
613 \r
614                         scansingle.close();\r
615                 }\r
616 \r
617                 input.close();\r
618                 return results;\r
619         }\r
620         /**\r
621          * Read AACon result with no alignment files. This method leaves incoming\r
622          * the InputStream results open!\r
623          * \r
624          * @param results\r
625          *            output file of AAConservation\r
626          * @return Map with keys {@link ConservationMethod} -> float[]\r
627          */\r
628         public static HashSet<Score> readAAConResults(InputStream results) {\r
629                 if (results == null) {\r
630                         throw new NullPointerException(\r
631                                         "InputStream with results must be provided");\r
632                 }\r
633                 HashSet<Score> annotations = new HashSet<Score>();\r
634                 Scanner sc = new Scanner(results);\r
635                 sc.useDelimiter("#");\r
636                 while (sc.hasNext()) {\r
637                         String line = sc.next();\r
638                         int spacePos = line.indexOf(" ");\r
639                         assert spacePos > 0 : "Space is expected as delimited between method "\r
640                                         + "name and values!";\r
641                         String methodLine = line.substring(0, spacePos);\r
642                         ConservationMethod method = ConservationMethod\r
643                                         .getMethod(methodLine);\r
644                         assert method != null : "Method " + methodLine\r
645                                         + " is not recognized! ";\r
646                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
647                         ArrayList<Float> values = new ArrayList<Float>();\r
648                         while (valuesScanner.hasNextDouble()) {\r
649                                 Double value = valuesScanner.nextDouble();\r
650                                 values.add(value.floatValue());\r
651                         }\r
652                         annotations.add(new Score(method, values));\r
653                 }\r
654                 return annotations;\r
655         }\r
656 \r
657         /**\r
658          * Reads and parses Fasta or Clustal formatted file into a list of\r
659          * FastaSequence objects\r
660          * \r
661          * @param inFilePath\r
662          *            the path to the input file\r
663          * @throws IOException\r
664          *             if the file denoted by inFilePath cannot be read\r
665          * @throws UnknownFileFormatException\r
666          *             if the inFilePath points to the file which format cannot be\r
667          *             recognised\r
668          * @return the List of FastaSequence objects\r
669          * \r
670          */\r
671         public static List<FastaSequence> openInputStream(String inFilePath)\r
672                         throws IOException, UnknownFileFormatException {\r
673 \r
674                 // This stream gets closed in isValidClustalFile method\r
675                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
676                 // This stream is closed in the calling methods\r
677                 InputStream inStr = new FileInputStream(inFilePath);\r
678                 List<FastaSequence> fastaSeqs = null;\r
679                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
680                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
681                         // alignment cannot be null see\r
682                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
683                         fastaSeqs = al.getSequences();\r
684                 } else {\r
685                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
686                 }\r
687                 return fastaSeqs;\r
688         }\r
689 \r
690 }\r
691 \r
/**
 * The result types found in the Disembl output. Each of them is reported both
 * as ranges and as scores.
 */
enum DisemblResult {
        COILS,
        REM465,
        HOTLOOPS
}
/**
 * The result types found in the GlobPlot output. GlobDoms and Disorder are
 * ranges without scores, the remaining ones are scores without ranges.
 */
enum GlobProtResult {
        /** Ranges only, no scores */
        GlobDoms,

        /** Ranges only, no scores */
        Disorder,

        /** Scores only, no ranges */
        Dydx,

        /** Scores only, no ranges */
        SmoothedScore,

        /** Scores only, no ranges */
        RawScore
}