new services are registered in wsbuild
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import compbio.util.Util;
39 \r
40 /**\r
41  * Utility class for operations on sequences\r
42  * \r
43  * @author Petr Troshin\r
44  * @version 1.0\r
45  */\r
46 public final class SequenceUtil {\r
47 \r
48         /**\r
49          * A whitespace character: [\t\n\x0B\f\r]\r
50          */\r
51         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
52 \r
53         /**\r
54          * A digit\r
55          */\r
56         public static final Pattern DIGIT = Pattern.compile("\\d");\r
57 \r
58         /**\r
59          * Non word\r
60          */\r
61         public static final Pattern NONWORD = Pattern.compile("\\W");\r
62 \r
63         /**\r
64          * Valid Amino acids\r
65          */\r
66         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
67                         Pattern.CASE_INSENSITIVE);\r
68 \r
69         /**\r
70          * inversion of AA pattern\r
71          */\r
72         public static final Pattern NON_AA = Pattern.compile(\r
73                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
74 \r
75         /**\r
76          * Same as AA pattern but with two additional letters - XU\r
77          */\r
78         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
79                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
80 \r
81         /**\r
82          * Nucleotides a, t, g, c, u\r
83          */\r
84         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
85                         Pattern.CASE_INSENSITIVE);\r
86 \r
87         /**\r
88          * Ambiguous nucleotide\r
89          */\r
90         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
91                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
92         /**\r
93          * Non nucleotide\r
94          */\r
95         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
96                         Pattern.CASE_INSENSITIVE);\r
97 \r
98         private SequenceUtil() {\r
99         } // utility class, no instantiation\r
100 \r
101         /*\r
102          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
103          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
104          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
105          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
106          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
107          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
108          * BufferedWriter fasta_out = new BufferedWriter( new\r
109          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
110          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
111          * SysPrefs.newlinechar); fasta_out.close(); }\r
112          */\r
113 \r
114         /**\r
115          * @return true is the sequence contains only letters a,c, t, g, u\r
116          */\r
117         public static boolean isNucleotideSequence(final FastaSequence s) {\r
118                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
119         }\r
120 \r
121         /**\r
122          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
123          * (!) - B char\r
124          */\r
125         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
126                 sequence = SequenceUtil.cleanSequence(sequence);\r
127                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
128                         return false;\r
129                 }\r
130                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
131                         return false;\r
132                         /*\r
133                          * System.out.format("I found the text starting at " +\r
134                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
135                          * nonDNAmatcher.end());\r
136                          */\r
137                 }\r
138                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
139                 return DNAmatcher.find();\r
140         }\r
141 \r
142         /**\r
143          * Removes all whitespace chars in the sequence string\r
144          * \r
145          * @param sequence\r
146          * @return cleaned up sequence\r
147          */\r
148         public static String cleanSequence(String sequence) {\r
149                 assert sequence != null;\r
150                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
151                 sequence = m.replaceAll("").toUpperCase();\r
152                 return sequence;\r
153         }\r
154 \r
155         /**\r
156          * Removes all special characters and digits as well as whitespace chars\r
157          * from the sequence\r
158          * \r
159          * @param sequence\r
160          * @return cleaned up sequence\r
161          */\r
162         public static String deepCleanSequence(String sequence) {\r
163                 sequence = SequenceUtil.cleanSequence(sequence);\r
164                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
165                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
166                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
167                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
168                 return sequence;\r
169         }\r
170 \r
171         /**\r
172          * @param sequence\r
173          * @return true is the sequence is a protein sequence, false overwise\r
174          */\r
175         public static boolean isProteinSequence(String sequence) {\r
176                 sequence = SequenceUtil.cleanSequence(sequence);\r
177                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
178                         return false;\r
179                 }\r
180                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
181                         return false;\r
182                 }\r
183                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
184                         return false;\r
185                 }\r
186                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
187                 return protmatcher.find();\r
188         }\r
189 \r
190         /**\r
191          * Check whether the sequence confirms to amboguous protein sequence\r
192          * \r
193          * @param sequence\r
194          * @return return true only if the sequence if ambiguous protein sequence\r
195          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
196          *         protein or DNA\r
197          */\r
198         public static boolean isAmbiguosProtein(String sequence) {\r
199                 sequence = SequenceUtil.cleanSequence(sequence);\r
200                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
201                         return false;\r
202                 }\r
203                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
204                         return false;\r
205                 }\r
206                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
207                         return false;\r
208                 }\r
209                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
210                         return false;\r
211                 }\r
212                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
213                 return amb_prot.find();\r
214         }\r
215 \r
216         /**\r
217          * Writes list of FastaSequeces into the outstream formatting the sequence\r
218          * so that it contains width chars on each line\r
219          * \r
220          * @param outstream\r
221          * @param sequences\r
222          * @param width\r
223          *            - the maximum number of characters to write in one line\r
224          * @throws IOException\r
225          */\r
226         public static void writeFasta(final OutputStream outstream,\r
227                         final List<FastaSequence> sequences, final int width)\r
228                         throws IOException {\r
229                 writeFastaKeepTheStream(outstream, sequences, width);\r
230                 outstream.close();\r
231         }\r
232 \r
233         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
234                         final List<FastaSequence> sequences, final int width)\r
235                         throws IOException {\r
236                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
237                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
238                 for (final FastaSequence fs : sequences) {\r
239                         fastawriter.write(">" + fs.getId() + "\n");\r
240                         fastawriter.write(fs.getFormatedSequence(width));\r
241                         fastawriter.write("\n");\r
242                 }\r
243                 fastawriter.flush();\r
244                 writer.flush();\r
245         }\r
246 \r
247         /**\r
248          * Reads fasta sequences from inStream into the list of FastaSequence\r
249          * objects\r
250          * \r
251          * @param inStream\r
252          *            from\r
253          * @return list of FastaSequence objects\r
254          * @throws IOException\r
255          */\r
256         public static List<FastaSequence> readFasta(final InputStream inStream)\r
257                         throws IOException {\r
258                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
259 \r
260                 final BufferedReader infasta = new BufferedReader(\r
261                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
262                 final Pattern pattern = Pattern.compile("//s+");\r
263 \r
264                 String line;\r
265                 String sname = "", seqstr = null;\r
266                 do {\r
267                         line = infasta.readLine();\r
268                         if ((line == null) || line.startsWith(">")) {\r
269                                 if (seqstr != null) {\r
270                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
271                                 }\r
272                                 sname = line; // remove >\r
273                                 seqstr = "";\r
274                         } else {\r
275                                 final String subseq = pattern.matcher(line).replaceAll("");\r
276                                 seqstr += subseq;\r
277                         }\r
278                 } while (line != null);\r
279 \r
280                 infasta.close();\r
281                 return seqs;\r
282         }\r
283 \r
284         /**\r
285          * Writes FastaSequence in the file, each sequence will take one line only\r
286          * \r
287          * @param os\r
288          * @param sequences\r
289          * @throws IOException\r
290          */\r
291         public static void writeFasta(final OutputStream os,\r
292                         final List<FastaSequence> sequences) throws IOException {\r
293                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
294                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
295                 for (final FastaSequence fs : sequences) {\r
296                         fasta_out.write(fs.getOnelineFasta());\r
297                 }\r
298                 fasta_out.close();\r
299                 outWriter.close();\r
300         }\r
301 \r
302         public static Map<String, Score> readJRonn(final File result)\r
303                         throws IOException, UnknownFileFormatException {\r
304                 InputStream input = new FileInputStream(result);\r
305                 Map<String, Score> sequences = readJRonn(input);\r
306                 input.close();\r
307                 return sequences;\r
308         }\r
309 \r
310         /**\r
311          * Reader for JRonn horizontal file format\r
312          * \r
313          * <pre>\r
314          * &gtFoobar M G D T T A G 0.48 0.42\r
315          * 0.42 0.48 0.52 0.53 0.54\r
316          * \r
317          * <pre>\r
318          * Where all values are tab delimited\r
319          * \r
320          * @param inStream\r
321          *            the InputStream connected to the JRonn output file\r
322          * @return List of {@link AnnotatedSequence} objects\r
323          * @throws IOException\r
324          *             is thrown if the inStream has problems accessing the data\r
325          * @throws UnknownFileFormatException\r
326          *             is thrown if the inStream represents an unknown source of\r
327          * data, i.e. not a JRonn output\r
328          */\r
329         public static Map<String, Score> readJRonn(final InputStream inStream)\r
330                         throws IOException, UnknownFileFormatException {\r
331                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
332 \r
333                 final BufferedReader infasta = new BufferedReader(\r
334                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
335 \r
336                 String line;\r
337                 String sname = "";\r
338                 do {\r
339                         line = infasta.readLine();\r
340                         if (line == null || line.isEmpty()) {\r
341                                 // skip empty lines\r
342                                 continue;\r
343                         }\r
344                         if (line.startsWith(">")) {\r
345                                 // read name\r
346                                 sname = line.trim().substring(1);\r
347                                 // read sequence line\r
348                                 line = infasta.readLine();\r
349                                 final String sequence = line.replace("\t", "");\r
350                                 // read annotation line\r
351                                 line = infasta.readLine();\r
352                                 String[] annotValues = line.split("\t");\r
353                                 float[] annotation = convertToNumber(annotValues);\r
354                                 if (annotation.length != sequence.length()) {\r
355                                         throw new UnknownFileFormatException(\r
356                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
357                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
358                                 }\r
359                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
360                         }\r
361                 } while (line != null);\r
362 \r
363                 infasta.close();\r
364                 return seqs;\r
365         }\r
366 \r
367         private static float[] convertToNumber(String[] annotValues)\r
368                         throws UnknownFileFormatException {\r
369                 float[] annotation = new float[annotValues.length];\r
370                 try {\r
371                         for (int i = 0; i < annotation.length; i++) {\r
372                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
373                         }\r
374                 } catch (NumberFormatException e) {\r
375                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
376                                         e.getCause());\r
377                 }\r
378                 return annotation;\r
379         }\r
380 \r
381         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
382                         + ">sequence_name\n "\r
383                         + "M    V       S\n"\r
384                         + "0.43 0.22    0.65\n"\r
385                         + "Where first line is the sequence name,\n"\r
386                         + "second line is the tab delimited sequence,\n"\r
387                         + "third line contains tab delimited disorder prediction values.\n"\r
388                         + "No lines are allowed between these three. Additionally, the number of  "\r
389                         + "sequence residues must be equal to the number of the disorder values.";\r
390 \r
391         /**\r
392          * Closes the Closable and logs the exception if any\r
393          * \r
394          * @param log\r
395          * @param stream\r
396          */\r
397         public final static void closeSilently(java.util.logging.Logger log,\r
398                         Closeable stream) {\r
399                 if (stream != null) {\r
400                         try {\r
401                                 stream.close();\r
402                         } catch (IOException e) {\r
403                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
404                         }\r
405                 }\r
406         }\r
407 \r
408         /**\r
409          * \r
410          > Foobar_dundeefriends\r
411          * \r
412          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
413          * \r
414          * # REM465 355-368\r
415          * \r
416          * # HOTLOOPS 190-204\r
417          * \r
418          * # RESIDUE COILS REM465 HOTLOOPS\r
419          * \r
420          * M 0.86010 0.88512 0.37094\r
421          * \r
422          * T 0.79983 0.85864 0.44331\r
423          * \r
424          * >Next Sequence name\r
425          * \r
426          * \r
427          * @param input\r
428          * @return\r
429          * @throws IOException\r
430          * @throws UnknownFileFormatException\r
431          */\r
432         public static HashMap<FastaSequence, HashSet<Score>> readDisembl(\r
433                         final InputStream input) throws IOException,\r
434                         UnknownFileFormatException {\r
435                 Scanner scan = new Scanner(input);\r
436                 scan.useDelimiter(">");\r
437                 if (!scan.hasNext()) {\r
438                         throw new UnknownFileFormatException(\r
439                                         "In Disembl score format each sequence score is expected "\r
440                                                         + "to start from the line: >Sequence name "\r
441                                                         + " No such line was found!");\r
442                 }\r
443 \r
444                 HashMap<FastaSequence, HashSet<Score>> results = new HashMap<FastaSequence, HashSet<Score>>();\r
445                 int seqCounter = 0;\r
446                 while (scan.hasNext()) {\r
447                         seqCounter++;\r
448                         String singleSeq = scan.next();\r
449                         Scanner scansingle = new Scanner(singleSeq);\r
450                         if (!scansingle.hasNextLine()) {\r
451                                 throw new RuntimeException(\r
452                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
453                         }\r
454 \r
455                         StringBuffer seqbuffer = new StringBuffer();\r
456                         ArrayList<Float> coils = new ArrayList<Float>();\r
457                         ArrayList<Float> rem = new ArrayList<Float>();\r
458                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
459 \r
460                         String sequenceName = scansingle.nextLine().trim();\r
461                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
462                                         scansingle.nextLine());\r
463                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
464                                         scansingle.nextLine());\r
465                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
466                                         scansingle.nextLine());\r
467 \r
468                         String title = scansingle.nextLine();\r
469                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
470 \r
471                         while (scansingle.hasNext()) {\r
472                                 seqbuffer.append(scansingle.next());\r
473                                 coils.add(scansingle.nextFloat());\r
474                                 rem.add(scansingle.nextFloat());\r
475                                 hotloops.add(scansingle.nextFloat());\r
476                         }\r
477                         FastaSequence fs = new FastaSequence(sequenceName,\r
478                                         seqbuffer.toString());\r
479                         HashSet<Score> scores = new HashSet<Score>();\r
480                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
481                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
482                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
483                         results.put(fs, scores);\r
484 \r
485                         scansingle.close();\r
486                 }\r
487                 scan.close();\r
488                 input.close();\r
489                 return results;\r
490         }\r
491 \r
492         /**\r
493          * Parsing:\r
494          * \r
495          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
496          * 350-391, 429-485, 497-506, 539-547\r
497          * \r
498          * # REM465 355-368\r
499          * \r
500          * # HOTLOOPS 190-204\r
501          * \r
502          * @param lines\r
503          * @return\r
504          */\r
505         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
506                 TreeSet<Range> ranges = new TreeSet<Range>();\r
507 \r
508                 Scanner scan = new Scanner(lines);\r
509 \r
510                 assert scan.hasNext();\r
511                 String del = scan.next();\r
512                 assert "#".equals(del); // pass delimiter #\r
513                 String type = scan.next(); // pass enum name e.g. COILS\r
514                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
515                                 + resultType.toString();\r
516 \r
517                 // beginning of the ranges\r
518                 scan.useDelimiter(",");\r
519                 while (scan.hasNext()) {\r
520                         String range = scan.next();\r
521                         if (!Util.isEmpty(range)) {\r
522                                 ranges.add(new Range(range.split("-")));\r
523                         }\r
524                 }\r
525                 return ranges;\r
526         }\r
527 \r
528         public static HashMap<String, HashSet<Score>> removeSequences(\r
529                         HashMap<FastaSequence, HashSet<Score>> disemblResults) {\r
530                 HashMap<String, HashSet<Score>> seqNameScores = new HashMap<String, HashSet<Score>>();\r
531                 for (Map.Entry<FastaSequence, HashSet<Score>> dres : disemblResults\r
532                                 .entrySet()) {\r
533                         seqNameScores.put(dres.getKey().getId(), dres.getValue());\r
534                 }\r
535                 return seqNameScores;\r
536         }\r
537 \r
538         /**\r
539          * \r
540          > Foobar_dundeefriends\r
541          * \r
542          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
543          * \r
544          * # REM465 355-368\r
545          * \r
546          * # HOTLOOPS 190-204\r
547          * \r
548          * # RESIDUE COILS REM465 HOTLOOPS\r
549          * \r
550          * M 0.86010 0.88512 0.37094\r
551          * \r
552          * T 0.79983 0.85864 0.44331\r
553          * \r
554          * >Next Sequence name\r
555          * \r
556          * \r
557          * @param input\r
558          * @return\r
559          * @throws IOException\r
560          * @throws UnknownFileFormatException\r
561          */\r
562         public static HashMap<FastaSequence, HashSet<Score>> readGlobPlot(\r
563                         final InputStream input) throws IOException,\r
564                         UnknownFileFormatException {\r
565                 Scanner scan = new Scanner(input);\r
566                 scan.useDelimiter(">");\r
567                 if (!scan.hasNext()) {\r
568                         throw new UnknownFileFormatException(\r
569                                         "In GlobPlot score format each sequence score is expected "\r
570                                                         + "to start from the line: >Sequence name "\r
571                                                         + " No such line was found!");\r
572                 }\r
573 \r
574                 HashMap<FastaSequence, HashSet<Score>> results = new HashMap<FastaSequence, HashSet<Score>>();\r
575                 int seqCounter = 0;\r
576                 while (scan.hasNext()) {\r
577                         seqCounter++;\r
578                         String singleSeq = scan.next();\r
579                         Scanner scansingle = new Scanner(singleSeq);\r
580                         if (!scansingle.hasNextLine()) {\r
581                                 throw new RuntimeException(\r
582                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
583                         }\r
584 \r
585                         StringBuffer seqbuffer = new StringBuffer();\r
586                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
587                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
588                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
589 \r
590                         String sequenceName = scansingle.nextLine().trim();\r
591                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
592                                         scansingle.nextLine());\r
593                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
594                                         scansingle.nextLine());\r
595 \r
596                         String title = scansingle.nextLine();\r
597                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
598 \r
599                         while (scansingle.hasNext()) {\r
600                                 seqbuffer.append(scansingle.next());\r
601                                 dydxScore.add(scansingle.nextFloat());\r
602                                 rawScore.add(scansingle.nextFloat());\r
603                                 smoothedScore.add(scansingle.nextFloat());\r
604                         }\r
605                         FastaSequence fs = new FastaSequence(sequenceName,\r
606                                         seqbuffer.toString());\r
607                         HashSet<Score> scores = new HashSet<Score>();\r
608                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
609                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
610                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
611                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
612                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
613                         results.put(fs, scores);\r
614 \r
615                         scansingle.close();\r
616                 }\r
617                 scan.close();\r
618                 input.close();\r
619                 return results;\r
620         }\r
621         /**\r
622          * Read AACon result with no alignment files. This method leaves incoming\r
623          * the InputStream results open!\r
624          * \r
625          * @param results\r
626          *            output file of AAConservation\r
627          * @return Map with keys {@link ConservationMethod} -> float[]\r
628          */\r
629         public static HashSet<Score> readAAConResults(InputStream results) {\r
630                 if (results == null) {\r
631                         throw new NullPointerException(\r
632                                         "InputStream with results must be provided");\r
633                 }\r
634                 HashSet<Score> annotations = new HashSet<Score>();\r
635                 Scanner sc = new Scanner(results);\r
636                 sc.useDelimiter("#");\r
637                 while (sc.hasNext()) {\r
638                         String line = sc.next();\r
639                         int spacePos = line.indexOf(" ");\r
640                         assert spacePos > 0 : "Space is expected as delimited between method "\r
641                                         + "name and values!";\r
642                         String methodLine = line.substring(0, spacePos);\r
643                         ConservationMethod method = ConservationMethod\r
644                                         .getMethod(methodLine);\r
645                         assert method != null : "Method " + methodLine\r
646                                         + " is not recognized! ";\r
647                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
648                         ArrayList<Float> values = new ArrayList<Float>();\r
649                         while (valuesScanner.hasNextDouble()) {\r
650                                 Double value = valuesScanner.nextDouble();\r
651                                 values.add(value.floatValue());\r
652                         }\r
653                         annotations.add(new Score(method, values));\r
654                 }\r
655                 return annotations;\r
656         }\r
657 \r
658         /**\r
659          * Reads and parses Fasta or Clustal formatted file into a list of\r
660          * FastaSequence objects\r
661          * \r
662          * @param inFilePath\r
663          *            the path to the input file\r
664          * @throws IOException\r
665          *             if the file denoted by inFilePath cannot be read\r
666          * @throws UnknownFileFormatException\r
667          *             if the inFilePath points to the file which format cannot be\r
668          *             recognised\r
669          * @return the List of FastaSequence objects\r
670          * \r
671          */\r
672         public static List<FastaSequence> openInputStream(String inFilePath)\r
673                         throws IOException, UnknownFileFormatException {\r
674 \r
675                 // This stream gets closed in isValidClustalFile method\r
676                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
677                 // This stream is closed in the calling methods\r
678                 InputStream inStr = new FileInputStream(inFilePath);\r
679                 List<FastaSequence> fastaSeqs = null;\r
680                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
681                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
682                         // alignment cannot be null see\r
683                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
684                         fastaSeqs = al.getSequences();\r
685                 } else {\r
686                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
687                 }\r
688                 return fastaSeqs;\r
689         }\r
690 \r
691 }\r
692 \r
/**
 * Types of the results reported by Disembl. The Disembl reader of
 * SequenceUtil creates a score with both the values and the ranges for each
 * of them.
 */
enum DisemblResult {
        /** Values and ranges of the COILS predictor */
        COILS,
        /** Values and ranges of the REM465 predictor */
        REM465,
        /** Values and ranges of the HOTLOOPS predictor */
        HOTLOOPS
}
/**
 * Types of the results reported by GlobPlot. Two of them are ranges without
 * scores, the other three are scores without ranges.
 */
enum GlobProtResult {
        /** Ranges only, no scores */
        GlobDoms,
        /** Ranges only, no scores */
        Disorder,
        /** Scores only, no ranges */
        Dydx,
        /** Scores only, no ranges */
        SmoothedScore,
        /** Scores only, no ranges */
        RawScore
}