1c6d2e8ab9e81789291710facecfc65258ecd64b
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /*\r
2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
3  * Jalview Web Services version: 2.0 This library is free software; you can\r
4  * redistribute it and/or modify it under the terms of the Apache License\r
5  * version 2 as published by the Apache Software Foundation This library is\r
6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
9  * license is in apache_license.txt. It is also available here: see:\r
10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
11  * work distributed in source code form must include this copyright and license\r
12  * notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import compbio.util.Util;
40 \r
41 /**\r
42  * Utility class for operations on sequences\r
43  * \r
44  * @author Petr Troshin\r
45  * @version 1.0\r
46  */\r
47 public final class SequenceUtil {\r
48 \r
        /**
         * A single whitespace character, i.e. the regex class \s:
         * [ \t\n\x0B\f\r]
         */
        public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

        /**
         * A single decimal digit (regex class \d)
         */
        public static final Pattern DIGIT = Pattern.compile("\\d");

        /**
         * A single non-word character (regex class \W), i.e. anything other
         * than [a-zA-Z_0-9]
         */
        public static final Pattern NONWORD = Pattern.compile("\\W");

        /**
         * One or more valid amino acid one-letter codes, case insensitive
         */
        public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Inversion of the AA pattern: one or more characters which are not
         * valid amino acid codes. Note that X and U match this pattern.
         */
        public static final Pattern NON_AA = Pattern.compile(
                        "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);

        /**
         * Same as the AA pattern but with two additional letters - X and U
         */
        public static final Pattern AMBIGUOUS_AA = Pattern.compile(
                        "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);

        /**
         * One or more nucleotides a, t, g, c, u, case insensitive
         */
        public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * One or more nucleotide codes including the ambiguity codes, case
         * insensitive
         */
        public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
                        "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
        /**
         * Inversion of the NUCLEOTIDE pattern: one or more characters other
         * than a, t, g, c, u
         */
        public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
                        Pattern.CASE_INSENSITIVE);
98 \r
        /**
         * Private constructor: the class is a holder for static constants and
         * static helper methods and is not meant to be instantiated.
         */
        private SequenceUtil() {
        } // utility class, no instantiation
101 \r
102         /*\r
103          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
104          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
105          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
106          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
107          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
108          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
109          * BufferedWriter fasta_out = new BufferedWriter( new\r
110          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
111          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
112          * SysPrefs.newlinechar); fasta_out.close(); }\r
113          */\r
114 \r
115         /**\r
116          * @return true is the sequence contains only letters a,c, t, g, u\r
117          */\r
118         public static boolean isNucleotideSequence(final FastaSequence s) {\r
119                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
120         }\r
121 \r
122         /**\r
123          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
124          * (!) - B char\r
125          */\r
126         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
127                 sequence = SequenceUtil.cleanSequence(sequence);\r
128                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
129                         return false;\r
130                 }\r
131                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
132                         return false;\r
133                         /*\r
134                          * System.out.format("I found the text starting at " +\r
135                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
136                          * nonDNAmatcher.end());\r
137                          */\r
138                 }\r
139                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
140                 return DNAmatcher.find();\r
141         }\r
142 \r
143         /**\r
144          * Removes all whitespace chars in the sequence string\r
145          * \r
146          * @param sequence\r
147          * @return cleaned up sequence\r
148          */\r
149         public static String cleanSequence(String sequence) {\r
150                 assert sequence != null;\r
151                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
152                 sequence = m.replaceAll("").toUpperCase();\r
153                 return sequence;\r
154         }\r
155 \r
156         /**\r
157          * Removes all special characters and digits as well as whitespace chars\r
158          * from the sequence\r
159          * \r
160          * @param sequence\r
161          * @return cleaned up sequence\r
162          */\r
163         public static String deepCleanSequence(String sequence) {\r
164                 sequence = SequenceUtil.cleanSequence(sequence);\r
165                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
166                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
167                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
168                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
169                 return sequence;\r
170         }\r
171 \r
172         /**\r
173          * Remove all non AA chars from the sequence\r
174          * \r
175          * @param sequence\r
176          *            the sequence to clean\r
177          * @return cleaned sequence\r
178          */\r
179         public static String cleanProteinSequence(String sequence) {\r
180                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
181         }\r
182 \r
183         /**\r
184          * @param sequence\r
185          * @return true is the sequence is a protein sequence, false overwise\r
186          */\r
187         public static boolean isProteinSequence(String sequence) {\r
188                 sequence = SequenceUtil.cleanSequence(sequence);\r
189                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
190                         return false;\r
191                 }\r
192                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
193                         return false;\r
194                 }\r
195                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
196                         return false;\r
197                 }\r
198                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
199                 return protmatcher.find();\r
200         }\r
201 \r
202         /**\r
203          * Check whether the sequence confirms to amboguous protein sequence\r
204          * \r
205          * @param sequence\r
206          * @return return true only if the sequence if ambiguous protein sequence\r
207          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
208          *         protein or DNA\r
209          */\r
210         public static boolean isAmbiguosProtein(String sequence) {\r
211                 sequence = SequenceUtil.cleanSequence(sequence);\r
212                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
213                         return false;\r
214                 }\r
215                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
216                         return false;\r
217                 }\r
218                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
219                         return false;\r
220                 }\r
221                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
222                         return false;\r
223                 }\r
224                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
225                 return amb_prot.find();\r
226         }\r
227 \r
228         /**\r
229          * Writes list of FastaSequeces into the outstream formatting the sequence\r
230          * so that it contains width chars on each line\r
231          * \r
232          * @param outstream\r
233          * @param sequences\r
234          * @param width\r
235          *            - the maximum number of characters to write in one line\r
236          * @throws IOException\r
237          */\r
238         public static void writeFasta(final OutputStream outstream,\r
239                         final List<FastaSequence> sequences, final int width)\r
240                         throws IOException {\r
241                 writeFastaKeepTheStream(outstream, sequences, width);\r
242                 outstream.close();\r
243         }\r
244 \r
245         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
246                         final List<FastaSequence> sequences, final int width)\r
247                         throws IOException {\r
248                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
249                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
250                 for (final FastaSequence fs : sequences) {\r
251                         fastawriter.write(">" + fs.getId() + "\n");\r
252                         fastawriter.write(fs.getFormatedSequence(width));\r
253                         fastawriter.write("\n");\r
254                 }\r
255                 fastawriter.flush();\r
256                 writer.flush();\r
257         }\r
258 \r
259         /**\r
260          * Reads fasta sequences from inStream into the list of FastaSequence\r
261          * objects\r
262          * \r
263          * @param inStream\r
264          *            from\r
265          * @return list of FastaSequence objects\r
266          * @throws IOException\r
267          */\r
268         public static List<FastaSequence> readFasta(final InputStream inStream)\r
269                         throws IOException {\r
270                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
271 \r
272                 final BufferedReader infasta = new BufferedReader(\r
273                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
274                 final Pattern pattern = Pattern.compile("//s+");\r
275 \r
276                 String line;\r
277                 String sname = "", seqstr = null;\r
278                 do {\r
279                         line = infasta.readLine();\r
280                         if ((line == null) || line.startsWith(">")) {\r
281                                 if (seqstr != null) {\r
282                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
283                                 }\r
284                                 sname = line; // remove >\r
285                                 seqstr = "";\r
286                         } else {\r
287                                 final String subseq = pattern.matcher(line).replaceAll("");\r
288                                 seqstr += subseq;\r
289                         }\r
290                 } while (line != null);\r
291 \r
292                 infasta.close();\r
293                 return seqs;\r
294         }\r
295 \r
296         /**\r
297          * Writes FastaSequence in the file, each sequence will take one line only\r
298          * \r
299          * @param os\r
300          * @param sequences\r
301          * @throws IOException\r
302          */\r
303         public static void writeFasta(final OutputStream os,\r
304                         final List<FastaSequence> sequences) throws IOException {\r
305                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
306                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
307                 for (final FastaSequence fs : sequences) {\r
308                         fasta_out.write(fs.getOnelineFasta());\r
309                 }\r
310                 fasta_out.close();\r
311                 outWriter.close();\r
312         }\r
313 \r
314         /**\r
315          * Read IUPred output\r
316          * \r
317          * @param result\r
318          * @return\r
319          * @throws IOException\r
320          * @throws UnknownFileFormatException\r
321          */\r
322         public static Map<String, Score> readIUPred(final File result)\r
323                         throws IOException, UnknownFileFormatException {\r
324                 InputStream input = new FileInputStream(result);\r
325                 Map<String, Score> sequences = readIUPred(input,\r
326                                 IUPredResult.getType(result));\r
327                 input.close();\r
328                 return sequences;\r
329         }\r
330 \r
331         // Check the type of the file e.g. long| short or domain\r
332         // and read\r
333         /**\r
334          * ## Long Disorder\r
335          * \r
336          * # P53_HUMAN\r
337          * \r
338          * 1 M 0.9943\r
339          * \r
340          * 2 E 0.9917\r
341          * \r
342          * 3 E 0.9879\r
343          * \r
344          * (every line)\r
345          * \r
346          * @throws IOException\r
347          * @throws UnknownFileFormatException\r
348          * \r
349          * \r
350          */\r
351         private static Map<String, Score> readIUPred(InputStream input,\r
352                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
353 \r
354                 Score score = null;\r
355                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
356                 Scanner scan = new Scanner(input);\r
357                 scan.useDelimiter("#");\r
358                 while (scan.hasNext()) {\r
359                         String nextEntry = scan.next();\r
360                         Scanner entry = new Scanner(nextEntry);\r
361                         String name = entry.nextLine().trim();\r
362                         // inside entry:\r
363                         if (IUPredResult.Glob == type) {\r
364                                 // parse domains\r
365                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
366                                 score = new Score(type, ranges);\r
367                         } else {\r
368                                 // parse short | long\r
369                                 float[] scores = parseIUPredScores(entry);\r
370                                 score = new Score(type, scores);\r
371                         }\r
372                         entry.close();\r
373                         seqs.put(name, score);\r
374                 }\r
375 \r
376                 scan.close();\r
377                 return seqs;\r
378         }\r
379 \r
380         /**\r
381          * # P53_HUMA\r
382          * \r
383          * Number of globular domains: 2\r
384          * \r
385          * globular domain 1. 98 - 269\r
386          * \r
387          * globular domain 2. 431 - 482\r
388          * \r
389          * >P53_HUMA\r
390          * \r
391          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
392          * \r
393          * @param scan\r
394          */\r
395         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
396                 String header = "Number of globular domains:";\r
397                 String domainPref = "globular domain";\r
398                 TreeSet<Range> ranges = new TreeSet<Range>();\r
399                 String line = scan.nextLine().trim();\r
400                 assert line.startsWith(header);\r
401                 line = line.substring(header.length()).trim();\r
402                 int domainNum = Integer.parseInt(line);\r
403                 if (domainNum == 0) {\r
404                         return ranges;\r
405                 }\r
406 \r
407                 for (int i = 0; i < domainNum; i++) {\r
408                         assert scan.hasNextLine();\r
409                         line = scan.nextLine();\r
410                         assert line.trim().startsWith(domainPref);\r
411                         line = line.substring(line.indexOf(".") + 1).trim();\r
412                         Range r = new Range(line.split("-"));\r
413                         ranges.add(r);\r
414                 }\r
415 \r
416                 return ranges;\r
417         }\r
418         /*\r
419          * 1 M 0.9943\r
420          * \r
421          * 2 E 0.9917\r
422          */\r
423         private static float[] parseIUPredScores(Scanner scan)\r
424                         throws UnknownFileFormatException {\r
425                 List<String> annotation = new ArrayList<String>();\r
426                 while (scan.hasNextLine()) {\r
427                         String line = scan.nextLine().trim();\r
428                         String[] val = line.split("\\s+");\r
429                         annotation.add(val[2]);\r
430                 }\r
431                 return convertToNumber(annotation\r
432                                 .toArray(new String[annotation.size()]));\r
433         }\r
434 \r
435         public static Map<String, Score> readJRonn(final File result)\r
436                         throws IOException, UnknownFileFormatException {\r
437                 InputStream input = new FileInputStream(result);\r
438                 Map<String, Score> sequences = readJRonn(input);\r
439                 input.close();\r
440                 return sequences;\r
441         }\r
442 \r
443         /**\r
444          * Reader for JRonn horizontal file format\r
445          * \r
446          * <pre>\r
447          * &gtFoobar M G D T T A G 0.48 0.42\r
448          * 0.42 0.48 0.52 0.53 0.54\r
449          * \r
450          * <pre>\r
451          * Where all values are tab delimited\r
452          * \r
453          * @param inStream\r
454          *            the InputStream connected to the JRonn output file\r
455          * @return List of {@link AnnotatedSequence} objects\r
456          * @throws IOException\r
457          *             is thrown if the inStream has problems accessing the data\r
458          * @throws UnknownFileFormatException\r
459          *             is thrown if the inStream represents an unknown source of\r
460          * data, i.e. not a JRonn output\r
461          */\r
462         public static Map<String, Score> readJRonn(final InputStream inStream)\r
463                         throws IOException, UnknownFileFormatException {\r
464                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
465 \r
466                 final BufferedReader infasta = new BufferedReader(\r
467                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
468 \r
469                 String line;\r
470                 String sname = "";\r
471                 do {\r
472                         line = infasta.readLine();\r
473                         if (line == null || line.isEmpty()) {\r
474                                 // skip empty lines\r
475                                 continue;\r
476                         }\r
477                         if (line.startsWith(">")) {\r
478                                 // read name\r
479                                 sname = line.trim().substring(1);\r
480                                 // read sequence line\r
481                                 line = infasta.readLine();\r
482                                 final String sequence = line.replace("\t", "");\r
483                                 // read annotation line\r
484                                 line = infasta.readLine();\r
485                                 String[] annotValues = line.split("\t");\r
486                                 float[] annotation = convertToNumber(annotValues);\r
487                                 if (annotation.length != sequence.length()) {\r
488                                         throw new UnknownFileFormatException(\r
489                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
490                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
491                                 }\r
492                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
493                         }\r
494                 } while (line != null);\r
495 \r
496                 infasta.close();\r
497                 return seqs;\r
498         }\r
499 \r
500         private static float[] convertToNumber(String[] annotValues)\r
501                         throws UnknownFileFormatException {\r
502                 float[] annotation = new float[annotValues.length];\r
503                 try {\r
504                         for (int i = 0; i < annotation.length; i++) {\r
505                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
506                         }\r
507                 } catch (NumberFormatException e) {\r
508                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
509                                         e.getCause());\r
510                 }\r
511                 return annotation;\r
512         }\r
513 \r
        /**
         * Description of the expected JRonn horizontal format. It is used as the
         * message, or as a part of it, of the UnknownFileFormatException thrown
         * by readJRonn and convertToNumber.
         */
        private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
                        + ">sequence_name\n "
                        + "M    V       S\n"
                        + "0.43 0.22    0.65\n"
                        + "Where first line is the sequence name,\n"
                        + "second line is the tab delimited sequence,\n"
                        + "third line contains tab delimited disorder prediction values.\n"
                        + "No lines are allowed between these three. Additionally, the number of  "
                        + "sequence residues must be equal to the number of the disorder values.";
523 \r
524         /**\r
525          * Closes the Closable and logs the exception if any\r
526          * \r
527          * @param log\r
528          * @param stream\r
529          */\r
530         public final static void closeSilently(java.util.logging.Logger log,\r
531                         Closeable stream) {\r
532                 if (stream != null) {\r
533                         try {\r
534                                 stream.close();\r
535                         } catch (IOException e) {\r
536                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
537                         }\r
538                 }\r
539         }\r
540 \r
541         /**\r
542          * \r
543          > Foobar_dundeefriends\r
544          * \r
545          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
546          * \r
547          * # REM465 355-368\r
548          * \r
549          * # HOTLOOPS 190-204\r
550          * \r
551          * # RESIDUE COILS REM465 HOTLOOPS\r
552          * \r
553          * M 0.86010 0.88512 0.37094\r
554          * \r
555          * T 0.79983 0.85864 0.44331\r
556          * \r
557          * >Next Sequence name\r
558          * \r
559          * \r
560          * @param input\r
561          * @return\r
562          * @throws IOException\r
563          * @throws UnknownFileFormatException\r
564          */\r
565         public static HashMap<String, Set<Score>> readDisembl(\r
566                         final InputStream input) throws IOException,\r
567                         UnknownFileFormatException {\r
568                 Scanner scan = new Scanner(input);\r
569                 scan.useDelimiter(">");\r
570                 if (!scan.hasNext()) {\r
571                         throw new UnknownFileFormatException(\r
572                                         "In Disembl score format each sequence score is expected "\r
573                                                         + "to start from the line: >Sequence name "\r
574                                                         + " No such line was found!");\r
575                 }\r
576 \r
577                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
578                 int seqCounter = 0;\r
579                 while (scan.hasNext()) {\r
580                         seqCounter++;\r
581                         String singleSeq = scan.next();\r
582                         Scanner scansingle = new Scanner(singleSeq);\r
583                         if (!scansingle.hasNextLine()) {\r
584                                 throw new RuntimeException(\r
585                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
586                         }\r
587 \r
588                         StringBuffer seqbuffer = new StringBuffer();\r
589                         ArrayList<Float> coils = new ArrayList<Float>();\r
590                         ArrayList<Float> rem = new ArrayList<Float>();\r
591                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
592 \r
593                         String sequenceName = scansingle.nextLine().trim();\r
594                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
595                                         scansingle.nextLine());\r
596                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
597                                         scansingle.nextLine());\r
598                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
599                                         scansingle.nextLine());\r
600 \r
601                         String title = scansingle.nextLine();\r
602                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
603 \r
604                         while (scansingle.hasNext()) {\r
605                                 seqbuffer.append(scansingle.next());\r
606                                 coils.add(scansingle.nextFloat());\r
607                                 rem.add(scansingle.nextFloat());\r
608                                 hotloops.add(scansingle.nextFloat());\r
609                         }\r
610                         /*\r
611                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
612                          * seqbuffer.toString());\r
613                          */\r
614                         HashSet<Score> scores = new HashSet<Score>();\r
615                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
616                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
617                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
618                         results.put(sequenceName, scores);\r
619 \r
620                         scansingle.close();\r
621                 }\r
622                 scan.close();\r
623                 input.close();\r
624                 return results;\r
625         }\r
626 \r
627         /**\r
628          * Parsing:\r
629          * \r
630          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
631          * 350-391, 429-485, 497-506, 539-547\r
632          * \r
633          * # REM465 355-368\r
634          * \r
635          * # HOTLOOPS 190-204\r
636          * \r
637          * @param lines\r
638          * @return\r
639          */\r
640         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
641                 TreeSet<Range> ranges = new TreeSet<Range>();\r
642 \r
643                 Scanner scan = new Scanner(lines);\r
644 \r
645                 assert scan.hasNext();\r
646                 String del = scan.next();\r
647                 assert "#".equals(del); // pass delimiter #\r
648                 String type = scan.next(); // pass enum name e.g. COILS\r
649                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
650                                 + resultType.toString();\r
651 \r
652                 // beginning of the ranges\r
653                 scan.useDelimiter(",");\r
654                 while (scan.hasNext()) {\r
655                         String range = scan.next();\r
656                         if (!Util.isEmpty(range)) {\r
657                                 ranges.add(new Range(range.split("-")));\r
658                         }\r
659                 }\r
660                 return ranges;\r
661         }\r
662 \r
663         /**\r
664          * \r
665          > Foobar_dundeefriends\r
666          * \r
667          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
668          * \r
669          * # REM465 355-368\r
670          * \r
671          * # HOTLOOPS 190-204\r
672          * \r
673          * # RESIDUE COILS REM465 HOTLOOPS\r
674          * \r
675          * M 0.86010 0.88512 0.37094\r
676          * \r
677          * T 0.79983 0.85864 0.44331\r
678          * \r
679          * >Next Sequence name\r
680          * \r
681          * \r
682          * @param input\r
683          * @return\r
684          * @throws IOException\r
685          * @throws UnknownFileFormatException\r
686          */\r
687         public static HashMap<String, Set<Score>> readGlobPlot(\r
688                         final InputStream input) throws IOException,\r
689                         UnknownFileFormatException {\r
690                 Scanner scan = new Scanner(input);\r
691                 scan.useDelimiter(">");\r
692                 if (!scan.hasNext()) {\r
693                         throw new UnknownFileFormatException(\r
694                                         "In GlobPlot score format each sequence score is expected "\r
695                                                         + "to start from the line: >Sequence name "\r
696                                                         + " No such line was found!");\r
697                 }\r
698 \r
699                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
700                 int seqCounter = 0;\r
701                 while (scan.hasNext()) {\r
702                         seqCounter++;\r
703                         String singleSeq = scan.next();\r
704                         Scanner scansingle = new Scanner(singleSeq);\r
705                         if (!scansingle.hasNextLine()) {\r
706                                 throw new RuntimeException(\r
707                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
708                         }\r
709 \r
710                         StringBuffer seqbuffer = new StringBuffer();\r
711                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
712                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
713                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
714 \r
715                         String sequenceName = scansingle.nextLine().trim();\r
716                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
717                                         scansingle.nextLine());\r
718                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
719                                         scansingle.nextLine());\r
720 \r
721                         String title = scansingle.nextLine();\r
722                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
723 \r
724                         while (scansingle.hasNext()) {\r
725                                 seqbuffer.append(scansingle.next());\r
726                                 dydxScore.add(scansingle.nextFloat());\r
727                                 rawScore.add(scansingle.nextFloat());\r
728                                 smoothedScore.add(scansingle.nextFloat());\r
729                         }\r
730                         /*\r
731                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
732                          * seqbuffer.toString());\r
733                          */\r
734                         Set<Score> scores = new TreeSet<Score>();\r
735                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
736                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
737                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
738                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
739                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
740                         results.put(sequenceName, scores);\r
741 \r
742                         scansingle.close();\r
743                 }\r
744                 scan.close();\r
745                 input.close();\r
746                 return results;\r
747         }\r
748         /**\r
749          * Read AACon result with no alignment files. This method leaves incoming\r
750          * InputStream open!\r
751          * \r
752          * @param results\r
753          *            output file of AAConservation\r
754          * @return Map with keys {@link ConservationMethod} -> float[]\r
755          */\r
756         public static HashSet<Score> readAAConResults(InputStream results) {\r
757                 if (results == null) {\r
758                         throw new NullPointerException(\r
759                                         "InputStream with results must be provided");\r
760                 }\r
761                 HashSet<Score> annotations = new HashSet<Score>();\r
762                 Scanner sc = new Scanner(results);\r
763                 sc.useDelimiter("#");\r
764                 while (sc.hasNext()) {\r
765                         String line = sc.next();\r
766                         int spacePos = line.indexOf(" ");\r
767                         assert spacePos > 0 : "Space is expected as delimited between method "\r
768                                         + "name and values!";\r
769                         String methodLine = line.substring(0, spacePos);\r
770                         ConservationMethod method = ConservationMethod\r
771                                         .getMethod(methodLine);\r
772                         assert method != null : "Method " + methodLine\r
773                                         + " is not recognized! ";\r
774                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
775                         ArrayList<Float> values = new ArrayList<Float>();\r
776                         while (valuesScanner.hasNextDouble()) {\r
777                                 Double value = valuesScanner.nextDouble();\r
778                                 values.add(value.floatValue());\r
779                         }\r
780                         annotations.add(new Score(method, values));\r
781                 }\r
782                 return annotations;\r
783         }\r
784 \r
785         /**\r
786          * Reads and parses Fasta or Clustal formatted file into a list of\r
787          * FastaSequence objects\r
788          * \r
789          * @param inFilePath\r
790          *            the path to the input file\r
791          * @throws IOException\r
792          *             if the file denoted by inFilePath cannot be read\r
793          * @throws UnknownFileFormatException\r
794          *             if the inFilePath points to the file which format cannot be\r
795          *             recognised\r
796          * @return the List of FastaSequence objects\r
797          * \r
798          */\r
799         public static List<FastaSequence> openInputStream(String inFilePath)\r
800                         throws IOException, UnknownFileFormatException {\r
801 \r
802                 // This stream gets closed in isValidClustalFile method\r
803                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
804                 // This stream is closed in the calling methods\r
805                 InputStream inStr = new FileInputStream(inFilePath);\r
806                 List<FastaSequence> fastaSeqs = null;\r
807                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
808                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
809                         // alignment cannot be null see\r
810                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
811                         fastaSeqs = al.getSequences();\r
812                 } else {\r
813                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
814                 }\r
815                 return fastaSeqs;\r
816         }\r
817 \r
818 }\r
819 \r
/**
 * Types of the results produced by Disembl. Every one of them contains both
 * ranges and scores.
 */
enum DisemblResult {
        /** Contains ranges and scores */
        COILS,
        /** Contains ranges and scores */
        REM465,
        /** Contains ranges and scores */
        HOTLOOPS
}
/**
 * Types of the results read from GlobPlot output. Two of them are ranges
 * without scores, the other three are scores without ranges.
 */
enum GlobProtResult {
        /** Ranges only, there are no scores for this type */
        GlobDoms,
        /** Ranges only, there are no scores for this type */
        Disorder,
        /** Scores only, there are no ranges for this type */
        Dydx,
        /** Scores only, there are no ranges for this type */
        SmoothedScore,
        /** Scores only, there are no ranges for this type */
        RawScore
}
836 \r
/**
 * Types of the results produced by IUPred.
 */
enum IUPredResult {
        /** Short disorder */
        Short,
        /** Long disorder */
        Long,
        /** Globular domains */
        Glob;

        /**
         * Determines the type of the IUPred result from the name of the file.
         * The name must end with the lower case name of one of the constants
         * of this enum, the comparison is case sensitive.
         * 
         * @param file
         *            the result file, only its name is examined
         * @return the type which lower case name the file name ends with
         * @throws AssertionError
         *             if the file name does not end with any of the type names
         */
        static IUPredResult getType(File file) {
                assert file != null;
                final String fileName = file.getName();
                // No name can end with more than one of short/long/glob, thus the
                // order in which the types are tried does not matter
                for (IUPredResult candidate : values()) {
                        String suffix = candidate.toString().toLowerCase();
                        if (fileName.endsWith(suffix)) {
                                return candidate;
                        }
                }
                throw new AssertionError(
                                "IUPred result file type cannot be recognised! "
                                                + "\nFile must ends with one of [glob, long or short]"
                                                + "\n but given file name was: " + fileName);
        }
}