Merge branch 'JABAWS_Release_2_5' into develop
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  * Copyright (c) 2013 Alexander Sherstnev\r
3  *  \r
4  *  JAva Bioinformatics Analysis Web Services (JABAWS)\r
5  *  @version: 2.5     \r
6  * \r
7  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
8  *  Apache License version 2 as published by the Apache Software Foundation\r
9  * \r
10  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
11  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
12  *  License for more details.\r
13  * \r
14  *  A copy of the license is in apache_license.txt. It is also available here:\r
15  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
16  * \r
17  * Any republication or derived work distributed in source code form\r
18  * must include this copyright and license notice.\r
19  */\r
20 \r
21 package compbio.data.sequence;\r
22 \r
23 import java.io.BufferedReader;\r
24 import java.io.BufferedWriter;\r
25 import java.io.Closeable;\r
26 import java.io.File;\r
27 import java.io.FileInputStream;\r
28 import java.io.FileNotFoundException;\r
29 import java.io.IOException;\r
30 import java.io.InputStream;\r
31 import java.io.InputStreamReader;\r
32 import java.io.OutputStream;\r
33 import java.io.OutputStreamWriter;\r
34 import java.util.ArrayList;\r
35 import java.util.Arrays;\r
36 import java.util.HashMap;\r
37 import java.util.Collections;\r
38 import java.util.HashSet;\r
39 import java.util.List;\r
40 import java.util.Map;\r
41 import java.util.Scanner;\r
42 import java.util.Set;\r
43 import java.util.TreeMap;\r
44 import java.util.TreeSet;\r
45 import java.util.logging.Level;\r
46 import java.util.regex.Matcher;\r
47 import java.util.regex.Pattern;\r
48 \r
49 import compbio.util.Util;\r
50 \r
51 /**\r
52  * Utility class for operations on sequences\r
53  * \r
54  * @author Peter Troshin\r
55  * @since 1.0\r
56  * @version 2.0 June 2011\r
57  */\r
58 public final class SequenceUtil {\r
59 \r
60         /**\r
61          * A single whitespace character (regex \s): [ \t\n\x0B\f\r]\r
62          */\r
63         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
64 \r
65         /**\r
66          * A single decimal digit (regex \d)\r
67          */\r
68         public static final Pattern DIGIT = Pattern.compile("\\d");\r
69 \r
70         /**\r
71          * A single non-word character (regex \W)\r
72          */\r
73         public static final Pattern NONWORD = Pattern.compile("\\W");\r
74 \r
75         /**\r
76          * One or more one-letter codes of the 20 standard amino acids, in any case\r
77          */\r
78         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
79                         Pattern.CASE_INSENSITIVE);\r
80 \r
81         /**\r
82          * Inversion of the AA pattern: one or more characters that are not standard amino acid codes\r
83          */\r
84         public static final Pattern NON_AA = Pattern.compile(\r
85                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
86 \r
87         /**\r
88          * Same as the AA pattern but with two additional letters - X and U\r
89          */\r
90         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
91                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
92 \r
93         /**\r
94          * One or more of the nucleotide letters a, t, g, c, u, in any case\r
95          */\r
96         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
97                         Pattern.CASE_INSENSITIVE);\r
98 \r
99         /**\r
100          * One or more nucleotide letters including the ambiguity codes, in any case\r
101          */\r
102         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
103                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
104         /**\r
105          * Inversion of the NUCLEOTIDE pattern: one or more characters other than a, g, t, c, u\r
106          */\r
107         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
108                         Pattern.CASE_INSENSITIVE);\r
109 \r
110         private SequenceUtil() {\r
111         } // utility class, no instantiation\r
112 \r
113         /**\r
114          * @return true is the sequence contains only letters a,c, t, g, u\r
115          */\r
116         public static boolean isNucleotideSequence(final FastaSequence s) {\r
117                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
118         }\r
119 \r
120         /**\r
121          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
122          * (!) - B char\r
123          */\r
124         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
125                 sequence = SequenceUtil.cleanSequence(sequence);\r
126                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
127                         return false;\r
128                 }\r
129                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
130                         return false;\r
131                         /*\r
132                          * System.out.format("I found the text starting at " +\r
133                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
134                          * nonDNAmatcher.end());\r
135                          */\r
136                 }\r
137                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
138                 return DNAmatcher.find();\r
139         }\r
140 \r
141         /**\r
142          * Removes all whitespace chars in the sequence string\r
143          * \r
144          * @param sequence\r
145          * @return cleaned up sequence\r
146          */\r
147         public static String cleanSequence(String sequence) {\r
148                 assert sequence != null;\r
149                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
150                 sequence = m.replaceAll("").toUpperCase();\r
151                 return sequence;\r
152         }\r
153 \r
154         /**\r
155          * Removes all special characters and digits as well as whitespace chars\r
156          * from the sequence\r
157          * \r
158          * @param sequence\r
159          * @return cleaned up sequence\r
160          */\r
161         public static String deepCleanSequence(String sequence) {\r
162                 sequence = SequenceUtil.cleanSequence(sequence);\r
163                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
164                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
165                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
166                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
167                 return sequence;\r
168         }\r
169 \r
170         /**\r
171          * Remove all non AA chars from the sequence\r
172          * \r
173          * @param sequence\r
174          *            the sequence to clean\r
175          * @return cleaned sequence\r
176          */\r
177         public static String cleanProteinSequence(String sequence) {\r
178                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
179         }\r
180 \r
181         /**\r
182          * @param sequence\r
183          * @return true is the sequence is a protein sequence, false overwise\r
184          */\r
185         public static boolean isProteinSequence(String sequence) {\r
186                 sequence = SequenceUtil.cleanSequence(sequence);\r
187                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
188                         return false;\r
189                 }\r
190                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
191                         return false;\r
192                 }\r
193                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
194                         return false;\r
195                 }\r
196                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
197                 return protmatcher.find();\r
198         }\r
199 \r
200         /**\r
201          * Check whether the sequence confirms to amboguous protein sequence\r
202          * \r
203          * @param sequence\r
204          * @return return true only if the sequence if ambiguous protein sequence\r
205          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
206          *         protein or DNA\r
207          */\r
208         public static boolean isAmbiguosProtein(String sequence) {\r
209                 sequence = SequenceUtil.cleanSequence(sequence);\r
210                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
211                         return false;\r
212                 }\r
213                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
214                         return false;\r
215                 }\r
216                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
217                         return false;\r
218                 }\r
219                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
220                         return false;\r
221                 }\r
222                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
223                 return amb_prot.find();\r
224         }\r
225 \r
226         /**\r
227          * Writes list of FastaSequeces into the outstream formatting the sequence\r
228          * so that it contains width chars on each line\r
229          * \r
230          * @param outstream\r
231          * @param sequences\r
232          * @param width\r
233          *            - the maximum number of characters to write in one line\r
234          * @throws IOException\r
235          */\r
236         public static void writeFasta(final OutputStream outstream,\r
237                         final List<FastaSequence> sequences, final int width)\r
238                         throws IOException {\r
239                 writeFastaKeepTheStream(outstream, sequences, width);\r
240                 outstream.close();\r
241         }\r
242 \r
243         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
244                         final List<FastaSequence> sequences, final int width)\r
245                         throws IOException {\r
246                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
247                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
248                 for (final FastaSequence fs : sequences) {\r
249                         fastawriter.write(">" + fs.getId() + "\n");\r
250                         fastawriter.write(fs.getFormatedSequence(width));\r
251                         fastawriter.write("\n");\r
252                 }\r
253                 fastawriter.flush();\r
254                 writer.flush();\r
255         }\r
256 \r
257         /**\r
258          * Reads fasta sequences from inStream into the list of FastaSequence\r
259          * objects\r
260          * \r
261          * @param inStream\r
262          *            from\r
263          * @return list of FastaSequence objects\r
264          * @throws IOException\r
265          */\r
266         public static List<FastaSequence> readFasta(final InputStream inStream)\r
267                         throws IOException {\r
268                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
269                 FastaReader reader = new FastaReader(inStream);\r
270                 while (reader.hasNext()) {\r
271                         seqs.add(reader.next());\r
272                 }\r
273                 inStream.close();\r
274                 return seqs;\r
275         }\r
276 \r
277         /**\r
278          * Writes FastaSequence in the file, each sequence will take one line only\r
279          * \r
280          * @param os\r
281          * @param sequences\r
282          * @throws IOException\r
283          */\r
284         public static void writeFasta(final OutputStream os,\r
285                         final List<FastaSequence> sequences) throws IOException {\r
286                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
287                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
288                 for (final FastaSequence fs : sequences) {\r
289                         fasta_out.write(fs.getOnelineFasta());\r
290                 }\r
291                 fasta_out.close();\r
292                 outWriter.close();\r
293         }\r
294 \r
295         \r
296         public static final List<FastaSequence> readJpredFile(InputStream result)\r
297                         throws IOException, FileNotFoundException,NullPointerException {\r
298                 return readFasta (result);\r
299         }\r
300         \r
301         /**\r
302          * Read IUPred output\r
303          * \r
304          * @param result\r
305          * @return Map key->sequence name, value->Score\r
306          * @throws IOException\r
307          * @throws UnknownFileFormatException\r
308          */\r
309         public static Map<String, Score> readIUPred(final File result)\r
310                         throws IOException, UnknownFileFormatException {\r
311                 InputStream input = new FileInputStream(result);\r
312                 Map<String, Score> sequences = readIUPred(input,\r
313                                 IUPredResult.getType(result));\r
314                 input.close(); \r
315                 return sequences;\r
316         }\r
317 \r
318         // Check the type of the file e.g. long| short or domain\r
319         // and read\r
320         /**\r
321          * ## Long Disorder\r
322          * \r
323          * # P53_HUMAN\r
324          * \r
325          * 1 M 0.9943\r
326          * \r
327          * 2 E 0.9917\r
328          * \r
329          * 3 E 0.9879\r
330          * \r
331          * (every line)\r
332          * \r
333          * @throws IOException\r
334          * @throws UnknownFileFormatException\r
335          * \r
336          * \r
337          */\r
338         private static Map<String, Score> readIUPred(InputStream input,\r
339                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
340 \r
341                 Score score = null;\r
342                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
343                 Scanner scan = new Scanner(input);\r
344                 scan.useDelimiter("#");\r
345                 while (scan.hasNext()) {\r
346                         String nextEntry = scan.next();\r
347                         Scanner entry = new Scanner(nextEntry);\r
348                         String name = entry.nextLine().trim();\r
349                         // inside entry:\r
350                         if (IUPredResult.Glob == type) {\r
351                                 // parse domains\r
352                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
353                                 score = new Score(type, ranges);\r
354                         } else {\r
355                                 // parse short | long\r
356                                 float[] scores = parseIUPredScores(entry);\r
357                                 score = new Score(type, scores);\r
358                         }\r
359                         entry.close();\r
360                         seqs.put(name, score);\r
361                 }\r
362 \r
363                 scan.close();\r
364                 return seqs;\r
365         }\r
366 \r
367         /**\r
368          * # P53_HUMA\r
369          * \r
370          * Number of globular domains: 2\r
371          * \r
372          * globular domain 1. 98 - 269\r
373          * \r
374          * globular domain 2. 431 - 482\r
375          * \r
376          * >P53_HUMA\r
377          * \r
378          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
379          * \r
380          * @param scan\r
381          */\r
382         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
383                 String header = "Number of globular domains:";\r
384                 String domainPref = "globular domain";\r
385                 TreeSet<Range> ranges = new TreeSet<Range>();\r
386                 String line = scan.nextLine().trim();\r
387                 assert line.startsWith(header);\r
388                 line = line.substring(header.length()).trim();\r
389                 int domainNum = Integer.parseInt(line);\r
390                 if (domainNum == 0) {\r
391                         return ranges;\r
392                 }\r
393 \r
394                 for (int i = 0; i < domainNum; i++) {\r
395                         assert scan.hasNextLine();\r
396                         line = scan.nextLine();\r
397                         assert line.trim().startsWith(domainPref);\r
398                         line = line.substring(line.indexOf(".") + 1).trim();\r
399                         Range r = new Range(line.split("-"));\r
400                         ranges.add(r);\r
401                 }\r
402 \r
403                 return ranges;\r
404         }\r
405         /*\r
406          * 1 M 0.9943\r
407          * \r
408          * 2 E 0.9917\r
409          */\r
410         private static float[] parseIUPredScores(Scanner scan)\r
411                         throws UnknownFileFormatException {\r
412                 List<String> annotation = new ArrayList<String>();\r
413                 while (scan.hasNextLine()) {\r
414                         String line = scan.nextLine().trim();\r
415                         String[] val = line.split("\\s+");\r
416                         annotation.add(val[2]);\r
417                 }\r
418                 return convertToNumber(annotation\r
419                                 .toArray(new String[annotation.size()]));\r
420         }\r
421 \r
422         public static Map<String, Score> readJRonn(final File result)\r
423                         throws IOException, UnknownFileFormatException {\r
424                 InputStream input = new FileInputStream(result);\r
425                 Map<String, Score> sequences = readJRonn(input);\r
426                 input.close();\r
427                 return sequences;\r
428         }\r
429 \r
430         /**\r
431          * Reader for JRonn horizontal file format\r
432          * \r
433          * <pre>\r
434          * &gtFoobar M G D T T A G 0.48 0.42\r
435          * 0.42 0.48 0.52 0.53 0.54\r
436          * \r
437          * <pre>\r
438          * Where all values are tab delimited\r
439          * \r
440          * @param inStream\r
441          *            the InputStream connected to the JRonn output file\r
442          * @return Map key=sequence name value=Score\r
443          * @throws IOException\r
444          *             is thrown if the inStream has problems accessing the data\r
445          * @throws UnknownFileFormatException\r
446          *             is thrown if the inStream represents an unknown source of\r
447          * data, i.e. not a JRonn output\r
448          */\r
449         public static Map<String, Score> readJRonn(final InputStream inStream)\r
450                         throws IOException, UnknownFileFormatException {\r
451                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
452 \r
453                 final BufferedReader infasta = new BufferedReader(\r
454                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
455 \r
456                 String line;\r
457                 String sname = "";\r
458                 do {\r
459                         line = infasta.readLine();\r
460                         if (line == null || line.isEmpty()) {\r
461                                 // skip empty lines\r
462                                 continue;\r
463                         }\r
464                         if (line.startsWith(">")) {\r
465                                 // read name\r
466                                 sname = line.trim().substring(1);\r
467                                 // read sequence line\r
468                                 line = infasta.readLine();\r
469                                 final String sequence = line.replace("\t", "");\r
470                                 // read annotation line\r
471                                 line = infasta.readLine();\r
472                                 String[] annotValues = line.split("\t");\r
473                                 float[] annotation = convertToNumber(annotValues);\r
474                                 if (annotation.length != sequence.length()) {\r
475                                         throw new UnknownFileFormatException(\r
476                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
477                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
478                                 }\r
479                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
480                         }\r
481                 } while (line != null);\r
482 \r
483                 infasta.close();\r
484                 return seqs;\r
485         }\r
486 \r
487         private static float[] convertToNumber(String[] annotValues)\r
488                         throws UnknownFileFormatException {\r
489                 float[] annotation = new float[annotValues.length];\r
490                 try {\r
491                         for (int i = 0; i < annotation.length; i++) {\r
492                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
493                         }\r
494                 } catch (NumberFormatException e) {\r
495                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
496                                         e.getCause());\r
497                 }\r
498                 return annotation;\r
499         }\r
500 \r
        /**
         * Description of the expected JRonn file layout. It is used as (part of)
         * the message of the UnknownFileFormatException thrown by readJRonn and
         * convertToNumber.
         */
501         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
502                         + ">sequence_name\n "\r
503                         + "M    V       S\n"\r
504                         + "0.43 0.22    0.65\n"\r
505                         + "Where first line is the sequence name,\n"\r
506                         + "second line is the tab delimited sequence,\n"\r
507                         + "third line contains tab delimited disorder prediction values.\n"\r
508                         + "No lines are allowed between these three. Additionally, the number of  "\r
509                         + "sequence residues must be equal to the number of the disorder values.";\r
510 \r
511         /**\r
512          * Closes the Closable and logs the exception if any\r
513          * \r
514          * @param log\r
515          * @param stream\r
516          */\r
517         public final static void closeSilently(java.util.logging.Logger log,\r
518                         Closeable stream) {\r
519                 if (stream != null) {\r
520                         try {\r
521                                 stream.close();\r
522                         } catch (IOException e) {\r
523                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
524                         }\r
525                 }\r
526         }\r
527 \r
528         /**\r
529          * \r
530          > Foobar_dundeefriends\r
531          * \r
532          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
533          * \r
534          * # REM465 355-368\r
535          * \r
536          * # HOTLOOPS 190-204\r
537          * \r
538          * # RESIDUE COILS REM465 HOTLOOPS\r
539          * \r
540          * M 0.86010 0.88512 0.37094\r
541          * \r
542          * T 0.79983 0.85864 0.44331\r
543          * \r
544          * >Next Sequence name\r
545          * \r
546          * \r
547          * @param input\r
548          *            the InputStream\r
549          * @return Map key=sequence name, value=set of score\r
550          * @throws IOException\r
551          * @throws UnknownFileFormatException\r
552          */\r
553         public static HashMap<String, Set<Score>> readDisembl(\r
554                         final InputStream input) throws IOException,\r
555                         UnknownFileFormatException {\r
556                 Scanner scan = new Scanner(input);\r
557                 scan.useDelimiter(">");\r
558                 if (!scan.hasNext()) {\r
559                         throw new UnknownFileFormatException(\r
560                                         "In Disembl score format each sequence score is expected "\r
561                                                         + "to start from the line: >Sequence name "\r
562                                                         + " No such line was found!");\r
563                 }\r
564 \r
565                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
566                 int seqCounter = 0;\r
567                 while (scan.hasNext()) {\r
568                         seqCounter++;\r
569                         String singleSeq = scan.next();\r
570                         Scanner scansingle = new Scanner(singleSeq);\r
571                         if (!scansingle.hasNextLine()) {\r
572                                 throw new RuntimeException(\r
573                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
574                         }\r
575 \r
576                         StringBuffer seqbuffer = new StringBuffer();\r
577                         ArrayList<Float> coils = new ArrayList<Float>();\r
578                         ArrayList<Float> rem = new ArrayList<Float>();\r
579                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
580 \r
581                         String sequenceName = scansingle.nextLine().trim();\r
582                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
583                                         scansingle.nextLine());\r
584                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
585                                         scansingle.nextLine());\r
586                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
587                                         scansingle.nextLine());\r
588 \r
589                         String title = scansingle.nextLine();\r
590                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
591 \r
592                         while (scansingle.hasNext()) {\r
593                                 seqbuffer.append(scansingle.next());\r
594                                 coils.add(scansingle.nextFloat());\r
595                                 rem.add(scansingle.nextFloat());\r
596                                 hotloops.add(scansingle.nextFloat());\r
597                         }\r
598                         /*\r
599                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
600                          * seqbuffer.toString());\r
601                          */\r
602                         HashSet<Score> scores = new HashSet<Score>();\r
603                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
604                         scores.add(new Score(DisemblResult.REM465, rem, rem465R));\r
605                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));\r
606                         results.put(sequenceName, scores);\r
607 \r
608                         scansingle.close();\r
609                 }\r
610                 scan.close();\r
611                 input.close();\r
612                 return results;\r
613         }\r
614 \r
615         /**\r
616          * Parsing:\r
617          * \r
618          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
619          * 350-391, 429-485, 497-506, 539-547\r
620          * \r
621          * # REM465 355-368\r
622          * \r
623          * # HOTLOOPS 190-204\r
624          * \r
625          * @param lines\r
626          * @return\r
627          */\r
628         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
629                 TreeSet<Range> ranges = new TreeSet<Range>();\r
630 \r
631                 Scanner scan = new Scanner(lines);\r
632 \r
633                 assert scan.hasNext();\r
634                 String del = scan.next();\r
635                 assert "#".equals(del); // pass delimiter #\r
636                 String type = scan.next(); // pass enum name e.g. COILS\r
637                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
638                                 + resultType.toString();\r
639 \r
640                 // beginning of the ranges\r
641                 scan.useDelimiter(",");\r
642                 while (scan.hasNext()) {\r
643                         String range = scan.next();\r
644                         if (!Util.isEmpty(range)) {\r
645                                 ranges.add(new Range(range.split("-")));\r
646                         }\r
647                 }\r
648                 return ranges;\r
649         }\r
650 \r
651         /**\r
652          * \r
653          > Foobar_dundeefriends\r
654          * \r
655          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
656          * \r
657          * # REM465 355-368\r
658          * \r
659          * # HOTLOOPS 190-204\r
660          * \r
661          * # RESIDUE COILS REM465 HOTLOOPS\r
662          * \r
663          * M 0.86010 0.88512 0.37094\r
664          * \r
665          * T 0.79983 0.85864 0.44331\r
666          * \r
667          * >Next Sequence name\r
668          * \r
669          * \r
670          * @param input\r
671          * @return Map key=sequence name, value=set of score\r
672          * @throws IOException\r
673          * @throws UnknownFileFormatException\r
674          */\r
675         public static HashMap<String, Set<Score>> readGlobPlot(\r
676                         final InputStream input) throws IOException,\r
677                         UnknownFileFormatException {\r
678                 Scanner scan = new Scanner(input);\r
679                 scan.useDelimiter(">");\r
680                 if (!scan.hasNext()) {\r
681                         throw new UnknownFileFormatException(\r
682                                         "In GlobPlot score format each sequence score is expected "\r
683                                                         + "to start from the line: >Sequence name "\r
684                                                         + " No such line was found!");\r
685                 }\r
686 \r
687                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
688                 int seqCounter = 0;\r
689                 while (scan.hasNext()) {\r
690                         seqCounter++;\r
691                         String singleSeq = scan.next();\r
692                         Scanner scansingle = new Scanner(singleSeq);\r
693                         if (!scansingle.hasNextLine()) {\r
694                                 throw new RuntimeException(\r
695                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
696                         }\r
697 \r
698                         StringBuffer seqbuffer = new StringBuffer();\r
699                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
700                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
701                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
702 \r
703                         String sequenceName = scansingle.nextLine().trim();\r
704                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
705                                         scansingle.nextLine());\r
706                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
707                                         scansingle.nextLine());\r
708 \r
709                         String title = scansingle.nextLine();\r
710                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
711 \r
712                         while (scansingle.hasNext()) {\r
713                                 seqbuffer.append(scansingle.next());\r
714                                 dydxScore.add(scansingle.nextFloat());\r
715                                 rawScore.add(scansingle.nextFloat());\r
716                                 smoothedScore.add(scansingle.nextFloat());\r
717                         }\r
718                         /*\r
719                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
720                          * seqbuffer.toString());\r
721                          */\r
722                         Set<Score> scores = new TreeSet<Score>();\r
723                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
724                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
725                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
726                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
727                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
728                         results.put(sequenceName, scores);\r
729 \r
730                         scansingle.close();\r
731                 }\r
732                 scan.close();\r
733                 input.close();\r
734                 return results;\r
735         }\r
736         /**\r
737          * Read AACon result with no alignment files. This method leaves incoming\r
738          * InputStream open!\r
739          * \r
740          * @param results\r
741          *            output file of AAConservation\r
742          * @return Map with keys {@link ConservationMethod} -> float[]\r
743          */\r
744         public static HashSet<Score> readAAConResults(InputStream results) {\r
745                 if (results == null) {\r
746                         throw new NullPointerException(\r
747                                         "InputStream with results must be provided");\r
748                 }\r
749                 HashSet<Score> annotations = new HashSet<Score>();\r
750                 Scanner sc = new Scanner(results);\r
751                 sc.useDelimiter("#");\r
752                 while (sc.hasNext()) {\r
753                         String line = sc.next();\r
754                         int spacePos = line.indexOf(" ");\r
755                         assert spacePos > 0 : "Space is expected as delimited between method "\r
756                                         + "name and values!";\r
757                         String methodLine = line.substring(0, spacePos);\r
758                         ConservationMethod method = ConservationMethod\r
759                                         .getMethod(methodLine);\r
760                         assert method != null : "Method " + methodLine\r
761                                         + " is not recognized! ";\r
762                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
763                         ArrayList<Float> values = new ArrayList<Float>();\r
764                         while (valuesScanner.hasNextDouble()) {\r
765                                 Double value = valuesScanner.nextDouble();\r
766                                 values.add(value.floatValue());\r
767                         }\r
768                         annotations.add(new Score(method, values));\r
769                 }\r
770                 return annotations;\r
771         }\r
772 \r
773         \r
774         \r
775 \r
776         /**\r
777          * Reads and parses Fasta or Clustal formatted file into a list of\r
778          * FastaSequence objects\r
779          * \r
780          * @param inFilePath\r
781          *            the path to the input file\r
782          * @throws IOException\r
783          *             if the file denoted by inFilePath cannot be read\r
784          * @throws UnknownFileFormatException\r
785          *             if the inFilePath points to the file which format cannot be\r
786          *             recognised\r
787          * @return the List of FastaSequence objects\r
788          * \r
789          */\r
790         public static List<FastaSequence> openInputStream(String inFilePath)\r
791                         throws IOException, UnknownFileFormatException {\r
792 \r
793                 // This stream gets closed in isValidClustalFile method\r
794                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
795                 // This stream is closed in the calling methods\r
796                 InputStream inStr = new FileInputStream(inFilePath);\r
797                 List<FastaSequence> fastaSeqs = null;\r
798                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
799                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
800                         // alignment cannot be null see\r
801                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
802                         fastaSeqs = al.getSequences();\r
803                 } else {\r
804                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
805                 }\r
806                 return fastaSeqs;\r
807         }\r
808 \r
809         // This can't possibly be right for all cases!\r
810         // but it will do for now\r
811         \r
812         // As for the metadata. This function doesnt know what program\r
813         // generated it. How to handle the metadata!?\r
814         \r
815         public static void writeClustal(OutputStream outStream,\r
816                         List<FastaSequence> sequences, char gapChar) \r
817                         throws IOException {\r
818                 \r
819                 BufferedWriter writer = new BufferedWriter(\r
820                                 new OutputStreamWriter(outStream));\r
821                 // will give AlignmentMetadata default type of CLUSTAL for now\r
822                 AlignmentMetadata al = new AlignmentMetadata(Program.CLUSTAL, gapChar);\r
823                 \r
824                 ClustalAlignmentUtil.writeClustalAlignment(writer, \r
825                                 new Alignment(sequences, al));\r
826                 \r
827         }\r
828 \r
829 }\r
830 \r
/**
 * Result types for Disembl. Each of them contains ranges and scores.
 */
enum DisemblResult {
    COILS,
    REM465,
    HOTLOOPS
}
/**
 * Result types for GlobPlot. Two of them are ranges without scores, the
 * other three are scores without ranges.
 */
enum GlobProtResult {

    /** A range; carries no scores. */
    GlobDoms,

    /** A range; carries no scores. */
    Disorder,

    /** A score; carries no range. */
    Dydx,

    /** A score; carries no range. */
    SmoothedScore,

    /** A score; carries no range. */
    RawScore;
}
847 \r
/**
 * Result types for IUPred. The type of a result file is recognised by the
 * ending of its name, see {@link #getType(File)}.
 */
enum IUPredResult {
    /** Short disorder */
    Short,
    /** Long disorder */
    Long,
    /** Globular domains */
    Glob;

    /**
     * Works out the result type from the ending of the file name. The name
     * must end with the lower-cased name of one of the constants, i.e.
     * "long", "short" or "glob".
     * 
     * @param file
     *            the IUPred result file, must not be null
     * @return the type matching the end of the file name
     * @throws AssertionError
     *             if the file name ends with none of the known suffixes
     */
    static IUPredResult getType(File file) {
        assert file != null;
        final String fileName = file.getName();
        // Inside this enum Long and Short are the constants above, not the
        // java.lang wrappers. Suffixes are tried in the order long, short, glob.
        final IUPredResult[] candidates = { Long, Short, Glob };
        for (IUPredResult candidate : candidates) {
            final String suffix = candidate.toString().toLowerCase();
            if (fileName.endsWith(suffix)) {
                return candidate;
            }
        }
        throw new AssertionError(
                "IUPred result file type cannot be recognised! "
                        + "\nFile must ends with one of [glob, long or short]"
                        + "\n but given file name was: " + file.getName());
    }
}