Alifold results are now parsed and stored in a ScoreManager object
[jabaws.git] / datamodel / compbio / data / sequence / SequenceUtil.java
1 /* Copyright (c) 2011 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0     \r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 \r
19 package compbio.data.sequence;\r
20 \r
21 import java.io.BufferedReader;\r
22 import java.io.BufferedWriter;\r
23 import java.io.Closeable;\r
24 import java.io.File;\r
25 import java.io.FileInputStream;\r
26 import java.io.FileNotFoundException;\r
27 import java.io.IOException;\r
28 import java.io.InputStream;\r
29 import java.io.InputStreamReader;\r
30 import java.io.OutputStream;\r
31 import java.io.OutputStreamWriter;\r
32 import java.util.ArrayList;\r
33 import java.util.Arrays;\r
34 import java.util.HashMap;\r
35 import java.util.HashSet;\r
36 import java.util.List;\r
37 import java.util.Map;\r
38 import java.util.Scanner;\r
39 import java.util.Set;\r
40 import java.util.TreeMap;\r
41 import java.util.TreeSet;\r
42 import java.util.logging.Level;\r
43 import java.util.regex.Matcher;\r
44 import java.util.regex.Pattern;\r
45 \r
46 import compbio.util.Util;\r
47 \r
48 /**\r
49  * Utility class for operations on sequences\r
50  * \r
51  * @author Peter Troshin\r
52  * @since 1.0\r
53  * @version 2.0 June 2011\r
54  */\r
55 public final class SequenceUtil {\r
56 \r
        /**
         * A single whitespace character, i.e. the regex class {@code \s}: space,
         * tab, line feed, vertical tab, form feed or carriage return
         */
        public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

        /**
         * A single decimal digit
         */
        public static final Pattern DIGIT = Pattern.compile("\\d");

        /**
         * A single non word character, i.e. anything but a letter, a digit or an
         * underscore
         */
        public static final Pattern NONWORD = Pattern.compile("\\W");

        /**
         * One or more valid amino acid one-letter codes (the 20 letters listed in
         * the pattern), matched case insensitively
         */
        public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Inversion of the AA pattern: one or more characters which are not valid
         * amino acid codes
         */
        public static final Pattern NON_AA = Pattern.compile(
                        "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);

        /**
         * Same as the AA pattern but with two additional letters - X and U
         */
        public static final Pattern AMBIGUOUS_AA = Pattern.compile(
                        "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);

        /**
         * One or more nucleotides a, t, g, c, u, matched case insensitively
         */
        public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * One or more ambiguous nucleotide codes, matched case insensitively
         */
        public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
                        "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
        /**
         * Inversion of the NUCLEOTIDE pattern: one or more characters which are
         * not a, t, g, c or u
         */
        public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Private constructor, the class only contains static members
         */
        private SequenceUtil() {
        } // utility class, no instantiation
109 \r
110         /**\r
111          * @return true is the sequence contains only letters a,c, t, g, u\r
112          */\r
113         public static boolean isNucleotideSequence(final FastaSequence s) {\r
114                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
115         }\r
116 \r
117         /**\r
118          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
119          * (!) - B char\r
120          */\r
121         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
122                 sequence = SequenceUtil.cleanSequence(sequence);\r
123                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
124                         return false;\r
125                 }\r
126                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
127                         return false;\r
128                         /*\r
129                          * System.out.format("I found the text starting at " +\r
130                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
131                          * nonDNAmatcher.end());\r
132                          */\r
133                 }\r
134                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
135                 return DNAmatcher.find();\r
136         }\r
137 \r
138         /**\r
139          * Removes all whitespace chars in the sequence string\r
140          * \r
141          * @param sequence\r
142          * @return cleaned up sequence\r
143          */\r
144         public static String cleanSequence(String sequence) {\r
145                 assert sequence != null;\r
146                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
147                 sequence = m.replaceAll("").toUpperCase();\r
148                 return sequence;\r
149         }\r
150 \r
151         /**\r
152          * Removes all special characters and digits as well as whitespace chars\r
153          * from the sequence\r
154          * \r
155          * @param sequence\r
156          * @return cleaned up sequence\r
157          */\r
158         public static String deepCleanSequence(String sequence) {\r
159                 sequence = SequenceUtil.cleanSequence(sequence);\r
160                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
161                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
162                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
163                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
164                 return sequence;\r
165         }\r
166 \r
167         /**\r
168          * Remove all non AA chars from the sequence\r
169          * \r
170          * @param sequence\r
171          *            the sequence to clean\r
172          * @return cleaned sequence\r
173          */\r
174         public static String cleanProteinSequence(String sequence) {\r
175                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
176         }\r
177 \r
178         /**\r
179          * @param sequence\r
180          * @return true is the sequence is a protein sequence, false overwise\r
181          */\r
182         public static boolean isProteinSequence(String sequence) {\r
183                 sequence = SequenceUtil.cleanSequence(sequence);\r
184                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
185                         return false;\r
186                 }\r
187                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
188                         return false;\r
189                 }\r
190                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
191                         return false;\r
192                 }\r
193                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
194                 return protmatcher.find();\r
195         }\r
196 \r
197         /**\r
198          * Check whether the sequence confirms to amboguous protein sequence\r
199          * \r
200          * @param sequence\r
201          * @return return true only if the sequence if ambiguous protein sequence\r
202          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
203          *         protein or DNA\r
204          */\r
205         public static boolean isAmbiguosProtein(String sequence) {\r
206                 sequence = SequenceUtil.cleanSequence(sequence);\r
207                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
208                         return false;\r
209                 }\r
210                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
211                         return false;\r
212                 }\r
213                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
214                         return false;\r
215                 }\r
216                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
217                         return false;\r
218                 }\r
219                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
220                 return amb_prot.find();\r
221         }\r
222 \r
223         /**\r
224          * Writes list of FastaSequeces into the outstream formatting the sequence\r
225          * so that it contains width chars on each line\r
226          * \r
227          * @param outstream\r
228          * @param sequences\r
229          * @param width\r
230          *            - the maximum number of characters to write in one line\r
231          * @throws IOException\r
232          */\r
233         public static void writeFasta(final OutputStream outstream,\r
234                         final List<FastaSequence> sequences, final int width)\r
235                         throws IOException {\r
236                 writeFastaKeepTheStream(outstream, sequences, width);\r
237                 outstream.close();\r
238         }\r
239 \r
240         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
241                         final List<FastaSequence> sequences, final int width)\r
242                         throws IOException {\r
243                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
244                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
245                 for (final FastaSequence fs : sequences) {\r
246                         fastawriter.write(">" + fs.getId() + "\n");\r
247                         fastawriter.write(fs.getFormatedSequence(width));\r
248                         fastawriter.write("\n");\r
249                 }\r
250                 fastawriter.flush();\r
251                 writer.flush();\r
252         }\r
253 \r
254         /**\r
255          * Reads fasta sequences from inStream into the list of FastaSequence\r
256          * objects\r
257          * \r
258          * @param inStream\r
259          *            from\r
260          * @return list of FastaSequence objects\r
261          * @throws IOException\r
262          */\r
263         public static List<FastaSequence> readFasta(final InputStream inStream)\r
264                         throws IOException {\r
265                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
266                 FastaReader reader = new FastaReader(inStream);\r
267                 while (reader.hasNext()) {\r
268                         seqs.add(reader.next());\r
269                 }\r
270                 inStream.close();\r
271                 return seqs;\r
272         }\r
273 \r
274         /**\r
275          * Writes FastaSequence in the file, each sequence will take one line only\r
276          * \r
277          * @param os\r
278          * @param sequences\r
279          * @throws IOException\r
280          */\r
281         public static void writeFasta(final OutputStream os,\r
282                         final List<FastaSequence> sequences) throws IOException {\r
283                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
284                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
285                 for (final FastaSequence fs : sequences) {\r
286                         fasta_out.write(fs.getOnelineFasta());\r
287                 }\r
288                 fasta_out.close();\r
289                 outWriter.close();\r
290         }\r
291 \r
292         /**\r
293          * Read IUPred output\r
294          * \r
295          * @param result\r
296          * @return Map key->sequence name, value->Score\r
297          * @throws IOException\r
298          * @throws UnknownFileFormatException\r
299          */\r
300         public static Map<String, Score> readIUPred(final File result)\r
301                         throws IOException, UnknownFileFormatException {\r
302                 InputStream input = new FileInputStream(result);\r
303                 Map<String, Score> sequences = readIUPred(input,\r
304                                 IUPredResult.getType(result));\r
305                 input.close();\r
306                 return sequences;\r
307         }\r
308 \r
309         // Check the type of the file e.g. long| short or domain\r
310         // and read\r
311         /**\r
312          * ## Long Disorder\r
313          * \r
314          * # P53_HUMAN\r
315          * \r
316          * 1 M 0.9943\r
317          * \r
318          * 2 E 0.9917\r
319          * \r
320          * 3 E 0.9879\r
321          * \r
322          * (every line)\r
323          * \r
324          * @throws IOException\r
325          * @throws UnknownFileFormatException\r
326          * \r
327          * \r
328          */\r
329         private static Map<String, Score> readIUPred(InputStream input,\r
330                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
331 \r
332                 Score score = null;\r
333                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
334                 Scanner scan = new Scanner(input);\r
335                 scan.useDelimiter("#");\r
336                 while (scan.hasNext()) {\r
337                         String nextEntry = scan.next();\r
338                         Scanner entry = new Scanner(nextEntry);\r
339                         String name = entry.nextLine().trim();\r
340                         // inside entry:\r
341                         if (IUPredResult.Glob == type) {\r
342                                 // parse domains\r
343                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
344                                 score = new Score(type, ranges);\r
345                         } else {\r
346                                 // parse short | long\r
347                                 float[] scores = parseIUPredScores(entry);\r
348                                 score = new Score(type, scores);\r
349                         }\r
350                         entry.close();\r
351                         seqs.put(name, score);\r
352                 }\r
353 \r
354                 scan.close();\r
355                 return seqs;\r
356         }\r
357 \r
358         /**\r
359          * # P53_HUMA\r
360          * \r
361          * Number of globular domains: 2\r
362          * \r
363          * globular domain 1. 98 - 269\r
364          * \r
365          * globular domain 2. 431 - 482\r
366          * \r
367          * >P53_HUMA\r
368          * \r
369          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
370          * \r
371          * @param scan\r
372          */\r
373         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
374                 String header = "Number of globular domains:";\r
375                 String domainPref = "globular domain";\r
376                 TreeSet<Range> ranges = new TreeSet<Range>();\r
377                 String line = scan.nextLine().trim();\r
378                 assert line.startsWith(header);\r
379                 line = line.substring(header.length()).trim();\r
380                 int domainNum = Integer.parseInt(line);\r
381                 if (domainNum == 0) {\r
382                         return ranges;\r
383                 }\r
384 \r
385                 for (int i = 0; i < domainNum; i++) {\r
386                         assert scan.hasNextLine();\r
387                         line = scan.nextLine();\r
388                         assert line.trim().startsWith(domainPref);\r
389                         line = line.substring(line.indexOf(".") + 1).trim();\r
390                         Range r = new Range(line.split("-"));\r
391                         ranges.add(r);\r
392                 }\r
393 \r
394                 return ranges;\r
395         }\r
396         /*\r
397          * 1 M 0.9943\r
398          * \r
399          * 2 E 0.9917\r
400          */\r
401         private static float[] parseIUPredScores(Scanner scan)\r
402                         throws UnknownFileFormatException {\r
403                 List<String> annotation = new ArrayList<String>();\r
404                 while (scan.hasNextLine()) {\r
405                         String line = scan.nextLine().trim();\r
406                         String[] val = line.split("\\s+");\r
407                         annotation.add(val[2]);\r
408                 }\r
409                 return convertToNumber(annotation\r
410                                 .toArray(new String[annotation.size()]));\r
411         }\r
412 \r
413         public static Map<String, Score> readJRonn(final File result)\r
414                         throws IOException, UnknownFileFormatException {\r
415                 InputStream input = new FileInputStream(result);\r
416                 Map<String, Score> sequences = readJRonn(input);\r
417                 input.close();\r
418                 return sequences;\r
419         }\r
420 \r
421         /**\r
422          * Reader for JRonn horizontal file format\r
423          * \r
424          * <pre>\r
425          * &gtFoobar M G D T T A G 0.48 0.42\r
426          * 0.42 0.48 0.52 0.53 0.54\r
427          * \r
428          * <pre>\r
429          * Where all values are tab delimited\r
430          * \r
431          * @param inStream\r
432          *            the InputStream connected to the JRonn output file\r
433          * @return Map key=sequence name value=Score\r
434          * @throws IOException\r
435          *             is thrown if the inStream has problems accessing the data\r
436          * @throws UnknownFileFormatException\r
437          *             is thrown if the inStream represents an unknown source of\r
438          * data, i.e. not a JRonn output\r
439          */\r
440         public static Map<String, Score> readJRonn(final InputStream inStream)\r
441                         throws IOException, UnknownFileFormatException {\r
442                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
443 \r
444                 final BufferedReader infasta = new BufferedReader(\r
445                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
446 \r
447                 String line;\r
448                 String sname = "";\r
449                 do {\r
450                         line = infasta.readLine();\r
451                         if (line == null || line.isEmpty()) {\r
452                                 // skip empty lines\r
453                                 continue;\r
454                         }\r
455                         if (line.startsWith(">")) {\r
456                                 // read name\r
457                                 sname = line.trim().substring(1);\r
458                                 // read sequence line\r
459                                 line = infasta.readLine();\r
460                                 final String sequence = line.replace("\t", "");\r
461                                 // read annotation line\r
462                                 line = infasta.readLine();\r
463                                 String[] annotValues = line.split("\t");\r
464                                 float[] annotation = convertToNumber(annotValues);\r
465                                 if (annotation.length != sequence.length()) {\r
466                                         throw new UnknownFileFormatException(\r
467                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
468                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
469                                 }\r
470                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
471                         }\r
472                 } while (line != null);\r
473 \r
474                 infasta.close();\r
475                 return seqs;\r
476         }\r
477 \r
478         private static float[] convertToNumber(String[] annotValues)\r
479                         throws UnknownFileFormatException {\r
480                 float[] annotation = new float[annotValues.length];\r
481                 try {\r
482                         for (int i = 0; i < annotation.length; i++) {\r
483                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
484                         }\r
485                 } catch (NumberFormatException e) {\r
486                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
487                                         e.getCause());\r
488                 }\r
489                 return annotation;\r
490         }\r
491 \r
        /**
         * Description of the expected JRonn file layout. Used as the message (or
         * a part of the message) of the UnknownFileFormatException thrown by
         * readJRonn and convertToNumber.
         */
        private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
                        + ">sequence_name\n "
                        + "M    V       S\n"
                        + "0.43 0.22    0.65\n"
                        + "Where first line is the sequence name,\n"
                        + "second line is the tab delimited sequence,\n"
                        + "third line contains tab delimited disorder prediction values.\n"
                        + "No lines are allowed between these three. Additionally, the number of  "
                        + "sequence residues must be equal to the number of the disorder values.";
501 \r
502         /**\r
503          * Closes the Closable and logs the exception if any\r
504          * \r
505          * @param log\r
506          * @param stream\r
507          */\r
508         public final static void closeSilently(java.util.logging.Logger log,\r
509                         Closeable stream) {\r
510                 if (stream != null) {\r
511                         try {\r
512                                 stream.close();\r
513                         } catch (IOException e) {\r
514                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
515                         }\r
516                 }\r
517         }\r
518 \r
519         /**\r
520          * \r
521          > Foobar_dundeefriends\r
522          * \r
523          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
524          * \r
525          * # REM465 355-368\r
526          * \r
527          * # HOTLOOPS 190-204\r
528          * \r
529          * # RESIDUE COILS REM465 HOTLOOPS\r
530          * \r
531          * M 0.86010 0.88512 0.37094\r
532          * \r
533          * T 0.79983 0.85864 0.44331\r
534          * \r
535          * >Next Sequence name\r
536          * \r
537          * \r
538          * @param input\r
539          *            the InputStream\r
540          * @return Map key=sequence name, value=set of score\r
541          * @throws IOException\r
542          * @throws UnknownFileFormatException\r
543          */\r
544         public static HashMap<String, Set<Score>> readDisembl(\r
545                         final InputStream input) throws IOException,\r
546                         UnknownFileFormatException {\r
547                 Scanner scan = new Scanner(input);\r
548                 scan.useDelimiter(">");\r
549                 if (!scan.hasNext()) {\r
550                         throw new UnknownFileFormatException(\r
551                                         "In Disembl score format each sequence score is expected "\r
552                                                         + "to start from the line: >Sequence name "\r
553                                                         + " No such line was found!");\r
554                 }\r
555 \r
556                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
557                 int seqCounter = 0;\r
558                 while (scan.hasNext()) {\r
559                         seqCounter++;\r
560                         String singleSeq = scan.next();\r
561                         Scanner scansingle = new Scanner(singleSeq);\r
562                         if (!scansingle.hasNextLine()) {\r
563                                 throw new RuntimeException(\r
564                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
565                         }\r
566 \r
567                         StringBuffer seqbuffer = new StringBuffer();\r
568                         ArrayList<Float> coils = new ArrayList<Float>();\r
569                         ArrayList<Float> rem = new ArrayList<Float>();\r
570                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
571 \r
572                         String sequenceName = scansingle.nextLine().trim();\r
573                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
574                                         scansingle.nextLine());\r
575                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
576                                         scansingle.nextLine());\r
577                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
578                                         scansingle.nextLine());\r
579 \r
580                         String title = scansingle.nextLine();\r
581                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
582 \r
583                         while (scansingle.hasNext()) {\r
584                                 seqbuffer.append(scansingle.next());\r
585                                 coils.add(scansingle.nextFloat());\r
586                                 rem.add(scansingle.nextFloat());\r
587                                 hotloops.add(scansingle.nextFloat());\r
588                         }\r
589                         /*\r
590                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
591                          * seqbuffer.toString());\r
592                          */\r
593                         HashSet<Score> scores = new HashSet<Score>();\r
594                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
595                         scores.add(new Score(DisemblResult.REM465, rem, rem465R));\r
596                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));\r
597                         results.put(sequenceName, scores);\r
598 \r
599                         scansingle.close();\r
600                 }\r
601                 scan.close();\r
602                 input.close();\r
603                 return results;\r
604         }\r
605 \r
606         /**\r
607          * Parsing:\r
608          * \r
609          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
610          * 350-391, 429-485, 497-506, 539-547\r
611          * \r
612          * # REM465 355-368\r
613          * \r
614          * # HOTLOOPS 190-204\r
615          * \r
616          * @param lines\r
617          * @return\r
618          */\r
619         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
620                 TreeSet<Range> ranges = new TreeSet<Range>();\r
621 \r
622                 Scanner scan = new Scanner(lines);\r
623 \r
624                 assert scan.hasNext();\r
625                 String del = scan.next();\r
626                 assert "#".equals(del); // pass delimiter #\r
627                 String type = scan.next(); // pass enum name e.g. COILS\r
628                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
629                                 + resultType.toString();\r
630 \r
631                 // beginning of the ranges\r
632                 scan.useDelimiter(",");\r
633                 while (scan.hasNext()) {\r
634                         String range = scan.next();\r
635                         if (!Util.isEmpty(range)) {\r
636                                 ranges.add(new Range(range.split("-")));\r
637                         }\r
638                 }\r
639                 return ranges;\r
640         }\r
641 \r
642         /**\r
643          * \r
644          > Foobar_dundeefriends\r
645          * \r
646          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
647          * \r
648          * # REM465 355-368\r
649          * \r
650          * # HOTLOOPS 190-204\r
651          * \r
652          * # RESIDUE COILS REM465 HOTLOOPS\r
653          * \r
654          * M 0.86010 0.88512 0.37094\r
655          * \r
656          * T 0.79983 0.85864 0.44331\r
657          * \r
658          * >Next Sequence name\r
659          * \r
660          * \r
661          * @param input\r
662          * @return Map key=sequence name, value=set of score\r
663          * @throws IOException\r
664          * @throws UnknownFileFormatException\r
665          */\r
666         public static HashMap<String, Set<Score>> readGlobPlot(\r
667                         final InputStream input) throws IOException,\r
668                         UnknownFileFormatException {\r
669                 Scanner scan = new Scanner(input);\r
670                 scan.useDelimiter(">");\r
671                 if (!scan.hasNext()) {\r
672                         throw new UnknownFileFormatException(\r
673                                         "In GlobPlot score format each sequence score is expected "\r
674                                                         + "to start from the line: >Sequence name "\r
675                                                         + " No such line was found!");\r
676                 }\r
677 \r
678                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
679                 int seqCounter = 0;\r
680                 while (scan.hasNext()) {\r
681                         seqCounter++;\r
682                         String singleSeq = scan.next();\r
683                         Scanner scansingle = new Scanner(singleSeq);\r
684                         if (!scansingle.hasNextLine()) {\r
685                                 throw new RuntimeException(\r
686                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
687                         }\r
688 \r
689                         StringBuffer seqbuffer = new StringBuffer();\r
690                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
691                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
692                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
693 \r
694                         String sequenceName = scansingle.nextLine().trim();\r
695                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
696                                         scansingle.nextLine());\r
697                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
698                                         scansingle.nextLine());\r
699 \r
700                         String title = scansingle.nextLine();\r
701                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
702 \r
703                         while (scansingle.hasNext()) {\r
704                                 seqbuffer.append(scansingle.next());\r
705                                 dydxScore.add(scansingle.nextFloat());\r
706                                 rawScore.add(scansingle.nextFloat());\r
707                                 smoothedScore.add(scansingle.nextFloat());\r
708                         }\r
709                         /*\r
710                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
711                          * seqbuffer.toString());\r
712                          */\r
713                         Set<Score> scores = new TreeSet<Score>();\r
714                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
715                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
716                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
717                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
718                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
719                         results.put(sequenceName, scores);\r
720 \r
721                         scansingle.close();\r
722                 }\r
723                 scan.close();\r
724                 input.close();\r
725                 return results;\r
726         }\r
727         /**\r
728          * Read AACon result with no alignment files. This method leaves incoming\r
729          * InputStream open!\r
730          * \r
731          * @param results\r
732          *            output file of AAConservation\r
733          * @return Map with keys {@link ConservationMethod} -> float[]\r
734          */\r
735         public static HashSet<Score> readAAConResults(InputStream results) {\r
736                 if (results == null) {\r
737                         throw new NullPointerException(\r
738                                         "InputStream with results must be provided");\r
739                 }\r
740                 HashSet<Score> annotations = new HashSet<Score>();\r
741                 Scanner sc = new Scanner(results);\r
742                 sc.useDelimiter("#");\r
743                 while (sc.hasNext()) {\r
744                         String line = sc.next();\r
745                         int spacePos = line.indexOf(" ");\r
746                         assert spacePos > 0 : "Space is expected as delimited between method "\r
747                                         + "name and values!";\r
748                         String methodLine = line.substring(0, spacePos);\r
749                         ConservationMethod method = ConservationMethod\r
750                                         .getMethod(methodLine);\r
751                         assert method != null : "Method " + methodLine\r
752                                         + " is not recognized! ";\r
753                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
754                         ArrayList<Float> values = new ArrayList<Float>();\r
755                         while (valuesScanner.hasNextDouble()) {\r
756                                 Double value = valuesScanner.nextDouble();\r
757                                 values.add(value.floatValue());\r
758                         }\r
759                         annotations.add(new Score(method, values));\r
760                 }\r
761                 return annotations;\r
762         }\r
763 \r
764         \r
765         \r
766 \r
767         /**\r
768          * Reads and parses Fasta or Clustal formatted file into a list of\r
769          * FastaSequence objects\r
770          * \r
771          * @param inFilePath\r
772          *            the path to the input file\r
773          * @throws IOException\r
774          *             if the file denoted by inFilePath cannot be read\r
775          * @throws UnknownFileFormatException\r
776          *             if the inFilePath points to the file which format cannot be\r
777          *             recognised\r
778          * @return the List of FastaSequence objects\r
779          * \r
780          */\r
781         public static List<FastaSequence> openInputStream(String inFilePath)\r
782                         throws IOException, UnknownFileFormatException {\r
783 \r
784                 // This stream gets closed in isValidClustalFile method\r
785                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
786                 // This stream is closed in the calling methods\r
787                 InputStream inStr = new FileInputStream(inFilePath);\r
788                 List<FastaSequence> fastaSeqs = null;\r
789                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
790                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
791                         // alignment cannot be null see\r
792                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
793                         fastaSeqs = al.getSequences();\r
794                 } else {\r
795                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
796                 }\r
797                 return fastaSeqs;\r
798         }\r
799 \r
800         // This can't possibly be right for all cases!\r
801         // but it will do for now\r
802         \r
803         // As for the metadata. This function doesnt know what program\r
804         // generated it. How to handle the metadata!?\r
805         \r
806         public static void writeClustal(OutputStream outStream,\r
807                         List<FastaSequence> sequences, char gapChar) \r
808                         throws IOException {\r
809                 \r
810                 BufferedWriter writer = new BufferedWriter(\r
811                                 new OutputStreamWriter(outStream));\r
812                 // will give AlignmentMetadata default type of CLUSTAL for now\r
813                 AlignmentMetadata al = new AlignmentMetadata(Program.CLUSTAL, gapChar);\r
814                 \r
815                 ClustalAlignmentUtil.writeClustalAlignment(writer, \r
816                                 new Alignment(sequences, al));\r
817                 \r
818         }\r
819 \r
820 }\r
821 \r
/**
 * Result types of Disembl. Each of these contains ranges and scores.
 */
enum DisemblResult {
        COILS,
        REM465,
        HOTLOOPS
}
/**
 * Result types of GlobPlot. Two of them are ranges without scores, the other
 * three are scores without a range.
 */
enum GlobProtResult {
        /** This is a range with no scores */
        GlobDoms,

        /** This is a range with no scores */
        Disorder,

        /** This is a score with no range */
        Dydx,

        /** This is a score with no range */
        SmoothedScore,

        /** This is a score with no range */
        RawScore;
}
838 \r
/**
 * Result types of IUPred. The type of a result file is recognised by the
 * ending of its name.
 */
enum IUPredResult {
        /** Short disorder */
        Short,

        /** Long disorder */
        Long,

        /** Globular domains */
        Glob;

        /**
         * Determines the result type from the name of the file: the name must
         * end with the lower case name of one of the constants.
         * 
         * @param file
         *            the IUPred result file, must not be null
         * @return the type matching the ending of the file name
         * @throws AssertionError
         *             if the file name ends with none of the known type names
         */
        static IUPredResult getType(File file) {
                assert file != null;
                final String fileName = file.getName();
                // Probed in the order long, short, glob
                final IUPredResult[] candidates = { Long, Short, Glob };
                for (IUPredResult candidate : candidates) {
                        String suffix = candidate.toString().toLowerCase();
                        if (fileName.endsWith(suffix)) {
                                return candidate;
                        }
                }
                throw new AssertionError(
                                "IUPred result file type cannot be recognised! "
                                                + "\nFile must ends with one of [glob, long or short]"
                                                + "\n but given file name was: " + file.getName());
        }
}