RNAStruct replaced by RNAStructScoreManager. Why does webservice
[jabaws.git] / datamodel / compbio / data / sequence / RNAStructReader.java
1 package compbio.data.sequence;\r
2 \r
3 import java.io.BufferedReader;\r
4 import java.io.InputStream;\r
5 import java.io.InputStreamReader;\r
6 import java.io.IOException;\r
7 import java.util.ArrayList;\r
8 import java.util.Arrays;\r
9 import java.util.List;\r
10 import java.util.Scanner;\r
11 import java.util.TreeSet;\r
12 import java.util.regex.Pattern;\r
13 \r
14 import org.apache.log4j.Logger;\r
15 \r
16 import compbio.runner.structure.RNAalifold;\r
17 \r
18 // Utility class for reading alifold output\r
19 \r
20 public class RNAStructReader {\r
21 \r
22         private static Logger log = Logger.getLogger(RNAStructReader.class);\r
23         \r
24         // Whitespace patterns\r
25         static String s = "[+\\s=]+";\r
26         static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";\r
27         static String notData = "[\\s=+]+";\r
28 \r
29         // RNAOut data type patterns \r
30         static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output aswell (not just ACGU_)\r
31         static String structP = "[\\.)({}\\[\\],]{2,}";\r
32         static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";\r
33         static String energyP = "-?[0-9]*\\.?[0-9]{2}";\r
34         static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";\r
35         \r
36         // alifold out line patterns\r
37         static String ps = "\\s*";\r
38         static String alignmentP = "^"+seqP+ps+"$";\r
39         static String stdStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";\r
40         static String justStructP = "^"+structP+ps+"$";\r
41         static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";\r
42         static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";\r
43         static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";\r
44         static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";\r
45         static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";\r
46         static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";\r
47 \r
48         public static RNAStructScoreManager readRNAStructStream(InputStream stdout)\r
49                         throws IOException {\r
50                 \r
51                 String error = "Error in parsing alifold stdout file: ";\r
52                 // The Lists required to construct a ScoreManager Using the new constructor\r
53                 List<String> structs = new ArrayList<String>();\r
54                 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();\r
55 \r
56                 // Allocate necessry data structures for creating Score objects\r
57                 ArrayList<Float> scores = new ArrayList<Float>();\r
58 \r
59                 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));\r
60                 // The first 2 lines of the alifold stdout file are always the same format\r
61                 String fline = reader.readLine();\r
62                 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :\r
63                         error + "Sequence Alignment Expected";\r
64                 structs.add(fline.trim());\r
65                 data.add(newEmptyScore(AlifoldResult.alifoldSeq));\r
66                 \r
67                 fline = reader.readLine();\r
68                 assert (Pattern.matches(AlifoldLine.stdStruct.regex, fline)) :\r
69                         error + "Consensus Structure and Energy Expected";\r
70                 Scanner sc = new Scanner(fline);\r
71                 structs.add(sc.next());\r
72                 for (int i = 0; i < 3; i++) {\r
73                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
74                 }\r
75                 data.add(newSetScore(AlifoldResult.alifold, scores));\r
76                 \r
77                 // Now the alifold stdout file formats diverge based on arguments\r
78                 fline = reader.readLine();\r
79                 String sline;\r
80                 Scanner nsc = null;\r
81                 while ( fline != null) {\r
82                         scores.clear();\r
83                         AlifoldLine ftype = identifyLine(fline);\r
84                         sline = reader.readLine(); // Look ahead\r
85                         sc = new Scanner(fline);\r
86                         if (sline != null) nsc = new Scanner(sline);\r
87 \r
88                         if (ftype.equals(AlifoldLine.PStruct)) {\r
89                                 // The -p or --MEA option is specified\r
90                                 // The next line should always be frequency of mfe structure\r
91                                 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
92                                         error + "Expected frequency of mfe structure";\r
93                                 structs.add(sc.next());\r
94                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
95                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
96                                 data.add(newSetScore(AlifoldResult.alifoldP, scores));\r
97                                 // Jump line\r
98                                 sline = reader.readLine();\r
99                         }\r
100                         else if (ftype.equals(AlifoldLine.centStruct)) {\r
101                                 structs.add(sc.next());\r
102                                 for (int i = 0; i < 3; i++) {\r
103                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
104                                 }\r
105                                 data.add(newSetScore(AlifoldResult.alifoldCentroid, scores));\r
106                         }\r
107                         else if (ftype.equals(AlifoldLine.MEAStruct)) {\r
108                                 structs.add(sc.next());\r
109                                 for (int i = 0; i < 2; i++) {\r
110                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
111                                 }\r
112                                 data.add(newSetScore(AlifoldResult.alifoldMEA, scores));\r
113                         }\r
114                         else if (ftype.equals(AlifoldLine.justStruct)) {\r
115                                 structs.add(sc.next());\r
116                                 data.add(newEmptyScore(AlifoldResult.alifoldStochBT));\r
117                         }\r
118                         else if (ftype.equals(AlifoldLine.stochBTStruct)) {\r
119                                 structs.add(sc.next());\r
120                                 scores.add(sc.nextFloat());\r
121                                 scores.add(sc.nextFloat());\r
122                                 data.add(newSetScore(AlifoldResult.alifoldStochBT, scores));\r
123                         }\r
124                         else if (ftype.equals(AlifoldLine.freeEnergy)) {\r
125                                 assert (sline != null \r
126                                                 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
127                                                 error + "Found 'freeEnergy' line on its own";\r
128                                 structs.add("Free energy of ensemble (kcal/mol) followed by "\r
129                                                 + "frequency of mfe structure in ensemble");\r
130                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
131                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
132                                 data.add(newSetScore(AlifoldResult.alifoldMetadata, scores));\r
133                                 // jump line\r
134                                 sline = reader.readLine();\r
135                         }\r
136                         \r
137 \r
138                         assert(!ftype.equals(AlifoldLine.ensembleFreq)) :\r
139                                 error + "Wasn't expecting 'frequency of mfe structure'!";\r
140                         assert(!ftype.equals(AlifoldLine.stdStruct)) :\r
141                                 error + "'Standard output' line at a place other than line 2!";\r
142                         assert(!ftype.equals(AlifoldLine.alignment)) :\r
143                                 error + "Wasn't expecting an alignment sequence!";\r
144                         assert(!ftype.equals(AlifoldLine.OTHER)) :\r
145                                 error + "Wasn't expecting this whatever it is: " + fline;\r
146                         if (Pattern.matches("^\\s*$", fline)) {\r
147                                 log.warn("While parsing alifold stdout: A line is either empty or"\r
148                                                 + " contains only whitespace");\r
149                         }\r
150                         \r
151                         fline = sline;\r
152                 }\r
153                                 \r
154                 sc.close();\r
155                 if (nsc != null) nsc.close();\r
156                 \r
157                 return new RNAStructScoreManager(structs, data);\r
158         }\r
159         \r
160         // Just for the purpose of creating nee TreeSet<Score> objects of length one\r
161         // for adding to a 'data' list to make a ScoreManager\r
162         private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {\r
163                 // first convert List<Float> to float[]\r
164                 float[] scoresf = new float[scores.size()];\r
165                 Float f;\r
166                 for (int i = 0; i < scoresf.length; i++) {\r
167                         f = scores.get(i);\r
168                         scoresf[i] = ( f != null ? f : Float.NaN);\r
169                 }\r
170                 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));\r
171         }\r
172 \r
173         // A method just for the purpose of neatly creating Almost Empty score objects\r
174         // that can't be null\r
175         public static TreeSet<Score> newEmptyScore(Enum<?> res) {\r
176                 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));\r
177         }\r
178 \r
179         public static RNAStructScoreManager readRNAStructStream(InputStream stdout, \r
180                         InputStream alifold) throws IOException {\r
181                 \r
182                 // The Lists required to construct a ScoreManager Using the new constructor\r
183                 List<String> structs;\r
184                 List<TreeSet<Score>> data; \r
185                 \r
186                 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)\r
187                 RNAStructScoreManager stdSM = readRNAStructStream(stdout);\r
188                 \r
189                 // Unpack this into the structs and data lists\r
190                 structs = stdSM.getStructs();\r
191                 data = stdSM.getData();\r
192                 \r
193                 // Now parse alifold.out\r
194                 Scanner sc = new Scanner(alifold);\r
195                 sc.useDelimiter("[\\s%]+");\r
196                 \r
197                 // jump two lines to the data \r
198                 sc.nextLine(); sc.nextLine();\r
199                 \r
200                 // Read the first, second and fourth columns. Ignoring everything else.\r
201                 // Allocate necessry data structures for creating Score objects\r
202                 ArrayList<Float> scores = new ArrayList<Float>();\r
203                 List<Range> rangeHolder = new ArrayList<Range>();\r
204                 String s = "null";\r
205                 while (true) {\r
206                         s = sc.next();\r
207                         if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;\r
208                         if (!sc.hasNextLine()) break;\r
209                         int t = sc.nextInt();\r
210                         rangeHolder.add(new Range(Integer.parseInt(s), t));\r
211                         sc.next();\r
212                         scores.add(sc.nextFloat());\r
213                         sc.nextLine();\r
214                 }\r
215                 sc.close();\r
216                 \r
217                 // Update the first ScoreHolder TreeSet<Score> element\r
218                 assert (rangeHolder.size() == scores.size());\r
219                 TreeSet<Score> sHolder = new TreeSet<Score>();\r
220                 for (int i = 0; i < rangeHolder.size(); i++) {\r
221                         ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));\r
222                         TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));\r
223                         sHolder.add(new Score(AlifoldResult.alifoldSeq, singleS, singleR));\r
224                 }\r
225                 \r
226                 data.set(0, sHolder);\r
227                 \r
228                 return new RNAStructScoreManager(structs, data);\r
229         }\r
230 \r
231         private static RNAOut identify(String token) {\r
232                 if (Pattern.matches(seqP, token)) {\r
233                         return RNAOut.SEQ;\r
234                 } else if (Pattern.matches(structP, token)) {\r
235                         return RNAOut.STRUCT;\r
236                 } else if (Pattern.matches(energyP, token)) {\r
237                         return RNAOut.ENERGY;\r
238                 } else if (Pattern.matches(freqP, token)) {\r
239                         return RNAOut.FREQ;\r
240                 }\r
241                 \r
242                 return RNAOut.OTHER;\r
243         }\r
244         \r
245         private static AlifoldLine identifyLine(String line) {\r
246                 \r
247                 for (AlifoldLine il : AlifoldLine.values()) {\r
248                         if (Pattern.matches(il.regex, line)) return il;\r
249                 }\r
250                 return AlifoldLine.OTHER;\r
251         }\r
252         \r
253         static enum AlifoldLine {\r
254                 stdStruct (stdStructP),\r
255                 justStruct (justStructP),\r
256                 stochBTStruct (stochBTStructP),\r
257                 PStruct (PStructP),\r
258                 centStruct (centStructP),\r
259                 MEAStruct (MEAStructP),\r
260                 freeEnergy (freeEnergyP),\r
261                 ensembleFreq (ensembleFreqP),\r
262                 alignment (alignmentP), \r
263                 OTHER (".*");\r
264                 \r
265                 String regex;\r
266                 AlifoldLine(String regex) { this.regex = regex; }\r
267 \r
268         }\r
269         \r
270         //The types of data in an RNAalifold stdout file\r
271         static enum RNAOut {\r
272                 SEQ, STRUCT, ENERGY, FREQ, OTHER\r
273         }\r
274 \r
275         //Something to put in the Score objects of the alifold result which gives information\r
276         //about what kind of sequence it is holding in its String Id.\r
277         static enum AlifoldResult {\r
278                 alifold, alifoldP, alifoldMEA, alifoldCentroid, alifoldStochBT, alifoldSeq, alifoldMetadata\r
279         }\r
280         \r
281         \r
282 \r
283         // Print the full regex Strings for testing \r
284         public static void main(String[] args) {\r
285                 for (AlifoldLine l : AlifoldLine.values()) {\r
286                         System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));\r
287                 }\r
288         }\r
289         \r
290 \r
291         \r
292 }       \r