5eec46f167d1a91b8ef2dc6b3ad04259adc39366
[jabaws.git] / datamodel / compbio / data / sequence / RNAStructReader.java
1 package compbio.data.sequence;\r
2 \r
3 import java.io.BufferedReader;\r
4 import java.io.InputStream;\r
5 import java.io.InputStreamReader;\r
6 import java.io.IOException;\r
7 import java.util.ArrayList;\r
8 import java.util.Arrays;\r
9 import java.util.List;\r
10 import java.util.Scanner;\r
11 import java.util.TreeSet;\r
12 import java.util.regex.Pattern;\r
13 \r
14 import org.apache.log4j.Logger;\r
15 \r
16 // Utility class for reading alifold output\r
17 \r
18 public class RNAStructReader {\r
19 \r
20         private static Logger log = Logger.getLogger(RNAStructReader.class);\r
21         \r
22         // Whitespace patterns\r
23         static String s = "[+\\s=]+";\r
24         static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";\r
25         static String notData = "[\\s=+]+";\r
26 \r
27         // RNAOut data type patterns \r
28         static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output aswell (not just ACGU_)\r
29         static String structP = "[\\.)({}\\[\\],]{2,}";\r
30         static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";\r
31         static String energyP = "-?[0-9]*\\.?[0-9]{2}";\r
32         static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";\r
33         \r
34         // alifold out line patterns\r
35         static String ps = "\\s*";\r
36         static String alignmentP = "^"+seqP+ps+"$";\r
37         static String stdStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";\r
38         static String justStructP = "^"+structP+ps+"$";\r
39         static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";\r
40         static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";\r
41         static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";\r
42         static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";\r
43         static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";\r
44         static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";\r
45 \r
46         public static RNAStructScoreManager readRNAStructStream(InputStream stdout)\r
47                         throws IOException {\r
48                 \r
49                 String error = "Error in parsing alifold stdout file: ";\r
50                 // The Lists required to construct a ScoreManager Using the new constructor\r
51                 List<String> structs = new ArrayList<String>();\r
52                 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();\r
53 \r
54                 // Allocate necessry data structures for creating Score objects\r
55                 ArrayList<Float> scores = new ArrayList<Float>();\r
56 \r
57                 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));\r
58                 // The first 2 lines of the alifold stdout file are always the same format\r
59                 String fline = reader.readLine();\r
60                 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :\r
61                         error + "Sequence Alignment Expected";\r
62                 structs.add(fline.trim());\r
63                 data.add(newEmptyScore(AlifoldResult.alifoldSeq));\r
64                 \r
65                 fline = reader.readLine();\r
66                 assert (Pattern.matches(AlifoldLine.stdStruct.regex, fline)) :\r
67                         error + "Consensus Structure and Energy Expected";\r
68                 Scanner sc = new Scanner(fline);\r
69                 structs.add(sc.next());\r
70                 for (int i = 0; i < 3; i++) {\r
71                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
72                 }\r
73                 data.add(newSetScore(AlifoldResult.alifold, scores));\r
74                 \r
75                 // Now the alifold stdout file formats diverge based on arguments\r
76                 fline = reader.readLine();\r
77                 String sline;\r
78                 Scanner nsc = null;\r
79                 while ( fline != null) {\r
80                         scores.clear();\r
81                         AlifoldLine ftype = identifyLine(fline);\r
82                         sline = reader.readLine(); // Look ahead\r
83                         sc = new Scanner(fline);\r
84                         if (sline != null) nsc = new Scanner(sline);\r
85 \r
86                         if (ftype.equals(AlifoldLine.PStruct)) {\r
87                                 // The -p or --MEA option is specified\r
88                                 // The next line should always be frequency of mfe structure\r
89                                 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
90                                         error + "Expected frequency of mfe structure";\r
91                                 structs.add(sc.next());\r
92                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
93                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
94                                 data.add(newSetScore(AlifoldResult.alifoldP, scores));\r
95                                 // Jump line\r
96                                 sline = reader.readLine();\r
97                         }\r
98                         else if (ftype.equals(AlifoldLine.centStruct)) {\r
99                                 structs.add(sc.next());\r
100                                 for (int i = 0; i < 3; i++) {\r
101                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
102                                 }\r
103                                 data.add(newSetScore(AlifoldResult.alifoldCentroid, scores));\r
104                         }\r
105                         else if (ftype.equals(AlifoldLine.MEAStruct)) {\r
106                                 structs.add(sc.next());\r
107                                 for (int i = 0; i < 2; i++) {\r
108                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
109                                 }\r
110                                 data.add(newSetScore(AlifoldResult.alifoldMEA, scores));\r
111                         }\r
112                         else if (ftype.equals(AlifoldLine.justStruct)) {\r
113                                 structs.add(sc.next());\r
114                                 data.add(newEmptyScore(AlifoldResult.alifoldStochBT));\r
115                         }\r
116                         else if (ftype.equals(AlifoldLine.stochBTStruct)) {\r
117                                 structs.add(sc.next());\r
118                                 scores.add(sc.nextFloat());\r
119                                 scores.add(sc.nextFloat());\r
120                                 data.add(newSetScore(AlifoldResult.alifoldStochBT, scores));\r
121                         }\r
122                         else if (ftype.equals(AlifoldLine.freeEnergy)) {\r
123                                 assert (sline != null \r
124                                                 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
125                                                 error + "Found 'freeEnergy' line on its own";\r
126                                 structs.add("Free energy of ensemble (kcal/mol) followed by "\r
127                                                 + "frequency of mfe structure in ensemble");\r
128                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
129                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
130                                 data.add(newSetScore(AlifoldResult.alifoldMetadata, scores));\r
131                                 // jump line\r
132                                 sline = reader.readLine();\r
133                         }\r
134                         \r
135 \r
136                         assert(!ftype.equals(AlifoldLine.ensembleFreq)) :\r
137                                 error + "Wasn't expecting 'frequency of mfe structure'!";\r
138                         assert(!ftype.equals(AlifoldLine.stdStruct)) :\r
139                                 error + "'Standard output' line at a place other than line 2!";\r
140                         assert(!ftype.equals(AlifoldLine.alignment)) :\r
141                                 error + "Wasn't expecting an alignment sequence!";\r
142                         assert(!ftype.equals(AlifoldLine.OTHER)) :\r
143                                 error + "Wasn't expecting this whatever it is: " + fline;\r
144                         if (Pattern.matches("^\\s*$", fline)) {\r
145                                 log.warn("While parsing alifold stdout: A line is either empty or"\r
146                                                 + " contains only whitespace");\r
147                         }\r
148                         \r
149                         fline = sline;\r
150                 }\r
151                                 \r
152                 sc.close();\r
153                 if (nsc != null) nsc.close();\r
154                 \r
155                 return new RNAStructScoreManager(structs, data);\r
156         }\r
157         \r
158         // Just for the purpose of creating new TreeSet<Score> objects of length one\r
159         // for adding to a 'data' list to make a ScoreManager\r
160         private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {\r
161                 // first convert List<Float> to float[]\r
162                 float[] scoresf = new float[scores.size()];\r
163                 Float f;\r
164                 for (int i = 0; i < scoresf.length; i++) {\r
165                         f = scores.get(i);\r
166                         scoresf[i] = ( f != null ? f : Float.NaN);\r
167                 }\r
168                 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));\r
169         }\r
170 \r
171         // A method just for the purpose of neatly creating Almost Empty score objects\r
172         // that can't be null\r
173         public static TreeSet<Score> newEmptyScore(Enum<?> res) {\r
174                 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));\r
175         }\r
176 \r
177         public static RNAStructScoreManager readRNAStructStream(InputStream stdout, \r
178                         InputStream alifold) throws IOException {\r
179                 \r
180                 // The Lists required to construct a ScoreManager Using the new constructor\r
181                 List<String> structs;\r
182                 List<TreeSet<Score>> data; \r
183                 \r
184                 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)\r
185                 RNAStructScoreManager stdSM = readRNAStructStream(stdout);\r
186                 \r
187                 // Unpack this into the structs and data lists\r
188                 structs = stdSM.getStructs();\r
189                 data = stdSM.getData();\r
190                 \r
191                 // Now parse alifold.out\r
192                 Scanner sc = new Scanner(alifold);\r
193                 sc.useDelimiter("[\\s%]+");\r
194                 \r
195                 // jump two lines to the data \r
196                 sc.nextLine(); sc.nextLine();\r
197                 \r
198                 // Read the first, second and fourth columns. Ignoring everything else.\r
199                 // Allocate necessry data structures for creating Score objects\r
200                 ArrayList<Float> scores = new ArrayList<Float>();\r
201                 List<Range> rangeHolder = new ArrayList<Range>();\r
202                 String s = "null";\r
203                 while (true) {\r
204                         s = sc.next();\r
205                         if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;\r
206                         if (!sc.hasNextLine()) break;\r
207                         int t = sc.nextInt();\r
208                         rangeHolder.add(new Range(Integer.parseInt(s), t));\r
209                         sc.next();\r
210                         scores.add(sc.nextFloat());\r
211                         sc.nextLine();\r
212                 }\r
213                 sc.close();\r
214                 \r
215                 // Update the first ScoreHolder TreeSet<Score> element\r
216                 assert (rangeHolder.size() == scores.size());\r
217                 TreeSet<Score> sHolder = new TreeSet<Score>();\r
218                 for (int i = 0; i < rangeHolder.size(); i++) {\r
219                         ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));\r
220                         TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));\r
221                         sHolder.add(new Score(AlifoldResult.alifoldSeq, singleS, singleR));\r
222                 }\r
223                 \r
224                 data.set(0, sHolder);\r
225                 \r
226                 return new RNAStructScoreManager(structs, data);\r
227         }\r
228 \r
229         private static RNAOut identify(String token) {\r
230                 if (Pattern.matches(seqP, token)) {\r
231                         return RNAOut.SEQ;\r
232                 } else if (Pattern.matches(structP, token)) {\r
233                         return RNAOut.STRUCT;\r
234                 } else if (Pattern.matches(energyP, token)) {\r
235                         return RNAOut.ENERGY;\r
236                 } else if (Pattern.matches(freqP, token)) {\r
237                         return RNAOut.FREQ;\r
238                 }\r
239                 \r
240                 return RNAOut.OTHER;\r
241         }\r
242         \r
243         private static AlifoldLine identifyLine(String line) {\r
244                 \r
245                 for (AlifoldLine il : AlifoldLine.values()) {\r
246                         if (Pattern.matches(il.regex, line)) return il;\r
247                 }\r
248                 return AlifoldLine.OTHER;\r
249         }\r
250         \r
251         static enum AlifoldLine {\r
252                 stdStruct (stdStructP),\r
253                 justStruct (justStructP),\r
254                 stochBTStruct (stochBTStructP),\r
255                 PStruct (PStructP),\r
256                 centStruct (centStructP),\r
257                 MEAStruct (MEAStructP),\r
258                 freeEnergy (freeEnergyP),\r
259                 ensembleFreq (ensembleFreqP),\r
260                 alignment (alignmentP), \r
261                 OTHER (".*");\r
262                 \r
263                 String regex;\r
264                 AlifoldLine(String regex) { this.regex = regex; }\r
265 \r
266         }\r
267         \r
268         //The types of data in an RNAalifold stdout file\r
269         static enum RNAOut {\r
270                 SEQ, STRUCT, ENERGY, FREQ, OTHER\r
271         }\r
272 \r
273         //Something to put in the Score objects of the alifold result which gives information\r
274         //about what kind of sequence it is holding in its String Id.\r
275         static enum AlifoldResult {\r
276                 alifold, alifoldP, alifoldMEA, alifoldCentroid, alifoldStochBT, alifoldSeq, alifoldMetadata\r
277         }\r
278         \r
279         \r
280 \r
281         // Print the full regex Strings for testing \r
282         public static void main(String[] args) {\r
283                 for (AlifoldLine l : AlifoldLine.values()) {\r
284                         System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));\r
285                 }\r
286         }\r
287         \r
288 \r
289         \r
290 }       \r