Clean up logging system
[jabaws.git] / datamodel / compbio / data / sequence / RNAStructReader.java
1 package compbio.data.sequence;\r
2 \r
3 import java.io.BufferedReader;\r
4 import java.io.InputStream;\r
5 import java.io.InputStreamReader;\r
6 import java.io.IOException;\r
7 import java.util.ArrayList;\r
8 import java.util.Arrays;\r
9 import java.util.List;\r
10 import java.util.Scanner;\r
11 import java.util.TreeSet;\r
12 import java.util.regex.Pattern;\r
13 \r
14 import org.apache.log4j.Logger;\r
15 \r
16 // Utility class for reading alifold output\r
17 \r
18 public class RNAStructReader {\r
19 \r
20         // Whitespace patterns\r
21         static String s = "[+\\s=]+";\r
22         static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";\r
23         static String notData = "[\\s=+]+";\r
24 \r
25         // RNAOut data type patterns \r
26         static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output aswell (not just ACGU_)\r
27         static String structP = "[\\.)({}\\[\\],]{2,}";\r
28         static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";\r
29         static String energyP = "-?[0-9]*\\.?[0-9]{2}";\r
30         static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";\r
31         \r
32         // alifold out line patterns\r
33         static String ps = "\\s*";\r
34         static String alignmentP = "^"+seqP+ps+"$";\r
35         static String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";\r
36         static String justStructP = "^"+structP+ps+"$";\r
37         static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";\r
38         static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";\r
39         static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";\r
40         static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";\r
41         static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";\r
42         static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";\r
43         \r
44         \r
45         public static RNAStructScoreManager readRNAStructStream(InputStream stdout)\r
46                         throws IOException {\r
47                 \r
48                 String error = "Error in parsing alifold stdout file: ";\r
49                 // The Lists required to construct a ScoreManager Using the new constructor\r
50                 List<String> structs = new ArrayList<String>();\r
51                 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();\r
52 \r
53                 // Allocate necessry data structures for creating Score objects\r
54                 ArrayList<Float> scores = new ArrayList<Float>();\r
55 \r
56                 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));\r
57                 // The first 2 lines of the alifold stdout file are always the same format\r
58                 String fline = reader.readLine();\r
59                 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :\r
60                         error + "Sequence Alignment Expected";\r
61                 structs.add(fline.trim());\r
62                 data.add(newEmptyScore(AlifoldResult.consensusAlignment));\r
63                 \r
64                 fline = reader.readLine();\r
65                 assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :\r
66                         error + "Consensus Structure and Energy Expected";\r
67                 Scanner sc = new Scanner(fline);\r
68                 structs.add(sc.next());\r
69                 for (int i = 0; i < 3; i++) {\r
70                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
71                 }\r
72                 data.add(newSetScore(AlifoldResult.mfeStructure, scores));\r
73                 \r
74                 // Now the alifold stdout file formats diverge based on arguments\r
75                 fline = reader.readLine();\r
76                 String sline;\r
77                 Scanner nsc = null;\r
78                 while ( fline != null) {\r
79                         scores.clear();\r
80                         AlifoldLine ftype = identifyLine(fline);\r
81                         sline = reader.readLine(); // Look ahead\r
82                         sc = new Scanner(fline);\r
83                         if (sline != null) nsc = new Scanner(sline);\r
84 \r
85                         if (ftype.equals(AlifoldLine.PStruct)) {\r
86                                 // The -p or --MEA option is specified\r
87                                 // The next line should always be frequency of mfe structure\r
88                                 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
89                                         error + "Expected frequency of mfe structure";\r
90                                 structs.add(sc.next());\r
91                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
92                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
93                                 data.add(newSetScore(AlifoldResult.contactProbabilityStructure, scores));\r
94                                 // Jump line\r
95                                 sline = reader.readLine();\r
96                         }\r
97                         else if (ftype.equals(AlifoldLine.centStruct)) {\r
98                                 structs.add(sc.next());\r
99                                 for (int i = 0; i < 3; i++) {\r
100                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
101                                 }\r
102                                 data.add(newSetScore(AlifoldResult.centroidStructure, scores));\r
103                         }\r
104                         else if (ftype.equals(AlifoldLine.MEAStruct)) {\r
105                                 structs.add(sc.next());\r
106                                 for (int i = 0; i < 2; i++) {\r
107                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
108                                 }\r
109                                 data.add(newSetScore(AlifoldResult.MEAStucture, scores));\r
110                         }\r
111                         else if (ftype.equals(AlifoldLine.justStruct)) {\r
112                                 structs.add(sc.next());\r
113                                 data.add(newEmptyScore(AlifoldResult.stochBTStructure));\r
114                         }\r
115                         else if (ftype.equals(AlifoldLine.stochBTStruct)) {\r
116                                 structs.add(sc.next());\r
117                                 scores.add(sc.nextFloat());\r
118                                 scores.add(sc.nextFloat());\r
119                                 data.add(newSetScore(AlifoldResult.stochBTStructure, scores));\r
120                         }\r
121                         else if (ftype.equals(AlifoldLine.freeEnergy)) {\r
122                                 assert (sline != null \r
123                                                 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
124                                                 error + "Found 'freeEnergy' line on its own";\r
125                                 structs.add("Free energy of ensemble (kcal/mol) followed by frequency of mfe structure in ensemble");\r
126                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
127                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
128                                 data.add(newSetScore(AlifoldResult.ensembleValues, scores));\r
129                                 // jump line\r
130                                 sline = reader.readLine();\r
131                         }\r
132 \r
133                         assert(!ftype.equals(AlifoldLine.ensembleFreq)) :\r
134                                 error + "Wasn't expecting 'frequency of mfe structure'!";\r
135                         assert(!ftype.equals(AlifoldLine.mfeStruct)) :\r
136                                 error + "'Standard output' line at a place other than line 2!";\r
137                         assert(!ftype.equals(AlifoldLine.alignment)) :\r
138                                 error + "Wasn't expecting an alignment sequence!";\r
139                         assert(!ftype.equals(AlifoldLine.OTHER)) :\r
140                                 error + "Wasn't expecting this whatever it is: " + fline;\r
141 \r
142                         fline = sline;\r
143                 }\r
144                                 \r
145                 sc.close();\r
146                 if (nsc != null) nsc.close();\r
147                 \r
148                 return new RNAStructScoreManager(structs, data);\r
149         }\r
150         \r
151         // Just for the purpose of creating new TreeSet<Score> objects of length one\r
152         // for adding to a 'data' list to make a ScoreManager\r
153         private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {\r
154                 // first convert List<Float> to float[]\r
155                 float[] scoresf = new float[scores.size()];\r
156                 Float f;\r
157                 for (int i = 0; i < scoresf.length; i++) {\r
158                         f = scores.get(i);\r
159                         scoresf[i] = ( f != null ? f : Float.NaN);\r
160                 }\r
161                 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));\r
162         }\r
163 \r
164         // A method just for the purpose of neatly creating Almost Empty score objects\r
165         // that can't be null\r
166         public static TreeSet<Score> newEmptyScore(Enum<?> res) {\r
167                 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));\r
168         }\r
169 \r
170         public static RNAStructScoreManager readRNAStructStream(InputStream stdout, \r
171                         InputStream alifold) throws IOException {\r
172                 \r
173                 // The Lists required to construct a ScoreManager Using the new constructor\r
174                 List<String> structs;\r
175                 List<TreeSet<Score>> data; \r
176                 \r
177                 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)\r
178                 RNAStructScoreManager stdSM = readRNAStructStream(stdout);\r
179                 \r
180                 // Unpack this into the structs and data lists\r
181                 structs = stdSM.getStructs();\r
182                 data = stdSM.getData();\r
183                 \r
184                 // Now parse alifold.out\r
185                 Scanner sc = new Scanner(alifold);\r
186                 sc.useDelimiter("[\\s%]+");\r
187                 \r
188                 // jump two lines to the data \r
189                 sc.nextLine(); sc.nextLine();\r
190                 \r
191                 // Read the first, second and fourth columns. Ignoring everything else.\r
192                 // Allocate necessry data structures for creating Score objects\r
193                 ArrayList<Float> scores = new ArrayList<Float>();\r
194                 List<Range> rangeHolder = new ArrayList<Range>();\r
195                 String s = "null";\r
196                 while (true) {\r
197                         s = sc.next();\r
198                         if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;\r
199                         if (!sc.hasNextLine()) break;\r
200                         int t = sc.nextInt();\r
201                         rangeHolder.add(new Range(Integer.parseInt(s), t));\r
202                         sc.next();\r
203                         scores.add(sc.nextFloat());\r
204                         sc.nextLine();\r
205                 }\r
206                 sc.close();\r
207                 \r
208                 // Update the first ScoreHolder TreeSet<Score> element\r
209                 assert (rangeHolder.size() == scores.size());\r
210                 TreeSet<Score> sHolder = new TreeSet<Score>();\r
211                 for (int i = 0; i < rangeHolder.size(); i++) {\r
212                         ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));\r
213                         TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));\r
214                         sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));\r
215                 }\r
216                 \r
217                 data.set(0, sHolder);\r
218                 \r
219                 return new RNAStructScoreManager(structs, data);\r
220         }\r
221 \r
222         private static RNAOut identify(String token) {\r
223                 if (Pattern.matches(seqP, token)) {\r
224                         return RNAOut.SEQ;\r
225                 } else if (Pattern.matches(structP, token)) {\r
226                         return RNAOut.STRUCT;\r
227                 } else if (Pattern.matches(energyP, token)) {\r
228                         return RNAOut.ENERGY;\r
229                 } else if (Pattern.matches(freqP, token)) {\r
230                         return RNAOut.FREQ;\r
231                 }\r
232 \r
233                 return RNAOut.OTHER;\r
234         }\r
235         \r
236         private static AlifoldLine identifyLine(String line) {\r
237                 \r
238                 for (AlifoldLine il : AlifoldLine.values()) {\r
239                         if (Pattern.matches(il.regex, line)) return il;\r
240                 }\r
241                 return AlifoldLine.OTHER;\r
242         }\r
243         \r
244         static enum AlifoldLine {\r
245                 mfeStruct (mfeStructP),\r
246                 justStruct (justStructP),\r
247                 stochBTStruct (stochBTStructP),\r
248                 PStruct (PStructP),\r
249                 centStruct (centStructP),\r
250                 MEAStruct (MEAStructP),\r
251                 freeEnergy (freeEnergyP),\r
252                 ensembleFreq (ensembleFreqP),\r
253                 alignment (alignmentP), \r
254                 OTHER (".*");\r
255                 \r
256                 String regex;\r
257                 AlifoldLine(String regex) { this.regex = regex; }\r
258 \r
259         }\r
260         \r
261         //The types of data in an RNAalifold stdout file\r
262         static enum RNAOut {\r
263                 SEQ, STRUCT, ENERGY, FREQ, OTHER\r
264         }\r
265 \r
266         //Something to put in the Score objects of the alifold result which gives information\r
267         //about what kind of sequence it is holding in its String Id.\r
268 \r
269         public static enum AlifoldResult {\r
270                 mfeStructure, contactProbabilityStructure, MEAStucture, centroidStructure, stochBTStructure, consensusAlignment, ensembleValues, contactProbabilities\r
271         }\r
272 \r
273         // Print the full regex Strings for testing \r
274         public static void main(String[] args) {\r
275                 for (AlifoldLine l : AlifoldLine.values()) {\r
276                         System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));\r
277                 }\r
278         }\r
279 \r
280 }\r