Category.java updated to include new category for RNA folding. RNAalifoldParameters...
[jabaws.git] / datamodel / compbio / data / sequence / RNAStructReader.java
1 package compbio.data.sequence;\r
2 \r
3 import java.io.BufferedReader;\r
4 import java.io.InputStream;\r
5 import java.io.InputStreamReader;\r
6 import java.io.IOException;\r
7 import java.util.ArrayList;\r
8 import java.util.Arrays;\r
9 import java.util.List;\r
10 import java.util.Scanner;\r
11 import java.util.TreeSet;\r
12 import java.util.regex.Pattern;\r
13 \r
14 import org.apache.log4j.Logger;\r
15 \r
16 // Utility class for reading alifold output\r
17 \r
18 public class RNAStructReader {\r
19 \r
20         private static Logger log = Logger.getLogger(RNAStructReader.class);\r
21         \r
22         // Whitespace patterns\r
23         static String s = "[+\\s=]+";\r
24         static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";\r
25         static String notData = "[\\s=+]+";\r
26 \r
27         // RNAOut data type patterns \r
28         static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output aswell (not just ACGU_)\r
29         static String structP = "[\\.)({}\\[\\],]{2,}";\r
30         static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";\r
31         static String energyP = "-?[0-9]*\\.?[0-9]{2}";\r
32         static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";\r
33         \r
34         // alifold out line patterns\r
35         static String ps = "\\s*";\r
36         static String alignmentP = "^"+seqP+ps+"$";\r
37         static String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";\r
38         static String justStructP = "^"+structP+ps+"$";\r
39         static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";\r
40         static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";\r
41         static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";\r
42         static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";\r
43         static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";\r
44         static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";\r
45         \r
46         \r
47         public static RNAStructScoreManager readRNAStructStream(InputStream stdout)\r
48                         throws IOException {\r
49                 \r
50                 String error = "Error in parsing alifold stdout file: ";\r
51                 // The Lists required to construct a ScoreManager Using the new constructor\r
52                 List<String> structs = new ArrayList<String>();\r
53                 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();\r
54 \r
55                 // Allocate necessry data structures for creating Score objects\r
56                 ArrayList<Float> scores = new ArrayList<Float>();\r
57 \r
58                 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));\r
59                 // The first 2 lines of the alifold stdout file are always the same format\r
60                 String fline = reader.readLine();\r
61                 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :\r
62                         error + "Sequence Alignment Expected";\r
63                 structs.add(fline.trim());\r
64                 data.add(newEmptyScore(AlifoldResult.consensusAlignment));\r
65                 \r
66                 fline = reader.readLine();\r
67                 assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :\r
68                         error + "Consensus Structure and Energy Expected";\r
69                 Scanner sc = new Scanner(fline);\r
70                 structs.add(sc.next());\r
71                 for (int i = 0; i < 3; i++) {\r
72                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
73                 }\r
74                 data.add(newSetScore(AlifoldResult.mfeStructure, scores));\r
75                 \r
76                 // Now the alifold stdout file formats diverge based on arguments\r
77                 fline = reader.readLine();\r
78                 String sline;\r
79                 Scanner nsc = null;\r
80                 while ( fline != null) {\r
81                         scores.clear();\r
82                         AlifoldLine ftype = identifyLine(fline);\r
83                         sline = reader.readLine(); // Look ahead\r
84                         sc = new Scanner(fline);\r
85                         if (sline != null) nsc = new Scanner(sline);\r
86 \r
87                         if (ftype.equals(AlifoldLine.PStruct)) {\r
88                                 // The -p or --MEA option is specified\r
89                                 // The next line should always be frequency of mfe structure\r
90                                 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
91                                         error + "Expected frequency of mfe structure";\r
92                                 structs.add(sc.next());\r
93                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
94                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
95                                 data.add(newSetScore(AlifoldResult.contactProbabilityStructure, scores));\r
96                                 // Jump line\r
97                                 sline = reader.readLine();\r
98                         }\r
99                         else if (ftype.equals(AlifoldLine.centStruct)) {\r
100                                 structs.add(sc.next());\r
101                                 for (int i = 0; i < 3; i++) {\r
102                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
103                                 }\r
104                                 data.add(newSetScore(AlifoldResult.centroidStructure, scores));\r
105                         }\r
106                         else if (ftype.equals(AlifoldLine.MEAStruct)) {\r
107                                 structs.add(sc.next());\r
108                                 for (int i = 0; i < 2; i++) {\r
109                                         scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
110                                 }\r
111                                 data.add(newSetScore(AlifoldResult.MEAStucture, scores));\r
112                         }\r
113                         else if (ftype.equals(AlifoldLine.justStruct)) {\r
114                                 structs.add(sc.next());\r
115                                 data.add(newEmptyScore(AlifoldResult.stochBTStructure));\r
116                         }\r
117                         else if (ftype.equals(AlifoldLine.stochBTStruct)) {\r
118                                 structs.add(sc.next());\r
119                                 scores.add(sc.nextFloat());\r
120                                 scores.add(sc.nextFloat());\r
121                                 data.add(newSetScore(AlifoldResult.stochBTStructure, scores));\r
122                         }\r
123                         else if (ftype.equals(AlifoldLine.freeEnergy)) {\r
124                                 assert (sline != null \r
125                                                 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :\r
126                                                 error + "Found 'freeEnergy' line on its own";\r
127                                 structs.add("Free energy of ensemble (kcal/mol) followed by "\r
128                                                 + "frequency of mfe structure in ensemble");\r
129                                 scores.add(Float.parseFloat(sc.findInLine(floatP)));\r
130                                 scores.add(Float.parseFloat(nsc.findInLine(floatP)));\r
131                                 data.add(newSetScore(AlifoldResult.ensembleValues, scores));\r
132                                 // jump line\r
133                                 sline = reader.readLine();\r
134                         }\r
135                         \r
136 \r
137                         assert(!ftype.equals(AlifoldLine.ensembleFreq)) :\r
138                                 error + "Wasn't expecting 'frequency of mfe structure'!";\r
139                         assert(!ftype.equals(AlifoldLine.mfeStruct)) :\r
140                                 error + "'Standard output' line at a place other than line 2!";\r
141                         assert(!ftype.equals(AlifoldLine.alignment)) :\r
142                                 error + "Wasn't expecting an alignment sequence!";\r
143                         assert(!ftype.equals(AlifoldLine.OTHER)) :\r
144                                 error + "Wasn't expecting this whatever it is: " + fline;\r
145                         if (Pattern.matches("^\\s*$", fline)) {\r
146                                 log.warn("While parsing alifold stdout: A line is either empty or"\r
147                                                 + " contains only whitespace");\r
148                         }\r
149                         \r
150                         fline = sline;\r
151                 }\r
152                                 \r
153                 sc.close();\r
154                 if (nsc != null) nsc.close();\r
155                 \r
156                 return new RNAStructScoreManager(structs, data);\r
157         }\r
158         \r
159         // Just for the purpose of creating new TreeSet<Score> objects of length one\r
160         // for adding to a 'data' list to make a ScoreManager\r
161         private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {\r
162                 // first convert List<Float> to float[]\r
163                 float[] scoresf = new float[scores.size()];\r
164                 Float f;\r
165                 for (int i = 0; i < scoresf.length; i++) {\r
166                         f = scores.get(i);\r
167                         scoresf[i] = ( f != null ? f : Float.NaN);\r
168                 }\r
169                 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));\r
170         }\r
171 \r
172         // A method just for the purpose of neatly creating Almost Empty score objects\r
173         // that can't be null\r
174         public static TreeSet<Score> newEmptyScore(Enum<?> res) {\r
175                 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));\r
176         }\r
177 \r
178         public static RNAStructScoreManager readRNAStructStream(InputStream stdout, \r
179                         InputStream alifold) throws IOException {\r
180                 \r
181                 // The Lists required to construct a ScoreManager Using the new constructor\r
182                 List<String> structs;\r
183                 List<TreeSet<Score>> data; \r
184                 \r
185                 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)\r
186                 RNAStructScoreManager stdSM = readRNAStructStream(stdout);\r
187                 \r
188                 // Unpack this into the structs and data lists\r
189                 structs = stdSM.getStructs();\r
190                 data = stdSM.getData();\r
191                 \r
192                 // Now parse alifold.out\r
193                 Scanner sc = new Scanner(alifold);\r
194                 sc.useDelimiter("[\\s%]+");\r
195                 \r
196                 // jump two lines to the data \r
197                 sc.nextLine(); sc.nextLine();\r
198                 \r
199                 // Read the first, second and fourth columns. Ignoring everything else.\r
200                 // Allocate necessry data structures for creating Score objects\r
201                 ArrayList<Float> scores = new ArrayList<Float>();\r
202                 List<Range> rangeHolder = new ArrayList<Range>();\r
203                 String s = "null";\r
204                 while (true) {\r
205                         s = sc.next();\r
206                         if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;\r
207                         if (!sc.hasNextLine()) break;\r
208                         int t = sc.nextInt();\r
209                         rangeHolder.add(new Range(Integer.parseInt(s), t));\r
210                         sc.next();\r
211                         scores.add(sc.nextFloat());\r
212                         sc.nextLine();\r
213                 }\r
214                 sc.close();\r
215                 \r
216                 // Update the first ScoreHolder TreeSet<Score> element\r
217                 assert (rangeHolder.size() == scores.size());\r
218                 TreeSet<Score> sHolder = new TreeSet<Score>();\r
219                 for (int i = 0; i < rangeHolder.size(); i++) {\r
220                         ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));\r
221                         TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));\r
222                         sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));\r
223                 }\r
224                 \r
225                 data.set(0, sHolder);\r
226                 \r
227                 return new RNAStructScoreManager(structs, data);\r
228         }\r
229 \r
230         private static RNAOut identify(String token) {\r
231                 if (Pattern.matches(seqP, token)) {\r
232                         return RNAOut.SEQ;\r
233                 } else if (Pattern.matches(structP, token)) {\r
234                         return RNAOut.STRUCT;\r
235                 } else if (Pattern.matches(energyP, token)) {\r
236                         return RNAOut.ENERGY;\r
237                 } else if (Pattern.matches(freqP, token)) {\r
238                         return RNAOut.FREQ;\r
239                 }\r
240                 \r
241                 return RNAOut.OTHER;\r
242         }\r
243         \r
244         private static AlifoldLine identifyLine(String line) {\r
245                 \r
246                 for (AlifoldLine il : AlifoldLine.values()) {\r
247                         if (Pattern.matches(il.regex, line)) return il;\r
248                 }\r
249                 return AlifoldLine.OTHER;\r
250         }\r
251         \r
252         static enum AlifoldLine {\r
253                 mfeStruct (mfeStructP),\r
254                 justStruct (justStructP),\r
255                 stochBTStruct (stochBTStructP),\r
256                 PStruct (PStructP),\r
257                 centStruct (centStructP),\r
258                 MEAStruct (MEAStructP),\r
259                 freeEnergy (freeEnergyP),\r
260                 ensembleFreq (ensembleFreqP),\r
261                 alignment (alignmentP), \r
262                 OTHER (".*");\r
263                 \r
264                 String regex;\r
265                 AlifoldLine(String regex) { this.regex = regex; }\r
266 \r
267         }\r
268         \r
269         //The types of data in an RNAalifold stdout file\r
270         static enum RNAOut {\r
271                 SEQ, STRUCT, ENERGY, FREQ, OTHER\r
272         }\r
273 \r
274         //Something to put in the Score objects of the alifold result which gives information\r
275         //about what kind of sequence it is holding in its String Id.\r
276 \r
277         public static enum AlifoldResult {\r
278                 mfeStructure, contactProbabilityStructure, MEAStucture, centroidStructure, stochBTStructure, consensusAlignment, ensembleValues, contactProbabilities\r
279         }\r
280         \r
281 \r
282         // Print the full regex Strings for testing \r
283         public static void main(String[] args) {\r
284                 for (AlifoldLine l : AlifoldLine.values()) {\r
285                         System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));\r
286                 }\r
287         }\r
288         \r
289 \r
290         \r
291 }       \r