1 package compbio.data.sequence;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.InputStream;
\r
5 import java.io.InputStreamReader;
\r
6 import java.io.IOException;
\r
7 import java.util.ArrayList;
\r
8 import java.util.Arrays;
\r
9 import java.util.List;
\r
10 import java.util.Scanner;
\r
11 import java.util.TreeSet;
\r
12 import java.util.regex.Pattern;
\r
14 import org.apache.log4j.Logger;
\r
16 import compbio.runner.structure.RNAalifold;
\r
18 // Utility class for reading alifold output
\r
20 public class RNAStructReader {
\r
// Log4j logger for this class.
22 private static Logger log = Logger.getLogger(RNAStructReader.class);
\r
24 // Whitespace / separator patterns
\r
// s: one or more characters drawn from '+', whitespace and '='. Used below as the
// separator between the fields of an output line.
25 static String s = "[+\\s=]+";
\r
// bracket: a single bracket character, one of ( ) { } [ ].
// NOTE(review): no use of this pattern is visible in this listing.
26 static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
\r
// notData: the same character set as s, written in a different order.
// NOTE(review): no use of this pattern is visible in this listing.
27 static String notData = "[\\s=+]+";
\r
29 // RNAOut data type patterns
\r
// seqP: two or more letters, underscores or hyphens.
30 static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output as well (not just ACGU_)
\r
// structP: two or more characters from . ) ( { } [ ] ,
31 static String structP = "[\\.)({}\\[\\],]{2,}";
\r
// floatP: optional minus, digits, a mandatory '.', optional digits, optional
// exponent of the form e+NN / e-NN.
32 static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
\r
// energyP: optional minus, any digits, an optional '.', then exactly two digits.
33 static String energyP = "-?[0-9]*\\.?[0-9]{2}";
\r
// freqP: anchored; optional minus, one digit, '.', six or more digits, optional exponent.
34 static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";
\r
36 // alifold out line patterns
\r
// ps: optional whitespace (zero or more whitespace characters).
37 static String ps = "\\s*";
\r
// alignmentP: a sequence token followed by optional whitespace.
38 static String alignmentP = "^"+seqP+ps+"$";
\r
// stdStructP: a structure, a separator, then three floats (separated by s)
// enclosed in parentheses.
39 static String stdStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
\r
// justStructP: a structure on its own, followed by optional whitespace.
40 static String justStructP = "^"+structP+ps+"$";
\r
// stochBTStructP: a structure followed by two floats.
41 static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
\r
// PStructP: a structure followed by one float enclosed in square brackets.
42 static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
\r
// centStructP: a structure, one float, then two floats enclosed in curly braces.
43 static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
\r
// MEAStructP: a structure, then "{ <float> MEA=<float> }".
44 static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
\r
// freeEnergyP: the literal text "free energy of ensemble = <float> kcal/mol".
45 static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
\r
// ensembleFreqP: the literal text "frequency of mfe structure in ensemble <float>".
46 static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";
\r
/**
 * Parses alifold standard output read from a stream into an RNAStructScoreManager.
 *
 * The first line is expected to be the alignment sequence and the second line the
 * consensus structure followed by three floats. Every later line is classified with
 * identifyLine and contributes a structure string (or a descriptive label) to
 * structs and a Score set to data.
 *
 * All format expectations are checked with assert statements only, so they are
 * enforced only when the JVM runs with assertions enabled.
 *
 * NOTE(review): some original lines are not visible in this listing, among them
 * the closing braces, the declarations of sline and nsc, and the statement that
 * advances fline at the end of each loop iteration (presumably fline = sline).
 * Confirm against the full file before relying on the comments below.
 *
 * NOTE(review): no null check on the first two lines is visible; with an empty
 * stream fline.trim() would throw a NullPointerException.
 *
 * @param stdout stream carrying the alifold stdout text
 * @return a score manager built from the collected structure strings and score sets
 * @throws IOException if reading a line from the stream fails
 */
48 public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
\r
49 throws IOException {
\r
// Prefix shared by all assertion messages in this method.
51 String error = "Error in parsing alifold stdout file: ";
\r
52 // The Lists required to construct a ScoreManager Using the new constructor
\r
53 List<String> structs = new ArrayList<String>();
\r
54 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();
\r
56 // Allocate necessary data structures for creating Score objects
\r
// NOTE(review): this list is reused for every score set; the statement that
// empties it between lines is not visible here (presumably at the top of the loop).
57 ArrayList<Float> scores = new ArrayList<Float>();
\r
59 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));
\r
60 // The first 2 lines of the alifold stdout file are always the same format
\r
61 String fline = reader.readLine();
\r
62 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
\r
63 error + "Sequence Alignment Expected";
\r
// Line 1: the alignment sequence, stored with an empty score.
64 structs.add(fline.trim());
\r
65 data.add(newEmptyScore(AlifoldResult.alifoldSeq));
\r
67 fline = reader.readLine();
\r
68 assert (Pattern.matches(AlifoldLine.stdStruct.regex, fline)) :
\r
69 error + "Consensus Structure and Energy Expected";
\r
// Line 2: first token is the structure, followed by three floats.
70 Scanner sc = new Scanner(fline);
\r
71 structs.add(sc.next());
\r
72 for (int i = 0; i < 3; i++) {
\r
// findInLine returns null when no float is found, which parseFloat rejects.
73 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
75 data.add(newSetScore(AlifoldResult.alifold, scores));
\r
77 // Now the alifold stdout file formats diverge based on arguments
\r
78 fline = reader.readLine();
\r
81 while ( fline != null) {
\r
83 AlifoldLine ftype = identifyLine(fline);
\r
84 sline = reader.readLine(); // Look ahead
\r
85 sc = new Scanner(fline);
\r
// When there is no look-ahead line, nsc is left unchanged.
86 if (sline != null) nsc = new Scanner(sline);
\r
88 if (ftype.equals(AlifoldLine.PStruct)) {
\r
89 // The -p or --MEA option is specified
\r
90 // The next line should always be frequency of mfe structure
\r
91 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
92 error + "Expected frequency of mfe structure";
\r
// One float from the current line and one from the look-ahead line form the score.
93 structs.add(sc.next());
\r
94 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
95 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
96 data.add(newSetScore(AlifoldResult.alifoldP, scores));
\r
// The look-ahead line has been consumed, so read a fresh one.
98 sline = reader.readLine();
\r
100 else if (ftype.equals(AlifoldLine.centStruct)) {
\r
// Centroid line: structure token plus three floats.
101 structs.add(sc.next());
\r
102 for (int i = 0; i < 3; i++) {
\r
103 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
105 data.add(newSetScore(AlifoldResult.alifoldCentroid, scores));
\r
107 else if (ftype.equals(AlifoldLine.MEAStruct)) {
\r
// MEA line: structure token plus two floats.
108 structs.add(sc.next());
\r
109 for (int i = 0; i < 2; i++) {
\r
110 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
112 data.add(newSetScore(AlifoldResult.alifoldMEA, scores));
\r
114 else if (ftype.equals(AlifoldLine.justStruct)) {
\r
// Structure with no numbers: stored with an empty alifoldStochBT score.
115 structs.add(sc.next());
\r
116 data.add(newEmptyScore(AlifoldResult.alifoldStochBT));
\r
118 else if (ftype.equals(AlifoldLine.stochBTStruct)) {
\r
// Structure followed by two floats, read as whitespace-delimited tokens.
119 structs.add(sc.next());
\r
120 scores.add(sc.nextFloat());
\r
121 scores.add(sc.nextFloat());
\r
122 data.add(newSetScore(AlifoldResult.alifoldStochBT, scores));
\r
124 else if (ftype.equals(AlifoldLine.freeEnergy)) {
\r
125 assert (sline != null
\r
126 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
127 error + "Found 'freeEnergy' line on its own";
\r
// No structure on these lines: a descriptive label is stored in its place.
128 structs.add("Free energy of ensemble (kcal/mol) followed by "
\r
129 + "frequency of mfe structure in ensemble");
\r
130 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
131 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
132 data.add(newSetScore(AlifoldResult.alifoldMetadata, scores));
\r
// The look-ahead line has been consumed, so read a fresh one.
134 sline = reader.readLine();
\r
// Line types that are not handled by any branch above are rejected by assertion.
138 assert(!ftype.equals(AlifoldLine.ensembleFreq)) :
\r
139 error + "Wasn't expecting 'frequency of mfe structure'!";
\r
140 assert(!ftype.equals(AlifoldLine.stdStruct)) :
\r
141 error + "'Standard output' line at a place other than line 2!";
\r
142 assert(!ftype.equals(AlifoldLine.alignment)) :
\r
143 error + "Wasn't expecting an alignment sequence!";
\r
144 assert(!ftype.equals(AlifoldLine.OTHER)) :
\r
145 error + "Wasn't expecting this whatever it is: " + fline;
\r
// Blank or whitespace-only lines are logged as a warning.
146 if (Pattern.matches("^\\s*$", fline)) {
\r
147 log.warn("While parsing alifold stdout: A line is either empty or"
\r
148 + " contains only whitespace");
\r
// NOTE(review): only nsc is visibly closed; no close of sc or reader appears in this listing.
155 if (nsc != null) nsc.close();
\r
157 return new RNAStructScoreManager(structs, data);
\r
160 // Just for the purpose of creating new TreeSet<Score> objects of length one
\r
161 // for adding to a 'data' list to make a ScoreManager
\r
// Copies the given scores into a float[] and wraps one Score(res, float[]) in a
// TreeSet.
// NOTE(review): the declaration of f is not visible in this listing; presumably
// it is scores.get(i), so that null entries become Float.NaN -- confirm.
162 private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
\r
163 // first convert List<Float> to float[]
\r
164 float[] scoresf = new float[scores.size()];
\r
166 for (int i = 0; i < scoresf.length; i++) {
\r
// A null f is stored as NaN instead of being unboxed.
168 scoresf[i] = ( f != null ? f : Float.NaN);
\r
170 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));
\r
173 // A method just for the purpose of neatly creating Almost Empty score objects
\r
174 // that can't be null
\r
// Returns a TreeSet holding a single Score built from res and a zero-length
// float array.
175 public static TreeSet<Score> newEmptyScore(Enum<?> res) {
\r
176 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));
\r
/**
 * Parses alifold standard output together with a second stream (alifold.out) and
 * returns a combined RNAStructScoreManager.
 *
 * The stdout stream is parsed by the single-argument readRNAStructStream. The
 * second stream is then scanned for integer pairs and a float per entry, which
 * become Range/score pairs; these replace element 0 of the data list.
 *
 * NOTE(review): the loop header and the statement that reads the token s are not
 * visible in this listing, nor are the closing braces. Presumably s is a local
 * String read with sc.next() inside the loop; if it were not, the name would
 * refer to the static separator pattern s. Confirm against the full file.
 *
 * @param stdout stream carrying the alifold stdout text
 * @param alifold stream carrying the alifold.out text
 * @return a score manager built from the stdout results with its first score set
 *         replaced by the values read from alifold
 * @throws IOException if reading stdout fails
 */
179 public static RNAStructScoreManager readRNAStructStream(InputStream stdout,
\r
180 InputStream alifold) throws IOException {
\r
182 // The Lists required to construct a ScoreManager Using the new constructor
\r
183 List<String> structs;
\r
184 List<TreeSet<Score>> data;
\r
186 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)
\r
187 RNAStructScoreManager stdSM = readRNAStructStream(stdout);
\r
189 // Unpack this into the structs and data lists
\r
190 structs = stdSM.getStructs();
\r
191 data = stdSM.getData();
\r
193 // Now parse alifold.out
\r
194 Scanner sc = new Scanner(alifold);
\r
// Tokens are separated by runs of whitespace and/or '%' characters.
195 sc.useDelimiter("[\\s%]+");
\r
197 // jump two lines to the data
\r
198 sc.nextLine(); sc.nextLine();
\r
200 // Read the first, second and fourth columns. Ignoring everything else.
\r
201 // Allocate necessary data structures for creating Score objects
\r
202 ArrayList<Float> scores = new ArrayList<Float>();
\r
203 List<Range> rangeHolder = new ArrayList<Range>();
\r
// Stop when the token looks like a dot-bracket structure (two or more of . ( )).
207 if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;
\r
208 if (!sc.hasNextLine()) break;
\r
// s and the next integer form the two ends of a Range.
209 int t = sc.nextInt();
\r
210 rangeHolder.add(new Range(Integer.parseInt(s), t));
\r
212 scores.add(sc.nextFloat());
\r
217 // Update the first ScoreHolder TreeSet<Score> element
\r
218 assert (rangeHolder.size() == scores.size());
\r
219 TreeSet<Score> sHolder = new TreeSet<Score>();
\r
// Build one Score per (range, score) pair, each holding a single value and a single range.
220 for (int i = 0; i < rangeHolder.size(); i++) {
\r
221 ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));
\r
222 TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));
\r
223 sHolder.add(new Score(AlifoldResult.alifoldSeq, singleS, singleR));
\r
// Replace the score set at index 0 with the newly built one.
226 data.set(0, sHolder);
\r
228 return new RNAStructScoreManager(structs, data);
\r
// Classifies a single token by testing it against seqP, structP, energyP and
// freqP in that order; the first full match wins and RNAOut.OTHER is returned
// when none match.
// NOTE(review): the return statement of the seqP branch is not visible in this
// listing (presumably RNAOut.SEQ) -- confirm.
231 private static RNAOut identify(String token) {
\r
232 if (Pattern.matches(seqP, token)) {
\r
234 } else if (Pattern.matches(structP, token)) {
\r
235 return RNAOut.STRUCT;
\r
236 } else if (Pattern.matches(energyP, token)) {
\r
237 return RNAOut.ENERGY;
\r
238 } else if (Pattern.matches(freqP, token)) {
\r
239 return RNAOut.FREQ;
\r
242 return RNAOut.OTHER;
\r
// Classifies a whole output line: returns the first AlifoldLine constant, in
// declaration order, whose regex matches the entire line, or AlifoldLine.OTHER
// when the loop finds no match.
245 private static AlifoldLine identifyLine(String line) {
\r
247 for (AlifoldLine il : AlifoldLine.values()) {
\r
248 if (Pattern.matches(il.regex, line)) return il;
\r
250 return AlifoldLine.OTHER;
\r
// The kinds of line recognised in alifold stdout. Each constant carries the
// regular expression (one of the *P strings of this class) used to recognise it.
// identifyLine tests the constants in declaration order, so the order here
// decides which constant wins when more than one regex matches.
// NOTE(review): the OTHER constant and the declaration of the regex field are
// referenced elsewhere in the file but are not visible in this listing.
253 static enum AlifoldLine {
\r
254 stdStruct (stdStructP),
\r
255 justStruct (justStructP),
\r
256 stochBTStruct (stochBTStructP),
\r
257 PStruct (PStructP),
\r
258 centStruct (centStructP),
\r
259 MEAStruct (MEAStructP),
\r
260 freeEnergy (freeEnergyP),
\r
261 ensembleFreq (ensembleFreqP),
\r
262 alignment (alignmentP),
\r
// Stores the pattern string for this line type.
266 AlifoldLine(String regex) { this.regex = regex; }
\r
270 //The types of data in an RNAalifold stdout file
\r
// Returned by identify(String) to label a single token.
271 static enum RNAOut {
\r
272 SEQ, STRUCT, ENERGY, FREQ, OTHER
\r
275 //Something to put in the Score objects of the alifold result which gives information
\r
276 //about what kind of sequence it is holding in its String Id.
\r
// The readRNAStructStream methods pass one of these constants to every Score
// they create, according to the type of line the score was parsed from.
277 static enum AlifoldResult {
\r
278 alifold, alifoldP, alifoldMEA, alifoldCentroid, alifoldStochBT, alifoldSeq, alifoldMetadata
\r
283 // Print the full regex Strings for testing
\r
// Writes one line per AlifoldLine constant to standard output: the constant's
// name, a colon, and its regex with every '^' and '$' character removed.
// The args parameter is not used.
284 public static void main(String[] args) {
\r
285 for (AlifoldLine l : AlifoldLine.values()) {
\r
286 System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));
\r