1 package compbio.data.sequence;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.InputStream;
\r
5 import java.io.InputStreamReader;
\r
6 import java.io.IOException;
\r
7 import java.util.ArrayList;
\r
8 import java.util.Arrays;
\r
9 import java.util.List;
\r
10 import java.util.Scanner;
\r
11 import java.util.TreeSet;
\r
12 import java.util.regex.Pattern;
\r
14 import org.apache.log4j.Logger;
\r
16 // Utility class for reading alifold output
\r
18 public class RNAStructReader {
\r
20 private static Logger log = Logger.getLogger(RNAStructReader.class);
\r
22 // Whitespace patterns
\r
23 static String s = "[+\\s=]+";
\r
24 static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
\r
25 static String notData = "[\\s=+]+";
\r
27 // RNAOut data type patterns
\r
28 static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output aswell (not just ACGU_)
\r
29 static String structP = "[\\.)({}\\[\\],]{2,}";
\r
30 static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
\r
31 static String energyP = "-?[0-9]*\\.?[0-9]{2}";
\r
32 static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";
\r
34 // alifold out line patterns
\r
35 static String ps = "\\s*";
\r
36 static String alignmentP = "^"+seqP+ps+"$";
\r
37 static String stdStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
\r
38 static String justStructP = "^"+structP+ps+"$";
\r
39 static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
\r
40 static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
\r
41 static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
\r
42 static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
\r
43 static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
\r
44 static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";
\r
46 public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
\r
47 throws IOException {
\r
49 String error = "Error in parsing alifold stdout file: ";
\r
50 // The Lists required to construct a ScoreManager Using the new constructor
\r
51 List<String> structs = new ArrayList<String>();
\r
52 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();
\r
54 // Allocate necessry data structures for creating Score objects
\r
55 ArrayList<Float> scores = new ArrayList<Float>();
\r
57 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));
\r
58 // The first 2 lines of the alifold stdout file are always the same format
\r
59 String fline = reader.readLine();
\r
60 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
\r
61 error + "Sequence Alignment Expected";
\r
62 structs.add(fline.trim());
\r
63 data.add(newEmptyScore(AlifoldResult.alifoldSeq));
\r
65 fline = reader.readLine();
\r
66 assert (Pattern.matches(AlifoldLine.stdStruct.regex, fline)) :
\r
67 error + "Consensus Structure and Energy Expected";
\r
68 Scanner sc = new Scanner(fline);
\r
69 structs.add(sc.next());
\r
70 for (int i = 0; i < 3; i++) {
\r
71 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
73 data.add(newSetScore(AlifoldResult.alifold, scores));
\r
75 // Now the alifold stdout file formats diverge based on arguments
\r
76 fline = reader.readLine();
\r
79 while ( fline != null) {
\r
81 AlifoldLine ftype = identifyLine(fline);
\r
82 sline = reader.readLine(); // Look ahead
\r
83 sc = new Scanner(fline);
\r
84 if (sline != null) nsc = new Scanner(sline);
\r
86 if (ftype.equals(AlifoldLine.PStruct)) {
\r
87 // The -p or --MEA option is specified
\r
88 // The next line should always be frequency of mfe structure
\r
89 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
90 error + "Expected frequency of mfe structure";
\r
91 structs.add(sc.next());
\r
92 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
93 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
94 data.add(newSetScore(AlifoldResult.alifoldP, scores));
\r
96 sline = reader.readLine();
\r
98 else if (ftype.equals(AlifoldLine.centStruct)) {
\r
99 structs.add(sc.next());
\r
100 for (int i = 0; i < 3; i++) {
\r
101 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
103 data.add(newSetScore(AlifoldResult.alifoldCentroid, scores));
\r
105 else if (ftype.equals(AlifoldLine.MEAStruct)) {
\r
106 structs.add(sc.next());
\r
107 for (int i = 0; i < 2; i++) {
\r
108 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
110 data.add(newSetScore(AlifoldResult.alifoldMEA, scores));
\r
112 else if (ftype.equals(AlifoldLine.justStruct)) {
\r
113 structs.add(sc.next());
\r
114 data.add(newEmptyScore(AlifoldResult.alifoldStochBT));
\r
116 else if (ftype.equals(AlifoldLine.stochBTStruct)) {
\r
117 structs.add(sc.next());
\r
118 scores.add(sc.nextFloat());
\r
119 scores.add(sc.nextFloat());
\r
120 data.add(newSetScore(AlifoldResult.alifoldStochBT, scores));
\r
122 else if (ftype.equals(AlifoldLine.freeEnergy)) {
\r
123 assert (sline != null
\r
124 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
125 error + "Found 'freeEnergy' line on its own";
\r
126 structs.add("Free energy of ensemble (kcal/mol) followed by "
\r
127 + "frequency of mfe structure in ensemble");
\r
128 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
129 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
130 data.add(newSetScore(AlifoldResult.alifoldMetadata, scores));
\r
132 sline = reader.readLine();
\r
136 assert(!ftype.equals(AlifoldLine.ensembleFreq)) :
\r
137 error + "Wasn't expecting 'frequency of mfe structure'!";
\r
138 assert(!ftype.equals(AlifoldLine.stdStruct)) :
\r
139 error + "'Standard output' line at a place other than line 2!";
\r
140 assert(!ftype.equals(AlifoldLine.alignment)) :
\r
141 error + "Wasn't expecting an alignment sequence!";
\r
142 assert(!ftype.equals(AlifoldLine.OTHER)) :
\r
143 error + "Wasn't expecting this whatever it is: " + fline;
\r
144 if (Pattern.matches("^\\s*$", fline)) {
\r
145 log.warn("While parsing alifold stdout: A line is either empty or"
\r
146 + " contains only whitespace");
\r
153 if (nsc != null) nsc.close();
\r
155 return new RNAStructScoreManager(structs, data);
\r
158 // Just for the purpose of creating new TreeSet<Score> objects of length one
\r
159 // for adding to a 'data' list to make a ScoreManager
\r
160 private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
\r
161 // first convert List<Float> to float[]
\r
162 float[] scoresf = new float[scores.size()];
\r
164 for (int i = 0; i < scoresf.length; i++) {
\r
166 scoresf[i] = ( f != null ? f : Float.NaN);
\r
168 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));
\r
171 // A method just for the purpose of neatly creating Almost Empty score objects
\r
172 // that can't be null
\r
173 public static TreeSet<Score> newEmptyScore(Enum<?> res) {
\r
174 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));
\r
177 public static RNAStructScoreManager readRNAStructStream(InputStream stdout,
\r
178 InputStream alifold) throws IOException {
\r
180 // The Lists required to construct a ScoreManager Using the new constructor
\r
181 List<String> structs;
\r
182 List<TreeSet<Score>> data;
\r
184 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)
\r
185 RNAStructScoreManager stdSM = readRNAStructStream(stdout);
\r
187 // Unpack this into the structs and data lists
\r
188 structs = stdSM.getStructs();
\r
189 data = stdSM.getData();
\r
191 // Now parse alifold.out
\r
192 Scanner sc = new Scanner(alifold);
\r
193 sc.useDelimiter("[\\s%]+");
\r
195 // jump two lines to the data
\r
196 sc.nextLine(); sc.nextLine();
\r
198 // Read the first, second and fourth columns. Ignoring everything else.
\r
199 // Allocate necessry data structures for creating Score objects
\r
200 ArrayList<Float> scores = new ArrayList<Float>();
\r
201 List<Range> rangeHolder = new ArrayList<Range>();
\r
205 if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;
\r
206 if (!sc.hasNextLine()) break;
\r
207 int t = sc.nextInt();
\r
208 rangeHolder.add(new Range(Integer.parseInt(s), t));
\r
210 scores.add(sc.nextFloat());
\r
215 // Update the first ScoreHolder TreeSet<Score> element
\r
216 assert (rangeHolder.size() == scores.size());
\r
217 TreeSet<Score> sHolder = new TreeSet<Score>();
\r
218 for (int i = 0; i < rangeHolder.size(); i++) {
\r
219 ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));
\r
220 TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));
\r
221 sHolder.add(new Score(AlifoldResult.alifoldSeq, singleS, singleR));
\r
224 data.set(0, sHolder);
\r
226 return new RNAStructScoreManager(structs, data);
\r
229 private static RNAOut identify(String token) {
\r
230 if (Pattern.matches(seqP, token)) {
\r
232 } else if (Pattern.matches(structP, token)) {
\r
233 return RNAOut.STRUCT;
\r
234 } else if (Pattern.matches(energyP, token)) {
\r
235 return RNAOut.ENERGY;
\r
236 } else if (Pattern.matches(freqP, token)) {
\r
237 return RNAOut.FREQ;
\r
240 return RNAOut.OTHER;
\r
243 private static AlifoldLine identifyLine(String line) {
\r
245 for (AlifoldLine il : AlifoldLine.values()) {
\r
246 if (Pattern.matches(il.regex, line)) return il;
\r
248 return AlifoldLine.OTHER;
\r
251 static enum AlifoldLine {
\r
252 stdStruct (stdStructP),
\r
253 justStruct (justStructP),
\r
254 stochBTStruct (stochBTStructP),
\r
255 PStruct (PStructP),
\r
256 centStruct (centStructP),
\r
257 MEAStruct (MEAStructP),
\r
258 freeEnergy (freeEnergyP),
\r
259 ensembleFreq (ensembleFreqP),
\r
260 alignment (alignmentP),
\r
264 AlifoldLine(String regex) { this.regex = regex; }
\r
268 //The types of data in an RNAalifold stdout file
\r
269 static enum RNAOut {
\r
270 SEQ, STRUCT, ENERGY, FREQ, OTHER
\r
273 //Something to put in the Score objects of the alifold result which gives information
\r
274 //about what kind of sequence it is holding in its String Id.
\r
275 static enum AlifoldResult {
\r
276 alifold, alifoldP, alifoldMEA, alifoldCentroid, alifoldStochBT, alifoldSeq, alifoldMetadata
\r
281 // Print the full regex Strings for testing
\r
282 public static void main(String[] args) {
\r
283 for (AlifoldLine l : AlifoldLine.values()) {
\r
284 System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));
\r