1 package compbio.data.sequence;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.InputStream;
\r
5 import java.io.InputStreamReader;
\r
6 import java.io.IOException;
\r
7 import java.util.ArrayList;
\r
8 import java.util.Arrays;
\r
9 import java.util.List;
\r
10 import java.util.Scanner;
\r
11 import java.util.TreeSet;
\r
12 import java.util.regex.Pattern;
\r
14 import org.apache.log4j.Logger;
\r
16 // Utility class for reading alifold output
\r
18 public class RNAStructReader {
\r
20 private static Logger log = Logger.getLogger(RNAStructReader.class);
\r
22 // Whitespace patterns
\r
23 static String s = "[+\\s=]+";
\r
24 static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
\r
25 static String notData = "[\\s=+]+";
\r
27 // RNAOut data type patterns
\r
28 static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output aswell (not just ACGU_)
\r
29 static String structP = "[\\.)({}\\[\\],]{2,}";
\r
30 static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
\r
31 static String energyP = "-?[0-9]*\\.?[0-9]{2}";
\r
32 static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";
\r
34 // alifold out line patterns
\r
35 static String ps = "\\s*";
\r
36 static String alignmentP = "^"+seqP+ps+"$";
\r
37 static String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
\r
38 static String justStructP = "^"+structP+ps+"$";
\r
39 static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
\r
40 static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
\r
41 static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
\r
42 static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
\r
43 static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
\r
44 static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";
\r
/**
 * Parses the standard output of an alifold run into an RNAStructScoreManager.
 *
 * The stream is read line by line. Line 1 is expected to be the consensus
 * alignment and line 2 a structure followed by three numbers; every later
 * line is classified with identifyLine() and handled according to its type.
 * Each recognised record (one or two lines) appends one entry to 'structs'
 * and one score set to 'data', so the two lists stay index-aligned.
 *
 * Format expectations are expressed with 'assert' only, so they are checked
 * only when the JVM runs with assertions enabled.
 *
 * NOTE(review): several lines of this method are absent from this listing
 * (for example the declarations of 'sline' and 'nsc', the closing braces and
 * whatever advances 'fline' at the end of the loop) - confirm the details
 * against the complete source.
 * NOTE(review): 'scores' is one list shared by every record handled below and
 * no statement emptying it is visible here - confirm it is cleared between
 * records.
 * NOTE(review): no close() of 'reader' is visible here.
 *
 * @param stdout stream holding the alifold standard output
 * @return a score manager built from the collected structures and score sets
 * @throws IOException if reading from the stream fails
 */
47 public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
\r
48 throws IOException {
\r
// Common prefix of every assertion message below
50 String error = "Error in parsing alifold stdout file: ";
\r
51 // The parallel lists required to construct a ScoreManager using the new constructor
\r
52 List<String> structs = new ArrayList<String>();
\r
53 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();
\r
55 // Allocate necessary data structures for creating Score objects
\r
56 ArrayList<Float> scores = new ArrayList<Float>();
\r
58 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));
\r
59 // The first 2 lines of the alifold stdout file are always the same format
\r
// NOTE(review): readLine() returns null at end of stream; with assertions
// disabled a null here reaches fline.trim() - confirm empty input is handled upstream
60 String fline = reader.readLine();
\r
61 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
\r
62 error + "Sequence Alignment Expected";
\r
// The alignment line is stored trimmed and paired with a score set holding no values
63 structs.add(fline.trim());
\r
64 data.add(newEmptyScore(AlifoldResult.consensusAlignment));
\r
66 fline = reader.readLine();
\r
67 assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :
\r
68 error + "Consensus Structure and Energy Expected";
\r
// The first whitespace-delimited token is the structure; the three numbers
// after it are pulled out one by one with the floatP regex
69 Scanner sc = new Scanner(fline);
\r
70 structs.add(sc.next());
\r
71 for (int i = 0; i < 3; i++) {
\r
// NOTE(review): findInLine() returns null when nothing matches, which
// Float.parseFloat() rejects with a NullPointerException
72 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
74 data.add(newSetScore(AlifoldResult.mfeStructure, scores));
\r
76 // Now the alifold stdout file formats diverge based on arguments
\r
77 fline = reader.readLine();
\r
// Main loop: 'fline' is the line being handled, 'sline' the look-ahead line
80 while ( fline != null) {
\r
82 AlifoldLine ftype = identifyLine(fline);
\r
83 sline = reader.readLine(); // Look ahead
\r
// 'sc' scans the current line, 'nsc' the look-ahead line (when there is one)
84 sc = new Scanner(fline);
\r
85 if (sline != null) nsc = new Scanner(sline);
\r
87 if (ftype.equals(AlifoldLine.PStruct)) {
\r
88 // The -p or --MEA option is specified
\r
89 // The next line should always be frequency of mfe structure
\r
90 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
91 error + "Expected frequency of mfe structure";
\r
// One number comes from the current line, the second from the look-ahead line
92 structs.add(sc.next());
\r
93 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
94 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
95 data.add(newSetScore(AlifoldResult.contactProbabilityStructure, scores));
\r
// The look-ahead line has been used up above, so fetch a new one
97 sline = reader.readLine();
\r
99 else if (ftype.equals(AlifoldLine.centStruct)) {
\r
// Structure followed by three numbers
100 structs.add(sc.next());
\r
101 for (int i = 0; i < 3; i++) {
\r
102 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
104 data.add(newSetScore(AlifoldResult.centroidStructure, scores));
\r
106 else if (ftype.equals(AlifoldLine.MEAStruct)) {
\r
// Structure followed by two numbers
107 structs.add(sc.next());
\r
108 for (int i = 0; i < 2; i++) {
\r
109 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
111 data.add(newSetScore(AlifoldResult.MEAStucture, scores));
\r
113 else if (ftype.equals(AlifoldLine.justStruct)) {
\r
// A structure without numbers is recorded as stochBTStructure with no score values
114 structs.add(sc.next());
\r
115 data.add(newEmptyScore(AlifoldResult.stochBTStructure));
\r
117 else if (ftype.equals(AlifoldLine.stochBTStruct)) {
\r
// Structure followed by two numeric columns
// NOTE(review): nextFloat() parses according to the Scanner's locale, unlike the
// Float.parseFloat() calls used elsewhere - confirm behaviour under non-English locales
118 structs.add(sc.next());
\r
119 scores.add(sc.nextFloat());
\r
120 scores.add(sc.nextFloat());
\r
121 data.add(newSetScore(AlifoldResult.stochBTStructure, scores));
\r
123 else if (ftype.equals(AlifoldLine.freeEnergy)) {
\r
124 assert (sline != null
\r
125 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
126 error + "Found 'freeEnergy' line on its own";
\r
// No structure belongs to this record, so a fixed description is stored in its place
127 structs.add("Free energy of ensemble (kcal/mol) followed by "
\r
128 + "frequency of mfe structure in ensemble");
\r
// Energy from the current line, frequency from the look-ahead line
129 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
130 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
131 data.add(newSetScore(AlifoldResult.ensembleValues, scores));
\r
// The look-ahead line has been used up above, so fetch a new one
133 sline = reader.readLine();
\r
// Line types that are not expected at this point (only checked with assertions enabled)
137 assert(!ftype.equals(AlifoldLine.ensembleFreq)) :
\r
138 error + "Wasn't expecting 'frequency of mfe structure'!";
\r
139 assert(!ftype.equals(AlifoldLine.mfeStruct)) :
\r
140 error + "'Standard output' line at a place other than line 2!";
\r
141 assert(!ftype.equals(AlifoldLine.alignment)) :
\r
142 error + "Wasn't expecting an alignment sequence!";
\r
143 assert(!ftype.equals(AlifoldLine.OTHER)) :
\r
144 error + "Wasn't expecting this whatever it is: " + fline;
\r
// Blank or whitespace-only lines are reported through the logger
145 if (Pattern.matches("^\\s*$", fline)) {
\r
146 log.warn("While parsing alifold stdout: A line is either empty or"
\r
147 + " contains only whitespace");
\r
// 'nsc' exists only if a look-ahead line was ever available
154 if (nsc != null) nsc.close();
\r
156 return new RNAStructScoreManager(structs, data);
\r
159 // Just for the purpose of creating new TreeSet<Score> objects of length one
\r
160 // for adding to a 'data' list to make a ScoreManager
\r
161 private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
\r
162 // first convert List<Float> to float[]
\r
163 float[] scoresf = new float[scores.size()];
\r
165 for (int i = 0; i < scoresf.length; i++) {
\r
167 scoresf[i] = ( f != null ? f : Float.NaN);
\r
169 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));
\r
172 // A method just for the purpose of neatly creating Almost Empty score objects
\r
173 // that can't be null
\r
174 public static TreeSet<Score> newEmptyScore(Enum<?> res) {
\r
175 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));
\r
/**
 * Parses alifold standard output together with the accompanying alifold.out
 * data and returns the combined result.
 *
 * The standard output is parsed first by readRNAStructStream(InputStream).
 * The 'alifold' stream is then scanned with whitespace and '%' as token
 * separators, after skipping its first two lines. Each record read yields a
 * Range and a float score; every (range, score) pair becomes one Score tagged
 * AlifoldResult.contactProbabilities. The resulting set replaces element 0 of
 * the data list obtained from the standard-output parse.
 *
 * NOTE(review): the loop header, the declaration of the token 's' and the end
 * of the loop body are absent from this listing - confirm against the
 * complete source.
 *
 * @param stdout  stream holding the alifold standard output
 * @param alifold stream holding the alifold.out contents
 * @return a score manager over the structures from stdout and the updated data list
 * @throws IOException propagated from the single-stream overload
 */
178 public static RNAStructScoreManager readRNAStructStream(InputStream stdout,
\r
179 InputStream alifold) throws IOException {
\r
181 // The Lists required to construct a ScoreManager using the new constructor
\r
182 List<String> structs;
\r
183 List<TreeSet<Score>> data;
\r
185 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)
\r
186 RNAStructScoreManager stdSM = readRNAStructStream(stdout);
\r
188 // Unpack this into the structs and data lists
\r
189 structs = stdSM.getStructs();
\r
190 data = stdSM.getData();
\r
192 // Now parse alifold.out
\r
193 Scanner sc = new Scanner(alifold);
\r
// Tokens are separated by runs of whitespace and/or '%'
194 sc.useDelimiter("[\\s%]+");
\r
196 // jump two lines to the data
// NOTE(review): nextLine() throws NoSuchElementException when the stream has
// fewer than two lines - confirm the input always has them
\r
197 sc.nextLine(); sc.nextLine();
\r
199 // Read the first, second and fourth columns. Ignoring everything else.
\r
200 // Allocate necessary data structures for creating Score objects
\r
201 ArrayList<Float> scores = new ArrayList<Float>();
\r
202 List<Range> rangeHolder = new ArrayList<Range>();
\r
// Stop at a token made only of '.', '(' and ')' (two or more characters), i.e. a structure string
// NOTE(review): 's' is presumably the token just read from 'sc'; its declaration is not visible here
206 if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;
\r
// Stop when the scanner has no further line
207 if (!sc.hasNextLine()) break;
\r
// The range runs from the first token (parsed as an int) to the next int token
208 int t = sc.nextInt();
\r
209 rangeHolder.add(new Range(Integer.parseInt(s), t));
\r
// NOTE(review): the step that skips the third column is not visible in this listing
211 scores.add(sc.nextFloat());
\r
216 // Update the first ScoreHolder TreeSet<Score> element
\r
// Ranges and scores are collected pairwise, so both lists must have the same length
217 assert (rangeHolder.size() == scores.size());
\r
218 TreeSet<Score> sHolder = new TreeSet<Score>();
\r
// One Score per pair, each wrapping a single-value list and a single-range set
219 for (int i = 0; i < rangeHolder.size(); i++) {
\r
220 ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));
\r
221 TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));
\r
222 sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));
\r
// Replace element 0 of the data list; presumably this is the value-less
// consensus-alignment entry that the single-stream overload adds first - verify
225 data.set(0, sHolder);
\r
227 return new RNAStructScoreManager(structs, data);
\r
230 private static RNAOut identify(String token) {
\r
231 if (Pattern.matches(seqP, token)) {
\r
233 } else if (Pattern.matches(structP, token)) {
\r
234 return RNAOut.STRUCT;
\r
235 } else if (Pattern.matches(energyP, token)) {
\r
236 return RNAOut.ENERGY;
\r
237 } else if (Pattern.matches(freqP, token)) {
\r
238 return RNAOut.FREQ;
\r
241 return RNAOut.OTHER;
\r
244 private static AlifoldLine identifyLine(String line) {
\r
246 for (AlifoldLine il : AlifoldLine.values()) {
\r
247 if (Pattern.matches(il.regex, line)) return il;
\r
249 return AlifoldLine.OTHER;
\r
// Kinds of line that can occur in alifold stdout. Each constant carries the
// whole-line regex that recognises it; identifyLine() tries the constants in
// declaration order and returns the first match.
// NOTE(review): the constant OTHER used elsewhere in this class, the end of
// the constant list and the declaration of the 'regex' field are absent from
// this listing - confirm against the complete source.
252 static enum AlifoldLine {
\r
// Structure followed by three numbers in round brackets
253 mfeStruct (mfeStructP),
\r
// Structure on its own
254 justStruct (justStructP),
\r
// Structure followed by two numbers
255 stochBTStruct (stochBTStructP),
\r
// Structure followed by one number in square brackets
256 PStruct (PStructP),
\r
// Structure followed by one number and two numbers in curly braces
257 centStruct (centStructP),
\r
// Structure followed by curly braces holding a number and an "MEA=" value
258 MEAStruct (MEAStructP),
\r
// "free energy of ensemble = NUMBER kcal/mol"
259 freeEnergy (freeEnergyP),
\r
// "frequency of mfe structure in ensemble NUMBER"
260 ensembleFreq (ensembleFreqP),
\r
// Sequence (alignment) line
261 alignment (alignmentP),
\r
// Stores the regex used to recognise this kind of line
265 AlifoldLine(String regex) { this.regex = regex; }
\r
/**
 * The types of data token that can appear in an RNAalifold stdout file.
 * OTHER is the fallback for tokens that match none of the other kinds.
 */
enum RNAOut {
    SEQ,
    STRUCT,
    ENERGY,
    FREQ,
    OTHER
}
\r
/**
 * Tag placed in the Score objects of an alifold result, telling what kind of
 * entry the Score (and the String stored alongside it) represents.
 *
 * The constant names and their order are part of the public interface and are
 * kept exactly as they were, including the spelling of MEAStucture.
 */
public enum AlifoldResult {
    mfeStructure,
    contactProbabilityStructure,
    MEAStucture,
    centroidStructure,
    stochBTStructure,
    consensusAlignment,
    ensembleValues,
    contactProbabilities
}
\r
282 // Print the full regex Strings for testing
\r
283 public static void main(String[] args) {
\r
284 for (AlifoldLine l : AlifoldLine.values()) {
\r
285 System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));
\r