1 package compbio.data.sequence;
\r
3 import java.io.BufferedReader;
\r
4 import java.io.InputStream;
\r
5 import java.io.InputStreamReader;
\r
6 import java.io.IOException;
\r
7 import java.util.ArrayList;
\r
8 import java.util.Arrays;
\r
9 import java.util.List;
\r
10 import java.util.Scanner;
\r
11 import java.util.TreeSet;
\r
12 import java.util.regex.Pattern;
\r
14 import org.apache.log4j.Logger;
\r
16 // Utility class for reading alifold output
\r
18 public class RNAStructReader {
\r
20 // Separator and bracket patterns
// The strings below are regex fragments; the line patterns further down are built from them
// by concatenation, so each fragment has to be declared before the pattern that uses it.
\r
// One or more separator characters: '+', '=' or whitespace.
21 static String s = "[+\\s=]+";
\r
// Any single bracket character: ( ) { } [ ]
22 static String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
\r
// Same character class as 's' (whitespace, '=' and '+'), written in a different order.
23 static String notData = "[\\s=+]+";
\r
25 // RNAOut data type patterns
\r
// Two or more letters, underscores or hyphens.
26 static String seqP = "[_\\-a-zA-Z]{2,}"; // Has to match --mis output as well (not just ACGU_)
\r
// Two or more structure characters: '.', '(', ')', '{', '}', '[', ']' or ','.
27 static String structP = "[\\.)({}\\[\\],]{2,}";
\r
// Optional '-', at least one digit, a mandatory '.', optional fraction digits and an
// optional exponent of the form e+NN / e-NN.
28 static String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
\r
// Optional '-', optional leading digits, optional '.', then exactly two digits.
29 static String energyP = "-?[0-9]*\\.?[0-9]{2}";
\r
// Anchored: optional '-', exactly one digit, '.', six or more digits, optional exponent.
30 static String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";
\r
32 // alifold out line patterns
\r
// Optional whitespace (zero or more).
33 static String ps = "\\s*";
\r
// A sequence token followed only by optional whitespace.
34 static String alignmentP = "^"+seqP+ps+"$";
\r
// Structure, separator, then three floats enclosed in parentheses.
35 static String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
\r
// A structure on its own.
36 static String justStructP = "^"+structP+ps+"$";
\r
// Structure followed by two floats.
37 static String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
\r
// Structure followed by one float enclosed in square brackets.
38 static String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
\r
// Structure, one float, then two floats enclosed in curly braces.
39 static String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
\r
// Structure, then curly braces holding a float and a literal "MEA=" followed by a float.
40 static String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
\r
// The literal text "free energy of ensemble", '=', a float and the unit "kcal/mol".
41 static String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
\r
// The literal text "frequency of mfe structure in ensemble " (with exactly one trailing
// space inside the literal) immediately followed by a float.
42 static String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";
\r
// Parses the text of an RNAalifold stdout stream into an RNAStructScoreManager.
//
// Two parallel lists are filled: 'structs' gets one string per recognised record and 'data'
// gets the matching TreeSet<Score>. The first two lines are consumed unconditionally (an
// alignment line, then a structure line carrying three floats). Every later line is
// classified with identifyLine(...) and handled by the branch for that line type.
//
// stdout: stream holding the alifold stdout text; it is wrapped in a BufferedReader here.
// Returns: new RNAStructScoreManager(structs, data).
// Throws: IOException when reading a line from the stream fails.
//
// NOTE(review): every format check in this method is an 'assert', so the checks only run
// when the JVM is started with assertions enabled (-ea).
// NOTE(review): this listing has gaps. The declarations of 'sline' and 'nsc', any statement
// that advances 'fline' inside the loop, and the closing braces are not visible here.
45 public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
\r
46 throws IOException {
\r
// Common prefix for every assertion message raised below.
48 String error = "Error in parsing alifold stdout file: ";
\r
49 // The Lists required to construct a ScoreManager Using the new constructor
\r
50 List<String> structs = new ArrayList<String>();
\r
51 List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();
\r
53 // Allocate necessary data structures for creating Score objects
\r
// 'scores' is reused as a scratch list by every branch that calls newSetScore(...).
// NOTE(review): no reset of 'scores' between records is visible in this listing; confirm
// it is cleared per record, otherwise values would accumulate from one record to the next.
54 ArrayList<Float> scores = new ArrayList<Float>();
\r
// NOTE(review): InputStreamReader is created without a charset, so the platform default applies.
56 BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));
\r
57 // The first 2 lines of the alifold stdout file are always the same format
\r
// NOTE(review): readLine() returns null at end of stream; on an empty stream the
// fline.trim() call below would throw NullPointerException.
58 String fline = reader.readLine();
\r
59 assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
\r
60 error + "Sequence Alignment Expected";
\r
// Line 1: the trimmed alignment line is stored with a Score that has no values.
61 structs.add(fline.trim());
\r
62 data.add(newEmptyScore(AlifoldResult.consensusAlignment));
\r
64 fline = reader.readLine();
\r
65 assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :
\r
66 error + "Consensus Structure and Energy Expected";
\r
// Line 2: the first whitespace-delimited token is the structure, followed by three floats.
67 Scanner sc = new Scanner(fline);
\r
68 structs.add(sc.next());
\r
69 for (int i = 0; i < 3; i++) {
\r
// findInLine returns null when floatP is not found; parseFloat(null) would then throw.
70 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
72 data.add(newSetScore(AlifoldResult.mfeStructure, scores));
\r
74 // Now the alifold stdout file formats diverge based on arguments
\r
75 fline = reader.readLine();
\r
// Main loop: 'fline' is the line being handled, 'sline' the line after it.
78 while ( fline != null) {
\r
80 AlifoldLine ftype = identifyLine(fline);
\r
81 sline = reader.readLine(); // Look ahead
\r
// A new Scanner is created for the current line on every pass.
82 sc = new Scanner(fline);
\r
// 'nsc' is only replaced when a look-ahead line exists; otherwise it keeps its previous value.
83 if (sline != null) nsc = new Scanner(sline);
\r
85 if (ftype.equals(AlifoldLine.PStruct)) {
\r
86 // The -p or --MEA option is specified
\r
87 // The next line should always be frequency of mfe structure
\r
88 assert ( sline != null && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
89 error + "Expected frequency of mfe structure";
\r
// One float from the structure line plus one float from the look-ahead line.
90 structs.add(sc.next());
\r
91 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
92 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
93 data.add(newSetScore(AlifoldResult.contactProbabilityStructure, scores));
\r
// The look-ahead line has been used, so read one more line into 'sline'.
95 sline = reader.readLine();
\r
97 else if (ftype.equals(AlifoldLine.centStruct)) {
\r
// Structure token followed by three floats.
98 structs.add(sc.next());
\r
99 for (int i = 0; i < 3; i++) {
\r
100 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
102 data.add(newSetScore(AlifoldResult.centroidStructure, scores));
\r
104 else if (ftype.equals(AlifoldLine.MEAStruct)) {
\r
// Structure token followed by two floats.
105 structs.add(sc.next());
\r
106 for (int i = 0; i < 2; i++) {
\r
107 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
109 data.add(newSetScore(AlifoldResult.MEAStucture, scores));
\r
111 else if (ftype.equals(AlifoldLine.justStruct)) {
\r
// A structure with no numbers: stored under stochBTStructure with a value-less Score.
112 structs.add(sc.next());
\r
113 data.add(newEmptyScore(AlifoldResult.stochBTStructure));
\r
115 else if (ftype.equals(AlifoldLine.stochBTStruct)) {
\r
// Structure token followed by two floats, read with Scanner.nextFloat().
116 structs.add(sc.next());
\r
117 scores.add(sc.nextFloat());
\r
118 scores.add(sc.nextFloat());
\r
119 data.add(newSetScore(AlifoldResult.stochBTStructure, scores));
\r
121 else if (ftype.equals(AlifoldLine.freeEnergy)) {
\r
122 assert (sline != null
\r
123 && Pattern.matches(AlifoldLine.ensembleFreq.regex, sline)) :
\r
124 error + "Found 'freeEnergy' line on its own";
\r
// This record has no structure string, so a fixed description is stored in its place.
125 structs.add("Free energy of ensemble (kcal/mol) followed by frequency of mfe structure in ensemble");
\r
// One float from the free-energy line plus one float from the look-ahead line.
126 scores.add(Float.parseFloat(sc.findInLine(floatP)));
\r
127 scores.add(Float.parseFloat(nsc.findInLine(floatP)));
\r
128 data.add(newSetScore(AlifoldResult.ensembleValues, scores));
\r
// The look-ahead line has been used, so read one more line into 'sline'.
130 sline = reader.readLine();
\r
// Line types that should never reach this point are reported through assertions only.
133 assert(!ftype.equals(AlifoldLine.ensembleFreq)) :
\r
134 error + "Wasn't expecting 'frequency of mfe structure'!";
\r
135 assert(!ftype.equals(AlifoldLine.mfeStruct)) :
\r
136 error + "'Standard output' line at a place other than line 2!";
\r
137 assert(!ftype.equals(AlifoldLine.alignment)) :
\r
138 error + "Wasn't expecting an alignment sequence!";
\r
139 assert(!ftype.equals(AlifoldLine.OTHER)) :
\r
140 error + "Wasn't expecting this whatever it is: " + fline;
\r
// Closes whichever look-ahead Scanner was created last, if any.
146 if (nsc != null) nsc.close();
\r
148 return new RNAStructScoreManager(structs, data);
\r
151 // Just for the purpose of creating new TreeSet<Score> objects of length one
\r
152 // for adding to a 'data' list to make a ScoreManager
\r
// res: the enum tag passed as first argument to the Score constructor.
// scores: the values to copy into the Score; a null element is stored as Float.NaN.
// Returns: a TreeSet<Score> built from a one-element list holding that Score.
153 private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
\r
154 // first convert List<Float> to float[]
\r
155 float[] scoresf = new float[scores.size()];
\r
157 for (int i = 0; i < scoresf.length; i++) {
\r
// NOTE(review): the declaration of 'f' is not visible in this listing; presumably it is
// the i-th element of 'scores' - confirm against the complete file.
159 scoresf[i] = ( f != null ? f : Float.NaN);
\r
161 return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));
\r
164 // A method just for the purpose of neatly creating Almost Empty score objects
\r
165 // that can't be null
\r
166 public static TreeSet<Score> newEmptyScore(Enum<?> res) {
\r
167 return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));
\r
// Parses RNAalifold stdout together with the accompanying alifold.out stream.
//
// The stdout stream is handed to the single-argument overload first. The rows read from
// 'alifold' are then turned into one Score per row (tag AlifoldResult.contactProbabilities,
// one float, one Range) and the resulting set replaces element 0 of the data list.
//
// stdout: stream with the alifold stdout text.
// alifold: stream with the alifold.out text; read through a Scanner.
// Returns: new RNAStructScoreManager(structs, data) built from the updated lists.
// Throws: IOException as declared; the visible code only propagates it from the
// single-argument overload.
//
// NOTE(review): this listing has gaps. The loop header, the declaration and assignment of
// the local 's', and the closing braces are not visible; no close() of 'sc' is visible either.
170 public static RNAStructScoreManager readRNAStructStream(InputStream stdout,
\r
171 InputStream alifold) throws IOException {
\r
173 // The Lists required to construct a ScoreManager Using the new constructor
\r
174 List<String> structs;
\r
175 List<TreeSet<Score>> data;
\r
177 // Get a ScoreManager that takes the std output but ignores alifold.out (-p)
\r
178 RNAStructScoreManager stdSM = readRNAStructStream(stdout);
\r
180 // Unpack this into the structs and data lists
\r
181 structs = stdSM.getStructs();
\r
182 data = stdSM.getData();
\r
184 // Now parse alifold.out
\r
185 Scanner sc = new Scanner(alifold);
\r
// Tokens are separated by runs of whitespace and/or '%' characters.
186 sc.useDelimiter("[\\s%]+");
\r
188 // jump two lines to the data
\r
189 sc.nextLine(); sc.nextLine();
\r
191 // Read the first, second and fourth columns. Ignoring everything else.
\r
192 // Allocate necessary data structures for creating Score objects
\r
// scores.get(i) and rangeHolder.get(i) belong to the same row.
193 ArrayList<Float> scores = new ArrayList<Float>();
\r
194 List<Range> rangeHolder = new ArrayList<Range>();
\r
// Stop when the token is a dot-bracket string (two or more of '.', '(' and ')').
198 if (java.util.regex.Pattern.matches("^[\\.)(]{2,}$", s)) break;
\r
199 if (!sc.hasNextLine()) break;
\r
// 's' parsed as an int and the next int 't' are the two arguments of the Range.
200 int t = sc.nextInt();
\r
201 rangeHolder.add(new Range(Integer.parseInt(s), t));
\r
203 scores.add(sc.nextFloat());
\r
208 // Update the first ScoreHolder TreeSet<Score> element
\r
// Checked only when assertions are enabled (-ea).
209 assert (rangeHolder.size() == scores.size());
\r
210 TreeSet<Score> sHolder = new TreeSet<Score>();
\r
211 for (int i = 0; i < rangeHolder.size(); i++) {
\r
// Each row becomes its own Score holding a single value and a single Range.
212 ArrayList<Float> singleS = new ArrayList<Float>(Arrays.asList(scores.get(i)));
\r
213 TreeSet<Range> singleR = new TreeSet<Range>(Arrays.asList(rangeHolder.get(i)));
\r
214 sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));
\r
// Overwrites element 0; presumably that is the value-less consensusAlignment entry added
// first by the single-argument overload - confirm what getData() returns.
217 data.set(0, sHolder);
\r
219 return new RNAStructScoreManager(structs, data);
\r
// Classifies a single token by testing it against the token patterns in a fixed order:
// seqP, structP, energyP, then freqP. Pattern.matches requires the whole token to match.
// Returns RNAOut.OTHER when none of the patterns match.
// NOTE(review): the body of the first branch (the seqP case) is not visible in this
// listing; presumably it returns RNAOut.SEQ - confirm against the complete file.
222 private static RNAOut identify(String token) {
\r
223 if (Pattern.matches(seqP, token)) {
\r
225 } else if (Pattern.matches(structP, token)) {
\r
226 return RNAOut.STRUCT;
\r
227 } else if (Pattern.matches(energyP, token)) {
\r
228 return RNAOut.ENERGY;
\r
229 } else if (Pattern.matches(freqP, token)) {
\r
230 return RNAOut.FREQ;
\r
// Fallback when no pattern matched.
233 return RNAOut.OTHER;
\r
236 private static AlifoldLine identifyLine(String line) {
\r
238 for (AlifoldLine il : AlifoldLine.values()) {
\r
239 if (Pattern.matches(il.regex, line)) return il;
\r
241 return AlifoldLine.OTHER;
\r
// The kinds of line that can appear in alifold stdout, each paired with the regex string
// used to recognise it. identifyLine(...) walks values() in declaration order and returns
// the first constant whose regex matches, so the order of the constants matters.
// NOTE(review): the OTHER constant referenced elsewhere in this file, the declaration of
// the 'regex' field and the closing brace are not visible in this listing.
244 static enum AlifoldLine {
\r
245 mfeStruct (mfeStructP),
\r
246 justStruct (justStructP),
\r
247 stochBTStruct (stochBTStructP),
\r
248 PStruct (PStructP),
\r
249 centStruct (centStructP),
\r
250 MEAStruct (MEAStructP),
\r
251 freeEnergy (freeEnergyP),
\r
252 ensembleFreq (ensembleFreqP),
\r
253 alignment (alignmentP),
\r
// Stores the pattern string handed to the constant.
257 AlifoldLine(String regex) { this.regex = regex; }
\r
// The kinds of data token found in an RNAalifold stdout file, as returned by
// identify(String). A nested enum is implicitly static, so the modifier is left out.
enum RNAOut {
    SEQ,
    STRUCT,  // token matched structP
    ENERGY,  // token matched energyP
    FREQ,    // token matched freqP
    OTHER    // token matched none of the token patterns
}
\r
// Tag stored in each Score of an alifold result. It says what kind of record the Score
// belongs to, i.e. what the String held at the same position in the 'structs' list is.
// The constant names are part of the public API and are kept exactly as they are,
// including the spelling of MEAStucture.
public enum AlifoldResult {
    mfeStructure,                 // second stdout line: structure plus three floats
    contactProbabilityStructure,  // PStruct line plus the frequency line that follows it
    MEAStucture,                  // MEAStruct line: structure plus two floats
    centroidStructure,            // centStruct line: structure plus three floats
    stochBTStructure,             // justStruct and stochBTStruct lines
    consensusAlignment,           // first stdout line, stored without values
    ensembleValues,               // free-energy line plus the frequency line that follows it
    contactProbabilities          // one Score per row read from alifold.out
}
\r
273 // Print the full regex Strings for testing
\r
274 public static void main(String[] args) {
\r
275 for (AlifoldLine l : AlifoldLine.values()) {
\r
276 System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));
\r