package compbio.data.sequence;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

/**
 * Utility class for reading alifold output.
 *
 * <p>
 * {@link #readRNAStructStream(InputStream)} parses the text alifold writes to
 * standard output; {@link #readRNAStructStream(InputStream, InputStream)}
 * additionally parses the contents of an <code>alifold.out</code> stream.
 *
 * <p>
 * Most format checks are written as <code>assert</code> statements and are
 * therefore only enforced when the JVM runs with assertions enabled. Input
 * that cannot be parsed at all (missing lines, missing numbers) is reported
 * with an {@link IOException} regardless of the assertion setting.
 */
public class RNAStructReader {

	private static final Logger log = Logger.getLogger(RNAStructReader.class);

	// Prefix shared by every parse-error message of the stdout reader
	private static final String PARSE_ERROR = "Error in parsing alifold stdout file: ";

	// Whitespace patterns
	static final String s = "[+\\s=]+";
	static final String bracket = "\\(|\\)|\\{|\\}|\\[|\\]";
	static final String notData = "[\\s=+]+";

	// RNAOut data type patterns
	// Has to match --mis output as well (not just ACGU_)
	static final String seqP = "[_\\-a-zA-Z]{2,}";
	static final String structP = "[\\.)({}\\[\\],]{2,}";
	static final String floatP = "-?\\d+\\.\\d*(e[+\\-]\\d+)?";
	static final String energyP = "-?[0-9]*\\.?[0-9]{2}";
	static final String freqP = "^-?\\d\\.\\d{6,}(e[+\\-]\\d+)?$";

	// alifold out line patterns
	static final String ps = "\\s*";
	static final String alignmentP = "^"+seqP+ps+"$";
	static final String mfeStructP = "^"+structP+s+"\\("+ps+floatP+s+floatP+s+floatP+ps+"\\)"+ps+"$";
	static final String justStructP = "^"+structP+ps+"$";
	static final String stochBTStructP = "^"+structP+s+floatP+s+floatP+ps+"$";
	static final String PStructP = "^"+structP+s+"\\["+ps+floatP+ps+"\\]"+ps+"$";
	static final String centStructP = "^"+structP+s+floatP+ps+"\\{"+ps+floatP+s+floatP+ps+"\\}"+ps+"$";
	static final String MEAStructP = "^"+structP+s+"\\{"+ps+floatP+s+"MEA="+floatP+ps+"\\}"+ps+"$";
	static final String freeEnergyP = "^"+ps+"free energy of ensemble"+ps+"="+ps+floatP+ps+"kcal/mol"+ps+"$";
	static final String ensembleFreqP = "^"+ps+"frequency of mfe structure in ensemble "+floatP+ps+"$";

	/**
	 * Parses the standard output of alifold into a score manager.
	 *
	 * <p>
	 * The first line is stored as the consensus alignment (with an empty
	 * score), the second as the mfe structure with three scores. Every
	 * following line is classified with {@link #identifyLine(String)} and
	 * contributes one structure/score entry depending on its type. Lines of
	 * type <code>PStruct</code> and <code>freeEnergy</code> also consume the
	 * line that follows them.
	 *
	 * @param stdout
	 *            stream holding the alifold standard output; it is read with
	 *            the platform default charset and is not closed here
	 * @return a manager built from the collected structures and score sets
	 * @throws IOException
	 *             if reading fails, if one of the two mandatory header lines
	 *             is missing, if a line that must be followed by a
	 *             "frequency of mfe structure" line is the last line, or if
	 *             an expected number cannot be found in a line
	 */
	public static RNAStructScoreManager readRNAStructStream(InputStream stdout)
			throws IOException {

		// The Lists required to construct a ScoreManager using the new constructor
		List<String> structs = new ArrayList<String>();
		List<TreeSet<Score>> data = new ArrayList<TreeSet<Score>>();

		// Scratch list, cleared and refilled for every parsed line.
		// newSetScore copies it into an array, so reuse is safe.
		List<Float> scores = new ArrayList<Float>();

		BufferedReader reader = new BufferedReader(new InputStreamReader(stdout));

		// The first 2 lines of the alifold stdout file are always the same format
		String fline = reader.readLine();
		if (fline == null) {
			throw new IOException(PARSE_ERROR + "Sequence Alignment Expected");
		}
		assert (Pattern.matches(AlifoldLine.alignment.regex, fline)) :
			PARSE_ERROR + "Sequence Alignment Expected";
		structs.add(fline.trim());
		data.add(newEmptyScore(AlifoldResult.consensusAlignment));

		fline = reader.readLine();
		if (fline == null) {
			throw new IOException(PARSE_ERROR + "Consensus Structure and Energy Expected");
		}
		assert (Pattern.matches(AlifoldLine.mfeStruct.regex, fline)) :
			PARSE_ERROR + "Consensus Structure and Energy Expected";
		Scanner sc = new Scanner(fline);
		try {
			structs.add(sc.next());
			for (int i = 0; i < 3; i++) {
				scores.add(nextFloat(sc, fline));
			}
		} finally {
			sc.close();
		}
		data.add(newSetScore(AlifoldResult.mfeStructure, scores));

		// Now the alifold stdout file formats diverge based on arguments
		fline = reader.readLine();
		while (fline != null) {
			scores.clear();
			AlifoldLine ftype = identifyLine(fline);
			String sline = reader.readLine(); // Look ahead

			sc = new Scanner(fline);
			try {
				switch (ftype) {
				case PStruct:
					// The -p or --MEA option is specified
					// The next line should always be frequency of mfe structure
					assert (sline != null && Pattern.matches(
							AlifoldLine.ensembleFreq.regex, sline)) :
						PARSE_ERROR + "Expected frequency of mfe structure";
					if (sline == null) {
						throw new IOException(PARSE_ERROR
								+ "Expected frequency of mfe structure");
					}
					structs.add(sc.next());
					scores.add(nextFloat(sc, fline));
					scores.add(firstFloat(sline));
					data.add(newSetScore(
							AlifoldResult.contactProbabilityStructure, scores));
					// Jump line: the look-ahead line has been consumed
					sline = reader.readLine();
					break;
				case centStruct:
					structs.add(sc.next());
					for (int i = 0; i < 3; i++) {
						scores.add(nextFloat(sc, fline));
					}
					data.add(newSetScore(AlifoldResult.centroidStructure, scores));
					break;
				case MEAStruct:
					structs.add(sc.next());
					for (int i = 0; i < 2; i++) {
						scores.add(nextFloat(sc, fline));
					}
					data.add(newSetScore(AlifoldResult.MEAStucture, scores));
					break;
				case justStruct:
					structs.add(sc.next());
					data.add(newEmptyScore(AlifoldResult.stochBTStructure));
					break;
				case stochBTStruct:
					structs.add(sc.next());
					// Parsed like every other number in this file, i.e.
					// independent of the default locale
					scores.add(nextFloat(sc, fline));
					scores.add(nextFloat(sc, fline));
					data.add(newSetScore(AlifoldResult.stochBTStructure, scores));
					break;
				case freeEnergy:
					assert (sline != null && Pattern.matches(
							AlifoldLine.ensembleFreq.regex, sline)) :
						PARSE_ERROR + "Found 'freeEnergy' line on its own";
					if (sline == null) {
						throw new IOException(PARSE_ERROR
								+ "Found 'freeEnergy' line on its own");
					}
					structs.add("Free energy of ensemble (kcal/mol) followed by "
							+ "frequency of mfe structure in ensemble");
					scores.add(nextFloat(sc, fline));
					scores.add(firstFloat(sline));
					data.add(newSetScore(AlifoldResult.ensembleValues, scores));
					// Jump line: the look-ahead line has been consumed
					sline = reader.readLine();
					break;
				default:
					// Remaining line types are handled by the checks below
					break;
				}
			} finally {
				sc.close();
			}

			assert (!ftype.equals(AlifoldLine.ensembleFreq)) :
				PARSE_ERROR + "Wasn't expecting 'frequency of mfe structure'!";
			assert (!ftype.equals(AlifoldLine.mfeStruct)) :
				PARSE_ERROR + "'Standard output' line at a place other than line 2!";
			assert (!ftype.equals(AlifoldLine.alignment)) :
				PARSE_ERROR + "Wasn't expecting an alignment sequence!";
			assert (!ftype.equals(AlifoldLine.OTHER)) :
				PARSE_ERROR + "Wasn't expecting this whatever it is: " + fline;
			if (Pattern.matches("^\\s*$", fline)) {
				log.warn("While parsing alifold stdout: A line is either empty or"
						+ " contains only whitespace");
			}

			fline = sline;
		}

		return new RNAStructScoreManager(structs, data);
	}

	// Finds the next number (floatP) in the remainder of the scanner's line.
	// 'line' is only used to build the error message.
	private static float nextFloat(Scanner sc, String line) throws IOException {
		String token = sc.findInLine(floatP);
		if (token == null) {
			throw new IOException(PARSE_ERROR + "Expected a number in line: " + line);
		}
		return Float.parseFloat(token);
	}

	// Returns the first number (floatP) found in the given line
	private static float firstFloat(String line) throws IOException {
		Scanner sc = new Scanner(line);
		try {
			return nextFloat(sc, line);
		} finally {
			sc.close();
		}
	}

	// Just for the purpose of creating new TreeSet<Score> objects of length one
	// for adding to a 'data' list to make a ScoreManager
	private static TreeSet<Score> newSetScore(Enum<?> res, List<Float> scores) {
		// first convert List<Float> to float[]; null entries become NaN
		float[] scoresf = new float[scores.size()];
		for (int i = 0; i < scoresf.length; i++) {
			Float f = scores.get(i);
			scoresf[i] = (f != null ? f : Float.NaN);
		}
		return new TreeSet<Score>(Arrays.asList(new Score(res, scoresf)));
	}

	/**
	 * Creates a one-element set holding an "almost empty" score, i.e. a score
	 * for the given result type with a zero-length score array, so that the
	 * entry is never null.
	 *
	 * @param res
	 *            the result type the score is labelled with
	 * @return a new set containing exactly that score
	 */
	public static TreeSet<Score> newEmptyScore(Enum<?> res) {
		return new TreeSet<Score>(Arrays.asList(new Score(res, new float[0])));
	}

	/**
	 * Parses the alifold standard output and the contents of alifold.out.
	 *
	 * <p>
	 * The standard output is parsed with
	 * {@link #readRNAStructStream(InputStream)}. From the alifold stream the
	 * first two lines are skipped; from each following line the first, second
	 * and fourth columns are read (whitespace and '%' act as separators) and
	 * turned into one score with a single range (columns one and two) and a
	 * single value (column four). Reading stops at the first token that looks
	 * like a dot-bracket structure or at the end of the input. The collected
	 * scores replace the first entry of the data list.
	 *
	 * @param stdout
	 *            stream holding the alifold standard output
	 * @param alifold
	 *            stream holding the alifold.out content; it is closed by this
	 *            method
	 * @return a manager built from the structures of the standard output and
	 *         the updated data list
	 * @throws IOException
	 *             if the standard output cannot be read or parsed
	 */
	public static RNAStructScoreManager readRNAStructStream(InputStream stdout,
			InputStream alifold) throws IOException {

		// Get a ScoreManager that takes the std output but ignores alifold.out (-p)
		RNAStructScoreManager stdSM = readRNAStructStream(stdout);

		// Unpack this into the structs and data lists
		List<String> structs = stdSM.getStructs();
		List<TreeSet<Score>> data = stdSM.getData();

		// Allocate necessary data structures for creating Score objects
		List<Float> scores = new ArrayList<Float>();
		List<Range> rangeHolder = new ArrayList<Range>();

		// Now parse alifold.out
		Scanner sc = new Scanner(alifold);
		try {
			// Numbers in the file use '.' as decimal separator; do not let the
			// default locale change how nextInt/nextFloat read them
			sc.useLocale(Locale.ROOT);
			sc.useDelimiter("[\\s%]+");
			// jump two lines to the data
			sc.nextLine();
			sc.nextLine();
			// Read the first, second and fourth columns. Ignoring everything else.
			while (sc.hasNext()) {
				String first = sc.next();
				if (Pattern.matches("^[\\.)(]{2,}$", first)) {
					break;
				}
				if (!sc.hasNextLine()) {
					break;
				}
				int second = sc.nextInt();
				rangeHolder.add(new Range(Integer.parseInt(first), second));
				sc.next(); // third column is ignored
				scores.add(sc.nextFloat());
				sc.nextLine();
			}
		} finally {
			sc.close();
		}

		// Update the first ScoreHolder TreeSet<Score> element
		assert (rangeHolder.size() == scores.size());
		TreeSet<Score> sHolder = new TreeSet<Score>();
		for (int i = 0; i < rangeHolder.size(); i++) {
			ArrayList<Float> singleS = new ArrayList<Float>(
					Arrays.asList(scores.get(i)));
			TreeSet<Range> singleR = new TreeSet<Range>(
					Arrays.asList(rangeHolder.get(i)));
			sHolder.add(new Score(AlifoldResult.contactProbabilities, singleS, singleR));
		}

		data.set(0, sHolder);

		return new RNAStructScoreManager(structs, data);
	}

	// Classifies a single token by the first data type pattern it matches
	private static RNAOut identify(String token) {
		if (Pattern.matches(seqP, token)) {
			return RNAOut.SEQ;
		} else if (Pattern.matches(structP, token)) {
			return RNAOut.STRUCT;
		} else if (Pattern.matches(energyP, token)) {
			return RNAOut.ENERGY;
		} else if (Pattern.matches(freqP, token)) {
			return RNAOut.FREQ;
		}

		return RNAOut.OTHER;
	}

	// Classifies a whole line: the first AlifoldLine (in declaration order)
	// whose regex matches the complete line wins
	private static AlifoldLine identifyLine(String line) {
		for (AlifoldLine il : AlifoldLine.values()) {
			if (Pattern.matches(il.regex, line)) return il;
		}
		return AlifoldLine.OTHER;
	}

	// The kinds of line in an alifold stdout file, each with the regex that
	// recognises it. OTHER matches anything and therefore has to stay last.
	static enum AlifoldLine {
		mfeStruct (mfeStructP),
		justStruct (justStructP),
		stochBTStruct (stochBTStructP),
		PStruct (PStructP),
		centStruct (centStructP),
		MEAStruct (MEAStructP),
		freeEnergy (freeEnergyP),
		ensembleFreq (ensembleFreqP),
		alignment (alignmentP),
		OTHER (".*");

		String regex;
		AlifoldLine(String regex) { this.regex = regex; }

	}

	// The types of data in an RNAalifold stdout file
	static enum RNAOut {
		SEQ, STRUCT, ENERGY, FREQ, OTHER
	}

	// Something to put in the Score objects of the alifold result which gives information
	// about what kind of sequence it is holding in its String Id.
	public static enum AlifoldResult {
		mfeStructure, contactProbabilityStructure, MEAStucture, centroidStructure, stochBTStructure, consensusAlignment, ensembleValues, contactProbabilities
	}

	// Print the full regex Strings for testing
	public static void main(String[] args) {
		for (AlifoldLine l : AlifoldLine.values()) {
			System.out.println(l.toString() + ": " + l.regex.replace("^","").replace("$",""));
		}
	}

}