From: jprocter Date: Fri, 7 Dec 2012 15:07:54 +0000 (+0000) Subject: revised RNA stem/loop processing code that copes with pseudoknots X-Git-Tag: Jalview_2_9~221^2^2~8^2~30 X-Git-Url: http://source.jalview.org/gitweb/?p=jalview.git;a=commitdiff_plain;h=6819474b5da8df0145cfe2c65ea9445609332c20 revised RNA stem/loop processing code that copes with pseudoknots --- diff --git a/src/jalview/analysis/Rna.java b/src/jalview/analysis/Rna.java index 3d7eab5..c5207f7 100644 --- a/src/jalview/analysis/Rna.java +++ b/src/jalview/analysis/Rna.java @@ -23,16 +23,60 @@ package jalview.analysis; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.Hashtable; import java.util.Stack; import java.util.Vector; + +import jalview.analysis.SecStrConsensus.SimpleBP; import jalview.datamodel.SequenceFeature; public class Rna { - static Hashtable pairHash = new Hashtable(); + + static Hashtable pairHash = new Hashtable(); + + private static final Character[] openingPars = {'(','[','{','<','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z'}; + private static final Character[] closingPars = {')',']','}','>','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'}; + + private static HashSet openingParsSet = new HashSet(Arrays.asList(openingPars)); + private static HashSet closingParsSet = new HashSet(Arrays.asList(closingPars)); + private static Hashtable closingToOpening = new Hashtable() + // Initializing final data structure + { + private static final long serialVersionUID = 1L; + { + for(int i=0;i"+openingPars[i]); + put(closingPars[i],openingPars[i]); + } + }}; + + private static boolean isOpeningParenthesis(char c) + { + return openingParsSet.contains(c); + } + + private static boolean isClosingParenthesis(char c) + { + return closingParsSet.contains(c); + } + private static char matchingOpeningParenthesis(char closingParenthesis) throws WUSSParseException + { + if (!isClosingParenthesis(closingParenthesis)) + { + throw new WUSSParseException("Querying matching opening parenthesis for non-closing parenthesis character "+closingParenthesis, -1); + } + + return closingToOpening.get(closingParenthesis); + } + /** * Based off of RALEE code ralee-get-base-pairs. Keeps track of open bracket * positions in "stack" vector. When a close bracket is reached, pair this @@ -45,69 +89,89 @@ public class Rna * @return Array of SequenceFeature; type = RNA helix, begin is open base * pair, end is close base pair */ - public static SequenceFeature[] GetBasePairs(CharSequence line) - throws WUSSParseException + public static Vector GetSimpleBPs(CharSequence line) throws WUSSParseException { - Stack stack = new Stack(); - Vector pairs = new Vector(); - + System.out.println(line); + Hashtable> stacks = new Hashtable>(); + Vector pairs = new Vector(); int i = 0; while (i < line.length()) { char base = line.charAt(i); - - if ((base == '<') || (base == '(') || (base == '{') || (base == '[')|| (base == 'A')|| (base == 'B')|| (base == 'C')|| (base == 'D')|| (base == '1')|| (base == 'F')|| (base == 'G')|| (base == '2')|| (base == 'I')|| (base == 'J')|| (base == 'K')|| (base == 'L')|| (base == 'M')|| (base == 'N')|| (base == 'O')|| (base == 'P')|| (base == 'Q')|| (base == 'R')|| (base == 'S')|| (base == 'T')|| (base == 'U')|| (base == 'V')|| (base == 'W')|| (base == 'X')|| (base == 'Y')|| (base == 'Z')) + + if (isOpeningParenthesis(base)) { - stack.push(i); + if (!stacks.containsKey(base)){ + stacks.put(base, new Stack()); + } + stacks.get(base).push(i); + } - else if ((base == '>') || (base == ')') || (base == '}')|| (base == ']')|| (base == 'a')|| (base == 'b')|| (base == 'c')|| (base == 'd')|| (base == 'e')|| (base == 'f')|| (base == 'g')|| (base == 'h')|| (base == 'i')|| (base == 'j')|| (base == 'k')|| (base == 'l')|| (base == 'm')|| (base == 'n')|| (base == 'o')|| (base == 'p')|| (base == 'q')|| (base == 'r')|| (base == 's')|| (base == 't')|| (base == 'u')|| (base == 'v')|| (base == 'w')|| (base == 'x')|| (base == 'y')|| (base == 'z')) - + else if (isClosingParenthesis(base)) { - + + char opening = matchingOpeningParenthesis(base); + + if (!stacks.containsKey(opening)){ + throw new WUSSParseException("Mismatched (unseen) closing character "+base, i); + } + + Stack stack = stacks.get(opening); if (stack.isEmpty()) { // error whilst parsing i'th position. pass back - throw new WUSSParseException("Mismatched closing bracket", i); + throw new WUSSParseException("Mismatched closing character "+base, i); } - Object temp = stack.pop(); - pairs.addElement(temp); - pairs.addElement(i); + int temp = stack.pop(); + + pairs.add(new SimpleBP(temp,i)); } - i++; } - - int numpairs = pairs.size() / 2; - SequenceFeature[] outPairs = new SequenceFeature[numpairs]; - - // Convert pairs to array - for (int p = 0; p < pairs.size(); p += 2) + for(char opening: stacks.keySet()) { - int begin = Integer.parseInt(pairs.elementAt(p).toString()); - int end = Integer.parseInt(pairs.elementAt(p + 1).toString()); - - outPairs[p / 2] = new SequenceFeature("RNA helix", "", "", begin, - end, ""); - // pairHash.put(begin, end); - + Stack stack = stacks.get(opening); + if (!stack.empty()) + { + throw new WUSSParseException("Mismatched opening character "+opening+" at "+stack.pop(), i); + } + } + return pairs; + } + + public static SequenceFeature[] GetBasePairs(CharSequence line) throws WUSSParseException + { + Vector bps = GetSimpleBPs(line); + SequenceFeature[] outPairs = new SequenceFeature[bps.size()]; + for (int p = 0; p < bps.size(); p++) + { + SimpleBP bp = bps.elementAt(p); + outPairs[p] = new SequenceFeature("RNA helix", "", "", bp.getBP5(),bp.getBP3(), ""); } - return outPairs; } - + + + public static ArrayList GetModeleBP(CharSequence line) throws WUSSParseException + { + Vector bps = GetSimpleBPs(line); + return new ArrayList(bps); + } + + /** * Function to get the end position corresponding to a given start position - * - * @param indice - * - start position of a base pair + * @param indice - start position of a base pair * @return - end position of a base pair */ - /* - * makes no sense at the moment :( public int findEnd(int indice){ //TODO: - * Probably extend this to find the start to a given end? //could be done by - * putting everything twice to the hash ArrayList pair = new - * ArrayList(); return pairHash.get(indice); } - */ + /*makes no sense at the moment :( + public int findEnd(int indice){ + //TODO: Probably extend this to find the start to a given end? + //could be done by putting everything twice to the hash + ArrayList pair = new ArrayList(); + return pairHash.get(indice); + }*/ + /** * Figures out which helix each position belongs to and stores the helix @@ -191,3 +255,4 @@ public class Rna } } } +