RegexTokenizer.java

   1 //\r
   2 // This software is now distributed according to\r
   3 // the Lesser Gnu Public License.  Please see\r
   4 // http://www.gnu.org/copyleft/lesser.txt for\r
   5 // the details.\r
   6 //    -- Happy Computing!\r
   7 //\r
   8 package com.stevesoft.pat;\r
   9 \r
  10 import java.util.*;\r
  11 \r
  12 /**\r
  13         Shareware: package pat\r
  14    <a href="copyright.html">Copyright 2001, Steven R. Brandt</a>\r
  15  */\r
  16 /**\r
  17  The RegexTokenizer is similar to the StringTokenizer class\r
  18  provided with java, but allows one to tokenize using\r
  19  regular expressions, rather than a simple list of characters.\r
  20  Tokens are any strings between the supplied regular expression,\r
  21  as well as any backreferences (things in parenthesis)\r
  22  contained within the regular expression. */\r
  23 public class RegexTokenizer\r
  24     implements Enumeration\r
  25 {\r
  26   String toParse;\r
  27   Regex r;\r
  28   int count = 0;\r
  29   Vector v = new Vector();\r
  30   Vector vi = new Vector();\r
  31   int pos = 0;\r
  32 \r
  33   int offset = 1;\r
  34   void getMore()\r
  35   {\r
  36     String s = r.right();\r
  37     if (r.searchFrom(toParse, pos))\r
  38     {\r
  39       v.addElement(r.left().substring(pos));\r
  40       vi.addElement(new Integer(r.matchFrom() +\r
  41                                 r.charsMatched()));\r
  42       for (int i = 0; i < r.numSubs(); i++)\r
  43       {\r
  44         if (r.substring() != null)\r
  45         {\r
  46           v.addElement(r.substring(i + offset));\r
  47           vi.addElement(\r
  48               new Integer(r.matchFrom(i + offset) +\r
  49                           r.charsMatched(i + offset)));\r
  50         }\r
  51       }\r
  52       pos = r.matchFrom() + r.charsMatched();\r
  53     }\r
  54     else if (s != null)\r
  55     {\r
  56       v.addElement(s);\r
  57     }\r
  58   }\r
  59 \r
  60   /** Initialize the tokenizer with a string of text and a pattern */\r
  61   public RegexTokenizer(String txt, String ptrn)\r
  62   {\r
  63     toParse = txt;\r
  64     r = new Regex(ptrn);\r
  65     offset = Regex.BackRefOffset;\r
  66     getMore();\r
  67   }\r
  68 \r
  69   /** Initialize the tokenizer with a Regex object. */\r
  70   public RegexTokenizer(String txt, Regex r)\r
  71   {\r
  72     toParse = txt;\r
  73     this.r = r;\r
  74     offset = Regex.BackRefOffset;\r
  75     getMore();\r
  76   }\r
  77 \r
  78   /** This should always be cast to a String, as in StringTokenizer,\r
  79        and as in StringTokenizer one can do this by calling\r
  80        nextString(). */\r
  81   public Object nextElement()\r
  82   {\r
  83     if (count >= v.size())\r
  84     {\r
  85       getMore();\r
  86     }\r
  87     return v.elementAt(count++);\r
  88   }\r
  89 \r
  90   /** This is the equivalent (String)nextElement(). */\r
  91   public String nextToken()\r
  92   {\r
  93     return (String) nextElement();\r
  94   }\r
  95 \r
  96   /** This asks for the next token, and changes the pattern\r
  97        being used at the same time. */\r
  98   public String nextToken(String newpat)\r
  99   {\r
 100     try\r
 101     {\r
 102       r.compile(newpat);\r
 103     }\r
 104     catch (RegSyntax r_)\r
 105     {}\r
 106     return nextToken(r);\r
 107   }\r
 108 \r
 109   /** This asks for the next token, and changes the pattern\r
 110        being used at the same time. */\r
 111   public String nextToken(Regex nr)\r
 112   {\r
 113     r = nr;\r
 114     if (vi.size() > count)\r
 115     {\r
 116       pos = ( (Integer) vi.elementAt(count)).intValue();\r
 117       v.setSize(count);\r
 118       vi.setSize(count);\r
 119     }\r
 120     getMore();\r
 121     return nextToken();\r
 122   }\r
 123 \r
 124   /** Tells whether there are more tokens in the pattern. */\r
 125   public boolean hasMoreElements()\r
 126   {\r
 127     if (count >= v.size())\r
 128     {\r
 129       getMore();\r
 130     }\r
 131     return count < v.size();\r
 132   }\r
 133 \r
 134   /** Tells whether there are more tokens in the pattern, but\r
 135        in the fashion of StringTokenizer. */\r
 136   public boolean hasMoreTokens()\r
 137   {\r
 138     return hasMoreElements();\r
 139   }\r
 140 \r
 141   /** Determines the # of remaining tokens */\r
 142   public int countTokens()\r
 143   {\r
 144     int _count = count;\r
 145     while (hasMoreTokens())\r
 146     {\r
 147       nextToken();\r
 148     }\r
 149     count = _count;\r
 150     return v.size() - count;\r
 151   }\r
 152 \r
 153   /** Returns all tokens in the String */\r
 154   public String[] allTokens()\r
 155   {\r
 156     countTokens();\r
 157     String[] ret = new String[v.size()];\r
 158     v.copyInto(ret);\r
 159     return ret;\r
 160   }\r
 161 };\r