src/org/json/XMLTokener.java

   1 package org.json;
   2
   3 /*
   4 Copyright (c) 2002 JSON.org
   5
   6 Permission is hereby granted, free of charge, to any person obtaining a copy
   7 of this software and associated documentation files (the "Software"), to deal
   8 in the Software without restriction, including without limitation the rights
   9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 copies of the Software, and to permit persons to whom the Software is
  11 furnished to do so, subject to the following conditions:
  12
  13 The above copyright notice and this permission notice shall be included in all
  14 copies or substantial portions of the Software.
  15
  16 The Software shall be used for Good, not Evil.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24 SOFTWARE.
  25 */
  26
  27 import java.io.Reader;
  28
  29 /**
  30  * The XMLTokener extends the JSONTokener to provide additional methods
  31  * for the parsing of XML texts.
  32  * @author JSON.org
  33  * @version 2015-12-09
  34  */
  35 public class XMLTokener extends JSONTokener {
  36
  37
  38    /** The table of entity values. It initially contains Character values for
  39     * amp, apos, gt, lt, quot.
  40     */
  41    public static final java.util.HashMap<String, Character> entity;
  42
  43    static {
  44        entity = new java.util.HashMap<String, Character>(8);
  45        entity.put("amp",  XML.AMP);
  46        entity.put("apos", XML.APOS);
  47        entity.put("gt",   XML.GT);
  48        entity.put("lt",   XML.LT);
  49        entity.put("quot", XML.QUOT);
  50    }
  51
  52     /**
  53      * Construct an XMLTokener from a Reader.
  54      * @param r A source reader.
  55      */
  56     public XMLTokener(Reader r) {
  57         super(r);
  58     }
  59
  60     /**
  61      * Construct an XMLTokener from a string.
  62      * @param s A source string.
  63      */
  64     public XMLTokener(String s) {
  65         super(s);
  66     }
  67
  68     /**
  69      * Get the text in the CDATA block.
  70      * @return The string up to the <code>]]&gt;</code>.
  71      * @throws JSONException If the <code>]]&gt;</code> is not found.
  72      */
  73     public String nextCDATA() throws JSONException {
  74         char         c;
  75         int          i;
  76         StringBuilder sb = new StringBuilder();
  77         while (more()) {
  78             c = next();
  79             sb.append(c);
  80             i = sb.length() - 3;
  81             if (i >= 0 && sb.charAt(i) == ']' &&
  82                           sb.charAt(i + 1) == ']' && sb.charAt(i + 2) == '>') {
  83                 sb.setLength(i);
  84                 return sb.toString();
  85             }
  86         }
  87         throw syntaxError("Unclosed CDATA");
  88     }
  89
  90
  91     /**
  92      * Get the next XML outer token, trimming whitespace. There are two kinds
  93      * of tokens: the '<' character which begins a markup tag, and the content
  94      * text between markup tags.
  95      *
  96      * @return  A string, or a '<' Character, or null if there is no more
  97      * source text.
  98      * @throws JSONException
  99      */
 100     public Object nextContent() throws JSONException {
 101         char         c;
 102         StringBuilder sb;
 103         do {
 104             c = next();
 105         } while (Character.isWhitespace(c));
 106         if (c == 0) {
 107             return null;
 108         }
 109         if (c == '<') {
 110             return XML.LT;
 111         }
 112         sb = new StringBuilder();
 113         for (;;) {
 114             if (c == 0) {
 115                 return sb.toString().trim();
 116             }
 117             if (c == '<') {
 118                 back();
 119                 return sb.toString().trim();
 120             }
 121             if (c == '&') {
 122                 sb.append(nextEntity(c));
 123             } else {
 124                 sb.append(c);
 125             }
 126             c = next();
 127         }
 128     }
 129
 130
 131     /**
 132      * Return the next entity. These entities are translated to Characters:
 133      *     <code>&amp;  &apos;  &gt;  &lt;  &quot;</code>.
 134      * @param ampersand An ampersand character.
 135      * @return  A Character or an entity String if the entity is not recognized.
 136      * @throws JSONException If missing ';' in XML entity.
 137      */
 138     public Object nextEntity(char ampersand) throws JSONException {
 139         StringBuilder sb = new StringBuilder();
 140         for (;;) {
 141             char c = next();
 142             if (Character.isLetterOrDigit(c) || c == '#') {
 143                 sb.append(Character.toLowerCase(c));
 144             } else if (c == ';') {
 145                 break;
 146             } else {
 147                 throw syntaxError("Missing ';' in XML entity: &" + sb);
 148             }
 149         }
 150         String string = sb.toString();
 151         return unescapeEntity(string);
 152     }
 153
 154     /**
 155      * Unescapes an XML entity encoding;
 156      * @param e entity (only the actual entity value, not the preceding & or ending ;
 157      * @return
 158      */
 159     static String unescapeEntity(String e) {
 160         // validate
 161         if (e == null || e.isEmpty()) {
 162             return "";
 163         }
 164         // if our entity is an encoded unicode point, parse it.
 165         if (e.charAt(0) == '#') {
 166             int cp;
 167             if (e.charAt(1) == 'x') {
 168                 // hex encoded unicode
 169                 cp = Integer.parseInt(e.substring(2), 16);
 170             } else {
 171                 // decimal encoded unicode
 172                 cp = Integer.parseInt(e.substring(1));
 173             }
 174             return new String(new int[] {cp},0,1);
 175         }
 176         Character knownEntity = entity.get(e);
 177         if(knownEntity==null) {
 178             // we don't know the entity so keep it encoded
 179             return '&' + e + ';';
 180         }
 181         return knownEntity.toString();
 182     }
 183
 184
 185     /**
 186      * Returns the next XML meta token. This is used for skipping over <!...>
 187      * and <?...?> structures.
 188      * @return Syntax characters (<code>< > / = ! ?</code>) are returned as
 189      *  Character, and strings and names are returned as Boolean. We don't care
 190      *  what the values actually are.
 191      * @throws JSONException If a string is not properly closed or if the XML
 192      *  is badly structured.
 193      */
 194     public Object nextMeta() throws JSONException {
 195         char c;
 196         char q;
 197         do {
 198             c = next();
 199         } while (Character.isWhitespace(c));
 200         switch (c) {
 201         case 0:
 202             throw syntaxError("Misshaped meta tag");
 203         case '<':
 204             return XML.LT;
 205         case '>':
 206             return XML.GT;
 207         case '/':
 208             return XML.SLASH;
 209         case '=':
 210             return XML.EQ;
 211         case '!':
 212             return XML.BANG;
 213         case '?':
 214             return XML.QUEST;
 215         case '"':
 216         case '\'':
 217             q = c;
 218             for (;;) {
 219                 c = next();
 220                 if (c == 0) {
 221                     throw syntaxError("Unterminated string");
 222                 }
 223                 if (c == q) {
 224                     return Boolean.TRUE;
 225                 }
 226             }
 227         default:
 228             for (;;) {
 229                 c = next();
 230                 if (Character.isWhitespace(c)) {
 231                     return Boolean.TRUE;
 232                 }
 233                 switch (c) {
 234                 case 0:
 235                 case '<':
 236                 case '>':
 237                 case '/':
 238                 case '=':
 239                 case '!':
 240                 case '?':
 241                 case '"':
 242                 case '\'':
 243                     back();
 244                     return Boolean.TRUE;
 245                 }
 246             }
 247         }
 248     }
 249
 250
 251     /**
 252      * Get the next XML Token. These tokens are found inside of angle
 253      * brackets. It may be one of these characters: <code>/ > = ! ?</code> or it
 254      * may be a string wrapped in single quotes or double quotes, or it may be a
 255      * name.
 256      * @return a String or a Character.
 257      * @throws JSONException If the XML is not well formed.
 258      */
 259     public Object nextToken() throws JSONException {
 260         char c;
 261         char q;
 262         StringBuilder sb;
 263         do {
 264             c = next();
 265         } while (Character.isWhitespace(c));
 266         switch (c) {
 267         case 0:
 268             throw syntaxError("Misshaped element");
 269         case '<':
 270             throw syntaxError("Misplaced '<'");
 271         case '>':
 272             return XML.GT;
 273         case '/':
 274             return XML.SLASH;
 275         case '=':
 276             return XML.EQ;
 277         case '!':
 278             return XML.BANG;
 279         case '?':
 280             return XML.QUEST;
 281
 282 // Quoted string
 283
 284         case '"':
 285         case '\'':
 286             q = c;
 287             sb = new StringBuilder();
 288             for (;;) {
 289                 c = next();
 290                 if (c == 0) {
 291                     throw syntaxError("Unterminated string");
 292                 }
 293                 if (c == q) {
 294                     return sb.toString();
 295                 }
 296                 if (c == '&') {
 297                     sb.append(nextEntity(c));
 298                 } else {
 299                     sb.append(c);
 300                 }
 301             }
 302         default:
 303
 304 // Name
 305
 306             sb = new StringBuilder();
 307             for (;;) {
 308                 sb.append(c);
 309                 c = next();
 310                 if (Character.isWhitespace(c)) {
 311                     return sb.toString();
 312                 }
 313                 switch (c) {
 314                 case 0:
 315                     return sb.toString();
 316                 case '>':
 317                 case '/':
 318                 case '=':
 319                 case '!':
 320                 case '?':
 321                 case '[':
 322                 case ']':
 323                     back();
 324                     return sb.toString();
 325                 case '<':
 326                 case '"':
 327                 case '\'':
 328                     throw syntaxError("Bad character in a name");
 329                 }
 330             }
 331         }
 332     }
 333
 334
 335     /**
 336      * Skip characters until past the requested string.
 337      * If it is not found, we are left at the end of the source with a result of false.
 338      * @param to A string to skip past.
 339      */
 340     // The Android implementation of JSONTokener has a public method of public void skipPast(String to)
 341     // even though ours does not have that method, to have API compatibility, our method in the subclass
 342     // should match.
 343     public void skipPast(String to) {
 344         boolean b;
 345         char c;
 346         int i;
 347         int j;
 348         int offset = 0;
 349         int length = to.length();
 350         char[] circle = new char[length];
 351
 352         /*
 353          * First fill the circle buffer with as many characters as are in the
 354          * to string. If we reach an early end, bail.
 355          */
 356
 357         for (i = 0; i < length; i += 1) {
 358             c = next();
 359             if (c == 0) {
 360                 return;
 361             }
 362             circle[i] = c;
 363         }
 364
 365         /* We will loop, possibly for all of the remaining characters. */
 366
 367         for (;;) {
 368             j = offset;
 369             b = true;
 370
 371             /* Compare the circle buffer with the to string. */
 372
 373             for (i = 0; i < length; i += 1) {
 374                 if (circle[j] != to.charAt(i)) {
 375                     b = false;
 376                     break;
 377                 }
 378                 j += 1;
 379                 if (j >= length) {
 380                     j -= length;
 381                 }
 382             }
 383
 384             /* If we exit the loop with b intact, then victory is ours. */
 385
 386             if (b) {
 387                 return;
 388             }
 389
 390             /* Get the next character. If there isn't one, then defeat is ours. */
 391
 392             c = next();
 393             if (c == 0) {
 394                 return;
 395             }
 396             /*
 397              * Shove the character in the circle buffer and advance the
 398              * circle offset. The offset is mod n.
 399              */
 400             circle[offset] = c;
 401             offset += 1;
 402             if (offset >= length) {
 403                 offset -= length;
 404             }
 405         }
 406     }
 407 }