src/javajs/util/CifDataParser.java

   1 package javajs.util;\r
   2 \r
   3 import java.io.BufferedReader;\r
   4 \r
   5 import java.util.Hashtable;\r
   6 \r
   7 import java.util.Map;\r
   8 \r
   9 import javajs.api.GenericCifDataParser;\r
  10 import javajs.api.GenericLineReader;\r
  11 \r
  12 \r
  13 \r
  14 public class CifDataParser implements GenericCifDataParser {\r
  15   /**\r
  16    *\r
  17    * A special tokenizer class for dealing with quoted strings in CIF files.\r
  18    * \r
  19    * Greek letters implemented in Jmol 13.3.9 and only for \r
  20    * titles and space groups. All other mark ups ignored.\r
  21    * \r
  22    *<p>\r
  23    * regarding the treatment of single quotes vs. primes in\r
  24    * cif file, PMR wrote:\r
  25    *</p>\r
  26    *<p>\r
  27    *   * There is a formal grammar for CIF\r
  28    * (see http://www.iucr.org/iucr-top/cif/index.html)\r
  29    * which confirms this. The textual explanation is\r
  30    *<p />\r
  31    *<p>\r
  32    * 14. Matching single or double quote characters (' or ") may\r
  33    * be used to bound a string representing a non-simple data value\r
  34    * provided the string does not extend over more than one line.\r
  35    *<p />\r
  36    *<p>\r
  37    * 15. Because data values are invariably separated from other\r
  38    * tokens in the file by white space, such a quote-delimited\r
  39    * character string may contain instances of the character used\r
  40    * to delimit the string provided they are not followed by white\r
  41    * space. For example, the data item\r
  42    *<code>\r
  43    *  _example  'a dog's life'\r
  44    *</code>\r
  45    * is legal; the data value is a dog's life.\r
  46    *</p>\r
  47    *<p>\r
  48    * [PMR - the terminating character(s) are quote+whitespace.\r
  49    * That would mean that:\r
  50    *<code>\r
  51    *  _example 'Jones' life'\r
  52    *</code>\r
  53    * would be an error\r
  54    *</p>\r
  55    *<p>\r
  56    * The CIF format was developed in that late 1980's under the aegis of the\r
  57    * International Union of Crystallography (I am a consultant to the COMCIFs \r
  58    * committee). It was ratified by the Union and there have been several \r
  59    * workshops. mmCIF is an extension of CIF which includes a relational \r
  60    * structure. The formal publications are:\r
  61    *</p>\r
  62    *<p>\r
  63    * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data \r
  64    * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.\r
  65    * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic\r
  66    * Information File (CIF): A New Standard Archive File for Crystallography",\r
  67    * Acta Cryst., A47, 655-685.\r
  68    * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed \r
  69    * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.\r
  70    *</p>\r
  71    */\r
  72   private GenericLineReader reader;\r
  73   private BufferedReader br;\r
  74 \r
  75   private String line;  \r
  76   private String str;\r
  77   private int ich;\r
  78   private int cch;\r
  79   private boolean wasUnQuoted;\r
  80   private String strPeeked;\r
  81   private int ichPeeked;\r
  82   private int fieldCount;\r
  83   private String[] loopData;\r
  84   private SB fileHeader = new SB();\r
  85   private boolean isHeader = true;\r
  86   private String nullString = "\0";\r
  87 \r
  88   /**\r
  89    * Set the string value of what is returned for "." and "?"\r
  90    * \r
  91    * @param nullString null here returns "." and "?"; default is "\0"\r
  92    * \r
  93    */\r
  94   public void setNullValue(String nullString) {\r
  95     this.nullString  = nullString;    \r
  96   }\r
  97 \r
  98   /**\r
  99    * A global, static map that contains field information. The assumption is that\r
 100    * if we read a set of fields for, say, atom_site, once in a lifetime, then\r
 101    * that should be good forever. Those are static lists. Or should be....\r
 102    */\r
 103   private static Map<String, Integer> htFields = new Hashtable<String, Integer>();\r
 104   \r
 105   ////////////////////////////////////////////////////////////////\r
 106   // special tokenizer class\r
 107   ////////////////////////////////////////////////////////////////\r
 108 \r
 109   public CifDataParser() {\r
 110     // for reflection\r
 111   }\r
 112     \r
 113   private String[] fields;\r
 114 \r
 115   @Override\r
 116   public String getLoopData(int i) {\r
 117     return loopData[i];\r
 118   }\r
 119 \r
 120   @Override\r
 121   public int getFieldCount() {\r
 122     return fieldCount;\r
 123   }\r
 124 \r
 125   @Override\r
 126   public String getField(int i) {\r
 127     return fields[i];\r
 128   }\r
 129 \r
 130   /**\r
 131    * A Chemical Information File data parser.\r
 132    * \r
 133    * Should be called immediately upon construction.\r
 134    *  \r
 135    * Two options; one of reader or br should be null, or reader will be\r
 136    * ignored. Just simpler this way...\r
 137    * \r
 138    * @param reader  Anything that can deliver a line of text or null\r
 139    * @param br      A standard BufferedReader.\r
 140    *  \r
 141    */\r
 142   @Override\r
 143   public CifDataParser set(GenericLineReader reader, BufferedReader br) {\r
 144     this.reader = reader;\r
 145     this.br = br;\r
 146     return this;\r
 147   }\r
 148 \r
 149   /**\r
 150    * \r
 151    * @return commented-out section at the start of a CIF file.\r
 152    * \r
 153    */\r
 154   @Override\r
 155   public String getFileHeader() {\r
 156     return fileHeader.toString();\r
 157   }\r
 158   \r
 159   \r
 160   /**\r
 161    * Parses all CIF data for a reader defined in the constructor\r
 162    * into a standard Map structure and close the BufferedReader if\r
 163    * it exists. \r
 164    * \r
 165    * @return Hashtable of models Vector of Hashtable data\r
 166    */\r
 167   @Override\r
 168   public Map<String, Object> getAllCifData() {\r
 169     line = "";\r
 170     String key;\r
 171     Map<String, Object> data = null;\r
 172     Map<String, Object> allData = new Hashtable<String, Object>();\r
 173     Lst<Map<String, Object>> models = new  Lst<Map<String,Object>>();\r
 174     allData.put("models", models);\r
 175     try {\r
 176       while ((key = getNextToken()) != null) {\r
 177         if (key.startsWith("global_") || key.startsWith("data_")) {\r
 178           models.addLast(data = new Hashtable<String, Object>());\r
 179           data.put("name", key);\r
 180           continue;\r
 181         }\r
 182         if (key.startsWith("loop_")) {\r
 183           getAllCifLoopData(data);\r
 184           continue;\r
 185         }\r
 186         if (key.charAt(0) != '_') {\r
 187           System.out.println("CIF ERROR ? should be an underscore: " + key);\r
 188         } else {\r
 189           String value = getNextToken();\r
 190           if (value == null) {\r
 191             System.out.println("CIF ERROR ? end of file; data missing: " + key);\r
 192           } else {\r
 193             data.put(fixKey(key), value);\r
 194           }\r
 195         }\r
 196       }\r
 197     } catch (Exception e) {\r
 198       // ?\r
 199     }\r
 200     try {\r
 201       if (br != null)\r
 202         br.close();\r
 203     } catch (Exception e) {\r
 204       // ?\r
 205     }\r
 206     return allData;\r
 207   }\r
 208 \r
 209   /**\r
 210    * create our own list of keywords and for each one create a list\r
 211    * of data associated with that keyword. For example, a list of all \r
 212    * x coordinates, then a list of all y coordinates, etc.\r
 213    * \r
 214    * @param data\r
 215    * @throws Exception\r
 216    */\r
 217   @SuppressWarnings("unchecked")\r
 218   private void getAllCifLoopData(Map<String, Object> data) throws Exception {\r
 219     String key;\r
 220     Lst<String> keyWords = new  Lst<String>();\r
 221     while ((key = peekToken()) != null && key.charAt(0) == '_') {\r
 222       key = fixKey(getTokenPeeked());\r
 223       keyWords.addLast(key);\r
 224       data.put(key, new  Lst<String>());\r
 225     }\r
 226     fieldCount = keyWords.size();\r
 227     if (fieldCount == 0)\r
 228       return;\r
 229     loopData = new String[fieldCount];\r
 230     while (getData())\r
 231       for (int i = 0; i < fieldCount; i++)\r
 232         ((Lst<String>)data.get(keyWords.get(i))).addLast(loopData[i]);\r
 233   }\r
 234 \r
 235   @Override\r
 236   public String readLine() {\r
 237     try {\r
 238       line = (reader == null ? br.readLine() : reader.readNextLine());\r
 239       if (line == null)\r
 240         return null;\r
 241       if (isHeader) {\r
 242         if (line.startsWith("#"))\r
 243           fileHeader.append(line).appendC('\n');\r
 244         else\r
 245           isHeader = false;\r
 246       }\r
 247       return line;\r
 248     } catch (Exception e) {\r
 249       return null;\r
 250     }\r
 251   }\r
 252   \r
 253   /**\r
 254    * The work horse; a general reader for loop data.\r
 255    * Fills loopData with fieldCount fields.\r
 256    * \r
 257    * @return false if EOF\r
 258    * @throws Exception\r
 259    */\r
 260   @Override\r
 261   public boolean getData() throws Exception {\r
 262     // line is already present, and we leave with the next line to parse\r
 263     for (int i = 0; i < fieldCount; ++i)\r
 264       if ((loopData[i] = getNextDataToken()) == null)\r
 265         return false;\r
 266     return (fieldCount > 0);\r
 267   }\r
 268 \r
 269   /**\r
 270    * \r
 271    * Skips all associated loop data. (Skips to next control word.)\r
 272    * \r
 273    * @throws Exception\r
 274    */\r
 275   @Override\r
 276   public String skipLoop(boolean doReport) throws Exception {\r
 277     String str;\r
 278     SB ret = (doReport ? new SB() : null);\r
 279     int n = 0;\r
 280     while ((str = peekToken()) != null && str.charAt(0) == '_') {\r
 281       if (ret != null)\r
 282         ret.append(str).append("\n");\r
 283       getTokenPeeked();\r
 284       n++;\r
 285     }\r
 286     int m = 0;\r
 287     while ((str = getNextDataToken()) != null) {\r
 288       if (ret == null)\r
 289         continue; \r
 290       ret.append(str).append(" ");\r
 291       if ((++m % n) == 0)\r
 292         ret.append("\n");\r
 293     }\r
 294     return (ret == null ? null : ret.toString());\r
 295   }\r
 296 \r
 297   /**\r
 298    * \r
 299    * @return the next token of any kind, or null\r
 300    * @throws Exception\r
 301    */\r
 302   @Override\r
 303   public String getNextToken() throws Exception {\r
 304     while (!strHasMoreTokens())\r
 305       if (setStringNextLine() == null)\r
 306         return null;\r
 307     return nextStrToken();\r
 308   }\r
 309 \r
 310   /**\r
 311    * \r
 312    * first checks to see if the next token is an unquoted\r
 313    * control code, and if so, returns null \r
 314    * \r
 315    * @return next data token or null\r
 316    * @throws Exception\r
 317    */\r
 318   @Override\r
 319   public String getNextDataToken() throws Exception { \r
 320     String str = peekToken();\r
 321     if (str == null)\r
 322       return null;\r
 323     if (wasUnQuoted)\r
 324       if (str.charAt(0) == '_' || str.startsWith("loop_")\r
 325           || str.startsWith("data_")\r
 326           || str.startsWith("stop_")\r
 327           || str.startsWith("global_"))\r
 328         return null;\r
 329     return getTokenPeeked();\r
 330   }\r
 331   \r
 332   /**\r
 333    * Just look at the next token. Saves it for retrieval \r
 334    * using getTokenPeeked()\r
 335    * \r
 336    * @return next token or null if EOF\r
 337    * @throws Exception\r
 338    */\r
 339   @Override\r
 340   public String peekToken() throws Exception {\r
 341     while (!strHasMoreTokens())\r
 342       if (setStringNextLine() == null)\r
 343         return null;\r
 344     int ich = this.ich;\r
 345     strPeeked = nextStrToken();\r
 346     ichPeeked= this.ich;\r
 347     this.ich = ich;\r
 348     return strPeeked;\r
 349   }\r
 350   \r
 351   /**\r
 352    * \r
 353    * @return the token last acquired; may be null\r
 354    */\r
 355   @Override\r
 356   public String getTokenPeeked() {\r
 357     ich = ichPeeked;\r
 358     return strPeeked;\r
 359   }\r
 360   \r
 361   /**\r
 362    * Used especially for data that might be multi-line data that\r
 363    * might have unwanted white space at start or end.\r
 364    * \r
 365    * @param str\r
 366    * @return str without any leading/trailing white space, and no '\n'\r
 367    */\r
 368   @Override\r
 369   public String fullTrim(String str) {\r
 370     int pt0 = -1;\r
 371     int pt1 = str.length();\r
 372     while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) {\r
 373     }\r
 374     while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) {      \r
 375     }\r
 376     return str.substring(pt0, pt1 + 1);\r
 377   }\r
 378 \r
 379   private final static String grABC =\r
 380       "ABX\u0394E\u03A6\u0393H"   // ABCDEFGH\r
 381       + "I_K\u039BMNO\u03A0"      // I_KLMNOP\r
 382       + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ\r
 383   private final static String grabc =\r
 384       "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh\r
 385       + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0"    // i_klmnop\r
 386       + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz\r
 387 \r
 388   /**\r
 389    * Only translating the basic Greek set here, not all the other stuff. See\r
 390    * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup\r
 391    * \r
 392    * @param data\r
 393    * @return cleaned string\r
 394    */\r
 395   @Override\r
 396   public String toUnicode(String data) {\r
 397     int pt;\r
 398     try {\r
 399       while ((pt = data.indexOf('\\')) >= 0) {\r
 400         int c = data.charAt(pt + 1);\r
 401         String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64)\r
 402             : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_");\r
 403         data = data.substring(0, pt) + ch + data.substring(pt + 2);\r
 404       }\r
 405     } catch (Exception e) {\r
 406       // ignore\r
 407     }\r
 408 \r
 409     return data;\r
 410   }\r
 411 \r
 412   /**\r
 413    * Passing an array of field names, this method fills two arrays. \r
 414    * The first, fieldOf, identifies \r
 415    * It does this by first creating a map of names to their indices in fields[].\r
 416    * \r
 417    * Alternatively, if fields is null, then a private array is filled, in order, \r
 418    * with key data. This is used in cases such as matrices for which there are simply\r
 419    * too many possibilities to list, and the key name itself contains the x-y \r
 420    * information that we need.\r
 421    * \r
 422    */\r
 423    @Override\r
 424   public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception {\r
 425      int propertyCount = 0;\r
 426      if (fields == null) {\r
 427        // for reading full list of keys, as for matrices\r
 428        this.fields = new String[100];\r
 429      } else {\r
 430        if (!htFields.containsKey(fields[0]))\r
 431          for (int i = fields.length; --i >= 0;)\r
 432            htFields.put(fields[i], Integer.valueOf(i));\r
 433        for (int i = fields.length; --i >= 0;)\r
 434          fieldOf[i] = NONE;\r
 435        propertyCount = fields.length;\r
 436      }\r
 437      fieldCount = 0;\r
 438      while (true) {\r
 439        String str = peekToken();\r
 440        if (str == null) {\r
 441          // we are PREMATURELY done; reset\r
 442          fieldCount = 0;\r
 443          break;\r
 444        }\r
 445        // end of the loop is a new token starting with underscore\r
 446        if (str.charAt(0) != '_')\r
 447          break;\r
 448        \r
 449        int pt = fieldCount++;\r
 450        str = fixKey(getTokenPeeked());\r
 451        if (fields == null) {\r
 452          // just make a linear model, saving the list\r
 453          this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str;\r
 454          continue;\r
 455        }\r
 456        Integer iField = htFields.get(str);\r
 457        int i = (iField == null ? NONE : iField.intValue());\r
 458        if ((propertyOf[pt] = i) != NONE)\r
 459          fieldOf[i] = pt;\r
 460      }\r
 461      if (fieldCount > 0)\r
 462        loopData = new String[fieldCount];\r
 463      return propertyCount;\r
 464   }\r
 465 \r
 466   @Override\r
 467   public String fixKey(String key) {\r
 468     // PRELIMINARY -- BilBao _magnetic\r
 469     // PRELIMINARY -- Jana2006\r
 470     return (PT.rep(\r
 471         key.startsWith("_magnetic") ? key.substring(9) \r
 472             : key.startsWith("_jana") ? key.substring(5) \r
 473             : key, ".", "_").toLowerCase());\r
 474   }\r
 475 \r
 476   //////////////////// private methods ////////////////////\r
 477   \r
 478   \r
 479   /**\r
 480    * sets a string to be parsed from the beginning\r
 481    * \r
 482    * @param str\r
 483    */\r
 484   private void setString(String str) {\r
 485     this.str = line = str;\r
 486     cch = (str == null ? 0 : str.length());\r
 487     ich = 0;\r
 488   }\r
 489 \r
 490   /*\r
 491    * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax\r
 492    * \r
 493    * 17. The special sequence of end-of-line followed \r
 494    * immediately by a semicolon in column one (denoted "<eol>;") \r
 495    * may also be used as a delimiter at the beginning and end \r
 496    * of a character string comprising a data value. The complete \r
 497    * bounded string is called a text field, and may be used to \r
 498    * convey multi-line values. The end-of-line associated with \r
 499    * the closing semicolon does not form part of the data value. \r
 500    * Within a multi-line text field, leading white space within \r
 501    * text lines must be retained as part of the data value; trailing \r
 502    * white space on a line may however be elided.\r
 503    * \r
 504    * 18. A text field delimited by the <eol>; digraph may not \r
 505    * include a semicolon at the start of a line of text as \r
 506    * part of its value.\r
 507    * \r
 508    * 20. For example, the data value foo may be expressed \r
 509    * equivalently as an unquoted string foo, as a quoted \r
 510    * string 'foo' or as a text field\r
 511    *\r
 512    *;foo\r
 513    *;\r
 514    *\r
 515    * By contrast the value of the text field\r
 516    *\r
 517    *; foo\r
 518    *  bar\r
 519    *;\r
 520    *\r
 521    * is  foo<eol>  bar (where <eol> represents an end-of-line); \r
 522    * the embedded space characters are significant.\r
 523    * \r
 524    * \r
 525    * I (BH) note, however, that we sometimes have:\r
 526    * \r
 527    * _some_name\r
 528    * ;\r
 529    * the name here\r
 530    * ;\r
 531    * \r
 532    * so this should actually be\r
 533    * \r
 534    * ;the name here\r
 535    * ;\r
 536    * \r
 537    * for this, we use fullTrim();\r
 538    * \r
 539    */\r
 540   \r
 541   /**\r
 542    * \r
 543    * sets the string for parsing to be from the next line \r
 544    * when the token buffer is empty, and if ';' is at the \r
 545    * beginning of that line, extends the string to include\r
 546    * that full multiline string. Uses \1 to indicate that \r
 547    * this is a special quotation. \r
 548    * \r
 549    * @return  the next line or null if EOF\r
 550    * @throws Exception\r
 551    */\r
 552   private String setStringNextLine() throws Exception {\r
 553     setString(readLine());\r
 554     if (line == null || line.length() == 0)\r
 555       return line;\r
 556     if (line.charAt(0) != ';') {\r
 557       if (str.startsWith("###non-st#"))\r
 558         ich = 10;\r
 559       return line;\r
 560     }\r
 561     ich = 1;\r
 562     String str = '\1' + line.substring(1) + '\n';\r
 563     while (readLine() != null) {\r
 564       if (line.startsWith(";")) {\r
 565         // remove trailing <eol> only, and attach rest of next line\r
 566         str = str.substring(0, str.length() - 1)\r
 567           + '\1' + line.substring(1);\r
 568         break;\r
 569       }\r
 570       str += line + '\n';\r
 571     }\r
 572     setString(str);\r
 573     return str;\r
 574   }\r
 575 \r
 576   /**\r
 577    * @return TRUE if there are more tokens in the line buffer\r
 578    * \r
 579    */\r
 580   private boolean strHasMoreTokens() {\r
 581     if (str == null)\r
 582       return false;\r
 583     char ch = '#';\r
 584     while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t'))\r
 585       ++ich;\r
 586     return (ich < cch && ch != '#');\r
 587   }\r
 588 \r
 589   /**\r
 590    * assume that hasMoreTokens() has been called and that\r
 591    * ich is pointing at a non-white character. Also sets\r
 592    * boolean wasUnQuoted, because we need to know if we should \r
 593    * be checking for a control keyword. 'loop_' is different from just \r
 594    * loop_ without the quotes.\r
 595    *\r
 596    * @return null if no more tokens, "\0" if '.' or '?', or next token \r
 597    */\r
 598   private String nextStrToken() {\r
 599     if (ich == cch)\r
 600       return null;\r
 601     int ichStart = ich;\r
 602     char ch = str.charAt(ichStart);\r
 603     if (ch != '\'' && ch != '"' && ch != '\1') {\r
 604       wasUnQuoted = true;\r
 605       while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t')\r
 606         ++ich;\r
 607       if (ich == ichStart + 1)\r
 608         if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))\r
 609           return nullString;\r
 610       String s = str.substring(ichStart, ich);\r
 611       return s;\r
 612     }\r
 613     wasUnQuoted = false;\r
 614     char chOpeningQuote = ch;\r
 615     boolean previousCharacterWasQuote = false;\r
 616     while (++ich < cch) {\r
 617       ch = str.charAt(ich);\r
 618       if (previousCharacterWasQuote && (ch == ' ' || ch == '\t'))\r
 619         break;\r
 620       previousCharacterWasQuote = (ch == chOpeningQuote);\r
 621     }\r
 622     if (ich == cch) {\r
 623       if (previousCharacterWasQuote) // close quote was last char of string\r
 624         return str.substring(ichStart + 1, ich - 1);\r
 625       // reached the end of the string without finding closing '\r
 626       return str.substring(ichStart, ich);\r
 627     }\r
 628     ++ich; // throw away the last white character\r
 629     return str.substring(ichStart + 1, ich - 2);\r
 630   }\r
 631 \r
 632   \r
 633 }