src/javajs/util/CifDataParser.java

   1 package javajs.util;
   2
   3 import java.io.BufferedReader;
   4
   5 import java.util.Hashtable;
   6
   7 import java.util.Map;
   8
   9 import javajs.api.GenericCifDataParser;
  10 import javajs.api.GenericLineReader;
  11
  12
  13
  14 public class CifDataParser implements GenericCifDataParser {
  15   /**
  16    *
  17    * A special tokenizer class for dealing with quoted strings in CIF files.
  18    *
  19    * Greek letters implemented in Jmol 13.3.9 and only for
  20    * titles and space groups. All other mark ups ignored.
  21    *
  22    *<p>
  23    * regarding the treatment of single quotes vs. primes in
  24    * cif file, PMR wrote:
  25    *</p>
  26    *<p>
  27    *   * There is a formal grammar for CIF
  28    * (see http://www.iucr.org/iucr-top/cif/index.html)
  29    * which confirms this. The textual explanation is
  30    *<p />
  31    *<p>
  32    * 14. Matching single or double quote characters (' or ") may
  33    * be used to bound a string representing a non-simple data value
  34    * provided the string does not extend over more than one line.
  35    *<p />
  36    *<p>
  37    * 15. Because data values are invariably separated from other
  38    * tokens in the file by white space, such a quote-delimited
  39    * character string may contain instances of the character used
  40    * to delimit the string provided they are not followed by white
  41    * space. For example, the data item
  42    *<code>
  43    *  _example  'a dog's life'
  44    *</code>
  45    * is legal; the data value is a dog's life.
  46    *</p>
  47    *<p>
  48    * [PMR - the terminating character(s) are quote+whitespace.
  49    * That would mean that:
  50    *<code>
  51    *  _example 'Jones' life'
  52    *</code>
  53    * would be an error
  54    *</p>
  55    *<p>
  56    * The CIF format was developed in that late 1980's under the aegis of the
  57    * International Union of Crystallography (I am a consultant to the COMCIFs
  58    * committee). It was ratified by the Union and there have been several
  59    * workshops. mmCIF is an extension of CIF which includes a relational
  60    * structure. The formal publications are:
  61    *</p>
  62    *<p>
  63    * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data
  64    * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.
  65    * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic
  66    * Information File (CIF): A New Standard Archive File for Crystallography",
  67    * Acta Cryst., A47, 655-685.
  68    * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed
  69    * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.
  70    *</p>
  71    */
  72   private GenericLineReader reader;
  73   private BufferedReader br;
  74
  75   private String line;
  76   private String str;
  77   private int ich;
  78   private int cch;
  79   private boolean wasUnQuoted;
  80   private String strPeeked;
  81   private int ichPeeked;
  82   private int fieldCount;
  83   private String[] loopData;
  84   private SB fileHeader = new SB();
  85   private boolean isHeader = true;
  86   private String nullString = "\0";
  87
  88   /**
  89    * Set the string value of what is returned for "." and "?"
  90    *
  91    * @param nullString null here returns "." and "?"; default is "\0"
  92    *
  93    */
  94   public void setNullValue(String nullString) {
  95     this.nullString  = nullString;
  96   }
  97
  98   /**
  99    * A global, static map that contains field information. The assumption is that
 100    * if we read a set of fields for, say, atom_site, once in a lifetime, then
 101    * that should be good forever. Those are static lists. Or should be....
 102    */
 103   private static Map<String, Integer> htFields = new Hashtable<String, Integer>();
 104
 105   ////////////////////////////////////////////////////////////////
 106   // special tokenizer class
 107   ////////////////////////////////////////////////////////////////
 108
 109   public CifDataParser() {
 110     // for reflection
 111   }
 112
 113   private String[] fields;
 114
 115   @Override
 116   public String getLoopData(int i) {
 117     return loopData[i];
 118   }
 119
 120   @Override
 121   public int getFieldCount() {
 122     return fieldCount;
 123   }
 124
 125   @Override
 126   public String getField(int i) {
 127     return fields[i];
 128   }
 129
 130   /**
 131    * A Chemical Information File data parser.
 132    *
 133    * Should be called immediately upon construction.
 134    *
 135    * Two options; one of reader or br should be null, or reader will be
 136    * ignored. Just simpler this way...
 137    *
 138    * @param reader  Anything that can deliver a line of text or null
 139    * @param br      A standard BufferedReader.
 140    *
 141    */
 142   @Override
 143   public CifDataParser set(GenericLineReader reader, BufferedReader br) {
 144     this.reader = reader;
 145     this.br = br;
 146     return this;
 147   }
 148
 149   /**
 150    *
 151    * @return commented-out section at the start of a CIF file.
 152    *
 153    */
 154   @Override
 155   public String getFileHeader() {
 156     return fileHeader.toString();
 157   }
 158
 159
 160   /**
 161    * Parses all CIF data for a reader defined in the constructor
 162    * into a standard Map structure and close the BufferedReader if
 163    * it exists.
 164    *
 165    * @return Hashtable of models Vector of Hashtable data
 166    */
 167   @Override
 168   public Map<String, Object> getAllCifData() {
 169     line = "";
 170     String key;
 171     Map<String, Object> data = null;
 172     Map<String, Object> allData = new Hashtable<String, Object>();
 173     Lst<Map<String, Object>> models = new  Lst<Map<String,Object>>();
 174     allData.put("models", models);
 175     try {
 176       while ((key = getNextToken()) != null) {
 177         if (key.startsWith("global_") || key.startsWith("data_")) {
 178           models.addLast(data = new Hashtable<String, Object>());
 179           data.put("name", key);
 180           continue;
 181         }
 182         if (key.startsWith("loop_")) {
 183           getAllCifLoopData(data);
 184           continue;
 185         }
 186         if (key.charAt(0) != '_') {
 187           System.out.println("CIF ERROR ? should be an underscore: " + key);
 188         } else {
 189           String value = getNextToken();
 190           if (value == null) {
 191             System.out.println("CIF ERROR ? end of file; data missing: " + key);
 192           } else {
 193             data.put(fixKey(key), value);
 194           }
 195         }
 196       }
 197     } catch (Exception e) {
 198       // ?
 199     }
 200     try {
 201       if (br != null)
 202         br.close();
 203     } catch (Exception e) {
 204       // ?
 205     }
 206     return allData;
 207   }
 208
 209   /**
 210    * create our own list of keywords and for each one create a list
 211    * of data associated with that keyword. For example, a list of all
 212    * x coordinates, then a list of all y coordinates, etc.
 213    *
 214    * @param data
 215    * @throws Exception
 216    */
 217   @SuppressWarnings("unchecked")
 218   private void getAllCifLoopData(Map<String, Object> data) throws Exception {
 219     String key;
 220     Lst<String> keyWords = new  Lst<String>();
 221     while ((key = peekToken()) != null && key.charAt(0) == '_') {
 222       key = fixKey(getTokenPeeked());
 223       keyWords.addLast(key);
 224       data.put(key, new  Lst<String>());
 225     }
 226     fieldCount = keyWords.size();
 227     if (fieldCount == 0)
 228       return;
 229     loopData = new String[fieldCount];
 230     while (getData())
 231       for (int i = 0; i < fieldCount; i++)
 232         ((Lst<String>)data.get(keyWords.get(i))).addLast(loopData[i]);
 233   }
 234
 235   @Override
 236   public String readLine() {
 237     try {
 238       line = (reader == null ? br.readLine() : reader.readNextLine());
 239       if (line == null)
 240         return null;
 241       if (isHeader) {
 242         if (line.startsWith("#"))
 243           fileHeader.append(line).appendC('\n');
 244         else
 245           isHeader = false;
 246       }
 247       return line;
 248     } catch (Exception e) {
 249       return null;
 250     }
 251   }
 252
 253   /**
 254    * The work horse; a general reader for loop data.
 255    * Fills loopData with fieldCount fields.
 256    *
 257    * @return false if EOF
 258    * @throws Exception
 259    */
 260   @Override
 261   public boolean getData() throws Exception {
 262     // line is already present, and we leave with the next line to parse
 263     for (int i = 0; i < fieldCount; ++i)
 264       if ((loopData[i] = getNextDataToken()) == null)
 265         return false;
 266     return (fieldCount > 0);
 267   }
 268
 269   /**
 270    *
 271    * Skips all associated loop data. (Skips to next control word.)
 272    *
 273    * @throws Exception
 274    */
 275   @Override
 276   public String skipLoop(boolean doReport) throws Exception {
 277     String str;
 278     SB ret = (doReport ? new SB() : null);
 279     int n = 0;
 280     while ((str = peekToken()) != null && str.charAt(0) == '_') {
 281       if (ret != null)
 282         ret.append(str).append("\n");
 283       getTokenPeeked();
 284       n++;
 285     }
 286     int m = 0;
 287     while ((str = getNextDataToken()) != null) {
 288       if (ret == null)
 289         continue;
 290       ret.append(str).append(" ");
 291       if ((++m % n) == 0)
 292         ret.append("\n");
 293     }
 294     return (ret == null ? null : ret.toString());
 295   }
 296
 297   /**
 298    *
 299    * @return the next token of any kind, or null
 300    * @throws Exception
 301    */
 302   @Override
 303   public String getNextToken() throws Exception {
 304     while (!strHasMoreTokens())
 305       if (setStringNextLine() == null)
 306         return null;
 307     return nextStrToken();
 308   }
 309
 310   /**
 311    *
 312    * first checks to see if the next token is an unquoted
 313    * control code, and if so, returns null
 314    *
 315    * @return next data token or null
 316    * @throws Exception
 317    */
 318   @Override
 319   public String getNextDataToken() throws Exception {
 320     String str = peekToken();
 321     if (str == null)
 322       return null;
 323     if (wasUnQuoted)
 324       if (str.charAt(0) == '_' || str.startsWith("loop_")
 325           || str.startsWith("data_")
 326           || str.startsWith("stop_")
 327           || str.startsWith("global_"))
 328         return null;
 329     return getTokenPeeked();
 330   }
 331
 332   /**
 333    * Just look at the next token. Saves it for retrieval
 334    * using getTokenPeeked()
 335    *
 336    * @return next token or null if EOF
 337    * @throws Exception
 338    */
 339   @Override
 340   public String peekToken() throws Exception {
 341     while (!strHasMoreTokens())
 342       if (setStringNextLine() == null)
 343         return null;
 344     int ich = this.ich;
 345     strPeeked = nextStrToken();
 346     ichPeeked= this.ich;
 347     this.ich = ich;
 348     return strPeeked;
 349   }
 350
 351   /**
 352    *
 353    * @return the token last acquired; may be null
 354    */
 355   @Override
 356   public String getTokenPeeked() {
 357     ich = ichPeeked;
 358     return strPeeked;
 359   }
 360
 361   /**
 362    * Used especially for data that might be multi-line data that
 363    * might have unwanted white space at start or end.
 364    *
 365    * @param str
 366    * @return str without any leading/trailing white space, and no '\n'
 367    */
 368   @Override
 369   public String fullTrim(String str) {
 370     int pt0 = -1;
 371     int pt1 = str.length();
 372     while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) {
 373     }
 374     while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) {
 375     }
 376     return str.substring(pt0, pt1 + 1);
 377   }
 378
 379   private final static String grABC =
 380       "ABX\u0394E\u03A6\u0393H"   // ABCDEFGH
 381       + "I_K\u039BMNO\u03A0"      // I_KLMNOP
 382       + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ
 383   private final static String grabc =
 384       "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh
 385       + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0"    // i_klmnop
 386       + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz
 387
 388   /**
 389    * Only translating the basic Greek set here, not all the other stuff. See
 390    * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup
 391    *
 392    * @param data
 393    * @return cleaned string
 394    */
 395   @Override
 396   public String toUnicode(String data) {
 397     int pt;
 398     try {
 399       while ((pt = data.indexOf('\\')) >= 0) {
 400         int c = data.charAt(pt + 1);
 401         String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64)
 402             : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_");
 403         data = data.substring(0, pt) + ch + data.substring(pt + 2);
 404       }
 405     } catch (Exception e) {
 406       // ignore
 407     }
 408
 409     return data;
 410   }
 411
 412   /**
 413    * Passing an array of field names, this method fills two arrays.
 414    * The first, fieldOf, identifies
 415    * It does this by first creating a map of names to their indices in fields[].
 416    *
 417    * Alternatively, if fields is null, then a private array is filled, in order,
 418    * with key data. This is used in cases such as matrices for which there are simply
 419    * too many possibilities to list, and the key name itself contains the x-y
 420    * information that we need.
 421    *
 422    */
 423    @Override
 424   public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception {
 425      int propertyCount = 0;
 426      if (fields == null) {
 427        // for reading full list of keys, as for matrices
 428        this.fields = new String[100];
 429      } else {
 430        if (!htFields.containsKey(fields[0]))
 431          for (int i = fields.length; --i >= 0;)
 432            htFields.put(fields[i], Integer.valueOf(i));
 433        for (int i = fields.length; --i >= 0;)
 434          fieldOf[i] = NONE;
 435        propertyCount = fields.length;
 436      }
 437      fieldCount = 0;
 438      while (true) {
 439        String str = peekToken();
 440        if (str == null) {
 441          // we are PREMATURELY done; reset
 442          fieldCount = 0;
 443          break;
 444        }
 445        // end of the loop is a new token starting with underscore
 446        if (str.charAt(0) != '_')
 447          break;
 448
 449        int pt = fieldCount++;
 450        str = fixKey(getTokenPeeked());
 451        if (fields == null) {
 452          // just make a linear model, saving the list
 453          this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str;
 454          continue;
 455        }
 456        Integer iField = htFields.get(str);
 457        int i = (iField == null ? NONE : iField.intValue());
 458        if ((propertyOf[pt] = i) != NONE)
 459          fieldOf[i] = pt;
 460      }
 461      if (fieldCount > 0)
 462        loopData = new String[fieldCount];
 463      return propertyCount;
 464   }
 465
 466   @Override
 467   public String fixKey(String key) {
 468     // PRELIMINARY -- BilBao _magnetic
 469     // PRELIMINARY -- Jana2006
 470     return (PT.rep(
 471         key.startsWith("_magnetic") ? key.substring(9)
 472             : key.startsWith("_jana") ? key.substring(5)
 473             : key, ".", "_").toLowerCase());
 474   }
 475
 476   //////////////////// private methods ////////////////////
 477
 478
 479   /**
 480    * sets a string to be parsed from the beginning
 481    *
 482    * @param str
 483    */
 484   private void setString(String str) {
 485     this.str = line = str;
 486     cch = (str == null ? 0 : str.length());
 487     ich = 0;
 488   }
 489
 490   /*
 491    * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
 492    *
 493    * 17. The special sequence of end-of-line followed
 494    * immediately by a semicolon in column one (denoted "<eol>;")
 495    * may also be used as a delimiter at the beginning and end
 496    * of a character string comprising a data value. The complete
 497    * bounded string is called a text field, and may be used to
 498    * convey multi-line values. The end-of-line associated with
 499    * the closing semicolon does not form part of the data value.
 500    * Within a multi-line text field, leading white space within
 501    * text lines must be retained as part of the data value; trailing
 502    * white space on a line may however be elided.
 503    *
 504    * 18. A text field delimited by the <eol>; digraph may not
 505    * include a semicolon at the start of a line of text as
 506    * part of its value.
 507    *
 508    * 20. For example, the data value foo may be expressed
 509    * equivalently as an unquoted string foo, as a quoted
 510    * string 'foo' or as a text field
 511    *
 512    *;foo
 513    *;
 514    *
 515    * By contrast the value of the text field
 516    *
 517    *; foo
 518    *  bar
 519    *;
 520    *
 521    * is  foo<eol>  bar (where <eol> represents an end-of-line);
 522    * the embedded space characters are significant.
 523    *
 524    *
 525    * I (BH) note, however, that we sometimes have:
 526    *
 527    * _some_name
 528    * ;
 529    * the name here
 530    * ;
 531    *
 532    * so this should actually be
 533    *
 534    * ;the name here
 535    * ;
 536    *
 537    * for this, we use fullTrim();
 538    *
 539    */
 540
 541   /**
 542    *
 543    * sets the string for parsing to be from the next line
 544    * when the token buffer is empty, and if ';' is at the
 545    * beginning of that line, extends the string to include
 546    * that full multiline string. Uses \1 to indicate that
 547    * this is a special quotation.
 548    *
 549    * @return  the next line or null if EOF
 550    * @throws Exception
 551    */
 552   private String setStringNextLine() throws Exception {
 553     setString(readLine());
 554     if (line == null || line.length() == 0)
 555       return line;
 556     if (line.charAt(0) != ';') {
 557       if (str.startsWith("###non-st#"))
 558         ich = 10;
 559       return line;
 560     }
 561     ich = 1;
 562     String str = '\1' + line.substring(1) + '\n';
 563     while (readLine() != null) {
 564       if (line.startsWith(";")) {
 565         // remove trailing <eol> only, and attach rest of next line
 566         str = str.substring(0, str.length() - 1)
 567           + '\1' + line.substring(1);
 568         break;
 569       }
 570       str += line + '\n';
 571     }
 572     setString(str);
 573     return str;
 574   }
 575
 576   /**
 577    * @return TRUE if there are more tokens in the line buffer
 578    *
 579    */
 580   private boolean strHasMoreTokens() {
 581     if (str == null)
 582       return false;
 583     char ch = '#';
 584     while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t'))
 585       ++ich;
 586     return (ich < cch && ch != '#');
 587   }
 588
 589   /**
 590    * assume that hasMoreTokens() has been called and that
 591    * ich is pointing at a non-white character. Also sets
 592    * boolean wasUnQuoted, because we need to know if we should
 593    * be checking for a control keyword. 'loop_' is different from just
 594    * loop_ without the quotes.
 595    *
 596    * @return null if no more tokens, "\0" if '.' or '?', or next token
 597    */
 598   private String nextStrToken() {
 599     if (ich == cch)
 600       return null;
 601     int ichStart = ich;
 602     char ch = str.charAt(ichStart);
 603     if (ch != '\'' && ch != '"' && ch != '\1') {
 604       wasUnQuoted = true;
 605       while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t')
 606         ++ich;
 607       if (ich == ichStart + 1)
 608         if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))
 609           return nullString;
 610       String s = str.substring(ichStart, ich);
 611       return s;
 612     }
 613     wasUnQuoted = false;
 614     char chOpeningQuote = ch;
 615     boolean previousCharacterWasQuote = false;
 616     while (++ich < cch) {
 617       ch = str.charAt(ich);
 618       if (previousCharacterWasQuote && (ch == ' ' || ch == '\t'))
 619         break;
 620       previousCharacterWasQuote = (ch == chOpeningQuote);
 621     }
 622     if (ich == cch) {
 623       if (previousCharacterWasQuote) // close quote was last char of string
 624         return str.substring(ichStart + 1, ich - 1);
 625       // reached the end of the string without finding closing '
 626       return str.substring(ichStart, ich);
 627     }
 628     ++ich; // throw away the last white character
 629     return str.substring(ichStart + 1, ich - 2);
 630   }
 631
 632
 633 }