3 import java.io.BufferedReader;
5 import java.util.Hashtable;
9 import javajs.api.GenericCifDataParser;
10 import javajs.api.GenericLineReader;
14 public class CifDataParser implements GenericCifDataParser {
17 * A special tokenizer class for dealing with quoted strings in CIF files.
19 * Greek letters implemented in Jmol 13.3.9 and only for
20 * titles and space groups. All other markup is ignored.
23 * regarding the treatment of single quotes vs. primes in
24 * cif file, PMR wrote:
27 * * There is a formal grammar for CIF
28 * (see http://www.iucr.org/iucr-top/cif/index.html)
29 * which confirms this. The textual explanation is
32 * 14. Matching single or double quote characters (' or ") may
33 * be used to bound a string representing a non-simple data value
34 * provided the string does not extend over more than one line.
37 * 15. Because data values are invariably separated from other
38 * tokens in the file by white space, such a quote-delimited
39 * character string may contain instances of the character used
40 * to delimit the string provided they are not followed by white
41 * space. For example, the data item
43 * _example 'a dog's life'
45 * is legal; the data value is a dog's life.
48 * [PMR - the terminating character(s) are quote+whitespace.
49 * That would mean that:
51 * _example 'Jones' life'
56 * The CIF format was developed in the late 1980s under the aegis of the
57 * International Union of Crystallography (I am a consultant to the COMCIFs
58 * committee). It was ratified by the Union and there have been several
59 * workshops. mmCIF is an extension of CIF which includes a relational
60 * structure. The formal publications are:
63 * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data
64 * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.
65 * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic
66 * Information File (CIF): A New Standard Archive File for Crystallography",
67 * Acta Cryst., A47, 655-685.
68 * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed
69 * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.
72 private GenericLineReader reader;
73 private BufferedReader br;
79 private boolean wasUnQuoted;
80 private String strPeeked;
81 private int ichPeeked;
82 private int fieldCount;
83 private String[] loopData;
84 private SB fileHeader = new SB();
85 private boolean isHeader = true;
86 private String nullString = "\0";
89 * Set the string value of what is returned for "." and "?"
91 * @param nullString null here returns "." and "?"; default is "\0"
94 public void setNullValue(String nullString) {
95 this.nullString = nullString;
99 * A global, static map that contains field information. The assumption is that
100 * if we read a set of fields for, say, atom_site, once in a lifetime, then
101 * that should be good forever. Those are static lists. Or should be....
103 private static Map<String, Integer> htFields = new Hashtable<String, Integer>();
105 ////////////////////////////////////////////////////////////////
106 // special tokenizer class
107 ////////////////////////////////////////////////////////////////
109 public CifDataParser() {
113 private String[] fields;
116 public String getLoopData(int i) {
121 public int getFieldCount() {
126 public String getField(int i) {
131 * A Crystallographic Information File (CIF) data parser.
133 * Should be called immediately upon construction.
135 * Two options; one of reader or br should be null. If both are given,
136 * readLine() uses reader and br is ignored. Just simpler this way...
138 * @param reader Anything that can deliver a line of text or null
139 * @param br A standard BufferedReader.
143 public CifDataParser set(GenericLineReader reader, BufferedReader br) {
144 this.reader = reader;
151 * @return commented-out section at the start of a CIF file.
155 public String getFileHeader() {
156 return fileHeader.toString();
161 * Parses all CIF data for the reader or BufferedReader supplied via set()
162 * into a standard Map structure and close the BufferedReader if
165 * @return Hashtable of models Vector of Hashtable data
168 public Map<String, Object> getAllCifData() {
171 Map<String, Object> data = null;
172 Map<String, Object> allData = new Hashtable<String, Object>();
173 Lst<Map<String, Object>> models = new Lst<Map<String,Object>>();
174 allData.put("models", models);
176 while ((key = getNextToken()) != null) {
177 if (key.startsWith("global_") || key.startsWith("data_")) {
178 models.addLast(data = new Hashtable<String, Object>());
179 data.put("name", key);
182 if (key.startsWith("loop_")) {
183 getAllCifLoopData(data);
186 if (key.charAt(0) != '_') {
187 System.out.println("CIF ERROR ? should be an underscore: " + key);
189 String value = getNextToken();
191 System.out.println("CIF ERROR ? end of file; data missing: " + key);
193 data.put(fixKey(key), value);
197 } catch (Exception e) {
203 } catch (Exception e) {
210 * create our own list of keywords and for each one create a list
211 * of data associated with that keyword. For example, a list of all
212 * x coordinates, then a list of all y coordinates, etc.
217 @SuppressWarnings("unchecked")
218 private void getAllCifLoopData(Map<String, Object> data) throws Exception {
220 Lst<String> keyWords = new Lst<String>();
221 while ((key = peekToken()) != null && key.charAt(0) == '_') {
222 key = fixKey(getTokenPeeked());
223 keyWords.addLast(key);
224 data.put(key, new Lst<String>());
226 fieldCount = keyWords.size();
229 loopData = new String[fieldCount];
231 for (int i = 0; i < fieldCount; i++)
232 ((Lst<String>)data.get(keyWords.get(i))).addLast(loopData[i]);
236 public String readLine() {
238 line = (reader == null ? br.readLine() : reader.readNextLine());
242 if (line.startsWith("#"))
243 fileHeader.append(line).appendC('\n');
248 } catch (Exception e) {
254 * The work horse; a general reader for loop data.
255 * Fills loopData with fieldCount fields.
257 * @return false if EOF
261 public boolean getData() throws Exception {
262 // line is already present, and we leave with the next line to parse
263 for (int i = 0; i < fieldCount; ++i)
264 if ((loopData[i] = getNextDataToken()) == null)
266 return (fieldCount > 0);
271 * Skips all associated loop data. (Skips to next control word.)
276 public String skipLoop(boolean doReport) throws Exception {
278 SB ret = (doReport ? new SB() : null);
280 while ((str = peekToken()) != null && str.charAt(0) == '_') {
282 ret.append(str).append("\n");
287 while ((str = getNextDataToken()) != null) {
290 ret.append(str).append(" ");
294 return (ret == null ? null : ret.toString());
299 * @return the next token of any kind, or null
303 public String getNextToken() throws Exception {
304 while (!strHasMoreTokens())
305 if (setStringNextLine() == null)
307 return nextStrToken();
312 * first checks to see if the next token is an unquoted
313 * control code, and if so, returns null
315 * @return next data token or null
319 public String getNextDataToken() throws Exception {
320 String str = peekToken();
324 if (str.charAt(0) == '_' || str.startsWith("loop_")
325 || str.startsWith("data_")
326 || str.startsWith("stop_")
327 || str.startsWith("global_"))
329 return getTokenPeeked();
333 * Just look at the next token. Saves it for retrieval
334 * using getTokenPeeked()
336 * @return next token or null if EOF
340 public String peekToken() throws Exception {
341 while (!strHasMoreTokens())
342 if (setStringNextLine() == null)
345 strPeeked = nextStrToken();
353 * @return the token last acquired; may be null
356 public String getTokenPeeked() {
362 * Used especially for data that might be multi-line data that
363 * might have unwanted white space at start or end.
366 * @return str without any leading/trailing white space, and no '\n'
369 public String fullTrim(String str) {
371 int pt1 = str.length();
372 while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) {
374 while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) {
376 return str.substring(pt0, pt1 + 1);
379 private final static String grABC =
380 "ABX\u0394E\u03A6\u0393H" // ABCDEFGH
381 + "I_K\u039BMNO\u03A0" // I_KLMNOP
382 + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ
383 private final static String grabc =
384 "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh
385 + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop
386 + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz
389 * Only translating the basic Greek set here, not all the other stuff. See
390 * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup
393 * @return cleaned string
396 public String toUnicode(String data) {
399 while ((pt = data.indexOf('\\')) >= 0) {
400 int c = data.charAt(pt + 1);
401 String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64)
402 : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_");
403 data = data.substring(0, pt) + ch + data.substring(pt + 2);
405 } catch (Exception e) {
413 * Passing an array of field names, this method fills two arrays.
414 * The first, fieldOf, identifies
415 * It does this by first creating a map of names to their indices in fields[].
417 * Alternatively, if fields is null, then a private array is filled, in order,
418 * with key data. This is used in cases such as matrices for which there are simply
419 * too many possibilities to list, and the key name itself contains the x-y
420 * information that we need.
424 public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception {
425 int propertyCount = 0;
426 if (fields == null) {
427 // for reading full list of keys, as for matrices
428 this.fields = new String[100];
430 if (!htFields.containsKey(fields[0]))
431 for (int i = fields.length; --i >= 0;)
432 htFields.put(fields[i], Integer.valueOf(i));
433 for (int i = fields.length; --i >= 0;)
435 propertyCount = fields.length;
439 String str = peekToken();
441 // we are PREMATURELY done; reset
445 // end of the loop is a new token starting with underscore
446 if (str.charAt(0) != '_')
449 int pt = fieldCount++;
450 str = fixKey(getTokenPeeked());
451 if (fields == null) {
452 // just make a linear model, saving the list
453 this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str;
456 Integer iField = htFields.get(str);
457 int i = (iField == null ? NONE : iField.intValue());
458 if ((propertyOf[pt] = i) != NONE)
462 loopData = new String[fieldCount];
463 return propertyCount;
467 public String fixKey(String key) {
468 // PRELIMINARY -- Bilbao _magnetic
469 // PRELIMINARY -- Jana2006
471 key.startsWith("_magnetic") ? key.substring(9)
472 : key.startsWith("_jana") ? key.substring(5)
473 : key, ".", "_").toLowerCase());
476 //////////////////// private methods ////////////////////
480 * sets a string to be parsed from the beginning
484 private void setString(String str) {
485 this.str = line = str;
486 cch = (str == null ? 0 : str.length());
491 * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
493 * 17. The special sequence of end-of-line followed
494 * immediately by a semicolon in column one (denoted "<eol>;")
495 * may also be used as a delimiter at the beginning and end
496 * of a character string comprising a data value. The complete
497 * bounded string is called a text field, and may be used to
498 * convey multi-line values. The end-of-line associated with
499 * the closing semicolon does not form part of the data value.
500 * Within a multi-line text field, leading white space within
501 * text lines must be retained as part of the data value; trailing
502 * white space on a line may however be elided.
504 * 18. A text field delimited by the <eol>; digraph may not
505 * include a semicolon at the start of a line of text as
508 * 20. For example, the data value foo may be expressed
509 * equivalently as an unquoted string foo, as a quoted
510 * string 'foo' or as a text field
515 * By contrast the value of the text field
521 * is foo<eol> bar (where <eol> represents an end-of-line);
522 * the embedded space characters are significant.
525 * I (BH) note, however, that we sometimes have:
532 * so this should actually be
537 * for this, we use fullTrim();
543 * sets the string for parsing to be from the next line
544 * when the token buffer is empty, and if ';' is at the
545 * beginning of that line, extends the string to include
546 * that full multiline string. Uses \1 to indicate that
547 * this is a special quotation.
549 * @return the next line or null if EOF
552 private String setStringNextLine() throws Exception {
553 setString(readLine());
554 if (line == null || line.length() == 0)
556 if (line.charAt(0) != ';') {
557 if (str.startsWith("###non-st#"))
562 String str = '\1' + line.substring(1) + '\n';
563 while (readLine() != null) {
564 if (line.startsWith(";")) {
565 // remove trailing <eol> only, and attach rest of next line
566 str = str.substring(0, str.length() - 1)
567 + '\1' + line.substring(1);
577 * @return TRUE if there are more tokens in the line buffer
580 private boolean strHasMoreTokens() {
584 while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t'))
586 return (ich < cch && ch != '#');
590 * assume that strHasMoreTokens() has been called and that
591 * ich is pointing at a non-white character. Also sets
592 * boolean wasUnQuoted, because we need to know if we should
593 * be checking for a control keyword. 'loop_' is different from just
594 * loop_ without the quotes.
596 * @return null if no more tokens, "\0" if '.' or '?', or next token
598 private String nextStrToken() {
602 char ch = str.charAt(ichStart);
603 if (ch != '\'' && ch != '"' && ch != '\1') {
605 while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t')
607 if (ich == ichStart + 1)
608 if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))
610 String s = str.substring(ichStart, ich);
614 char chOpeningQuote = ch;
615 boolean previousCharacterWasQuote = false;
616 while (++ich < cch) {
617 ch = str.charAt(ich);
618 if (previousCharacterWasQuote && (ch == ' ' || ch == '\t'))
620 previousCharacterWasQuote = (ch == chOpeningQuote);
623 if (previousCharacterWasQuote) // close quote was last char of string
624 return str.substring(ichStart + 1, ich - 1);
625 // reached the end of the string without finding closing '
626 return str.substring(ichStart, ich);
628 ++ich; // throw away the last white character
629 return str.substring(ichStart + 1, ich - 2);