3 import java.io.BufferedReader;
5 import java.util.Hashtable;
9 import javajs.api.GenericCifDataParser;
10 import javajs.api.GenericLineReader;
13 // BH 11/21/16 -- adds support for array grouping [...] - used in 2016-format magCIF files
17 * A CIF 1.0 tokenizer class for dealing with quoted strings in CIF files.
19 * Subclassed by org.jmol.adapters.readers.cif.Cif2DataParser
21 * Greek letters implemented in Jmol 13.3.9 and only for
22 * titles and space groups. All other mark ups ignored.
25 * regarding the treatment of single quotes vs. primes in
26 * cif file, PMR wrote:
29 * * There is a formal grammar for CIF
30 * (see http://www.iucr.org/iucr-top/cif/index.html)
31 * which confirms this. The textual explanation is
34 * 14. Matching single or double quote characters (' or ") may
35 * be used to bound a string representing a non-simple data value
36 * provided the string does not extend over more than one line.
39 * 15. Because data values are invariably separated from other
40 * tokens in the file by white space, such a quote-delimited
41 * character string may contain instances of the character used
42 * to delimit the string provided they are not followed by white
43 * space. For example, the data item
45 * _example 'a dog's life'
47 * is legal; the data value is a dog's life.
50 * [PMR - the terminating character(s) are quote+whitespace.
51 * That would mean that:
53 * _example 'Jones' life'
58 * The CIF format was developed in the late 1980s under the aegis of the
59 * International Union of Crystallography (I am a consultant to the COMCIFs
60 * committee). It was ratified by the Union and there have been several
61 * workshops. mmCIF is an extension of CIF which includes a relational
62 * structure. The formal publications are:
65 * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data
66 * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.
67 * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic
68 * Information File (CIF): A New Standard Archive File for Crystallography",
69 * Acta Cryst., A47, 655-685.
70 * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed
71 * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.
75 public class CifDataParser implements GenericCifDataParser {
77 protected int getVersion() {
82 * The maximum number of columns (data keys) passed to the parser or found in the file
83 * for a given loop_ or category.subkey listing.
86 public static final int KEY_MAX = 100;
88 private GenericLineReader reader;
89 private BufferedReader br;
92 * from buffered reader
94 protected String line;
97 * working string (buffer)
100 protected String str;
103 * pointer to current character on str
114 * whether we are processing an unquoted value or key
116 protected boolean wasUnquoted;
119 * optional token terminator; in CIF 2.0 could be } or ]
121 protected char cterm = '\0';
124 * string to return for CIF data value . and ?
126 protected String nullString = "\0";
129 * A flag to create and return Java objects, not strings.
130 * Used only by Jmol scripting x = getProperty("cifInfo", filename).
132 protected boolean asObject;
136 * debugging flag passed from reader; unused
139 protected boolean debugging;
143 * private processing fields
146 private Object strPeeked;
147 private int ichPeeked;
148 private int columnCount;
149 private String[] columnNames;
150 private Object[] columnData = new Object[KEY_MAX];
151 private boolean isLoop;
152 private boolean haveData;
155 * comments at the top of a file, including #\#CIF_2.0, for example
157 private SB fileHeader = new SB();
158 private boolean isHeader = true;
162 * Set the string value of what is returned for "." and "?"
164 * @param nullString null here returns "." and "?"; default is "\0"
167 public void setNullValue(String nullString) {
168 this.nullString = nullString;
172 * A global, static map that contains field information. The assumption is that
173 * if we read a set of fields for, say, atom_site, once in a lifetime, then
174 * that should be good forever. Those are static lists. Or should be....
176 private static Map<String, Integer> htFields = new Hashtable<String, Integer>();
178 ////////////////////////////////////////////////////////////////
179 // special tokenizer class
180 ////////////////////////////////////////////////////////////////
182 public CifDataParser() {
187 public Object getColumnData(int i) {
188 return columnData[i];
192 public int getColumnCount() {
197 public String getColumnName(int i) {
198 return columnNames[i];
202 * A Crystallographic Information File (CIF) data parser.
204 * set() should be called immediately upon construction.
206 * Two options; one of reader or br should be null, or reader will be
207 * ignored. Just simpler this way...
209 * @param reader Anything that can deliver a line of text or null
210 * @param br A standard BufferedReader.
215 public CifDataParser set(GenericLineReader reader, BufferedReader br, boolean debugging) {
216 this.reader = reader;
218 this.debugging = debugging;
225 * @return commented-out section at the start of a CIF file.
229 public String getFileHeader() {
230 return fileHeader.toString();
235 * Parses all CIF data for the reader supplied via set()
236 * into a standard Map structure and close the BufferedReader if
239 * @return Hashtable of models Vector of Hashtable data
242 public Map<String, Object> getAllCifData() {
245 Map<String, Object> data = null, data0 = null;
246 Map<String, Object> allData = new Hashtable<String, Object>();
247 Lst<Map<String, Object>> models = new Lst<Map<String,Object>>();
248 allData.put("models", models);
249 asObject = (getVersion() >= 2);
251 Lst<Map<String, Object>> saveFrames = new Lst<Map<String, Object>>();
253 while ((key = getNextToken()) != null) {
254 if (key.startsWith("global_") || key.startsWith("data_")) {
255 models.addLast(data0 = data = new Hashtable<String, Object>());
256 data.put("name", key);
259 if (key.startsWith("loop_")) {
260 getAllCifLoopData(data);
263 if (key.startsWith("save_")) {
264 if (key.equals("save_")) {
265 int n = saveFrames.size();
267 System.out.println("CIF ERROR ? save_ without corresponding save_xxxx");
270 data = saveFrames.removeItemAt(n - 1);
273 saveFrames.addLast(data);
274 Map<String, Object> d = data;
275 data = new Hashtable<String, Object>();
280 if (key.charAt(0) != '_') {
281 System.out.println("CIF ERROR ? should be an underscore: " + key);
283 Object value = (asObject ? getNextTokenObject() : getNextToken());
285 System.out.println("CIF ERROR ? end of file; data missing: " + key);
287 data.put(fixKey(key), value);
291 } catch (Exception e) {
298 } catch (Exception e) {
306 * create our own list of keywords and for each one create a list
307 * of data associated with that keyword. For example, a list of all
308 * x coordinates, then a list of all y coordinates, etc.
313 @SuppressWarnings("unchecked")
314 private void getAllCifLoopData(Map<String, Object> data) throws Exception {
316 Lst<String> keyWords = new Lst<String>();
318 while ((o = peekToken()) != null && o instanceof String && ((String) o).charAt(0) == '_') {
319 key = fixKey((String) getTokenPeeked());
320 keyWords.addLast(key);
321 data.put(key, new Lst<String>());
323 columnCount = keyWords.size();
324 if (columnCount == 0)
328 for (int i = 0; i < columnCount; i++)
329 ((Lst<Object>)data.get(keyWords.get(i))).addLast(columnData[i]);
334 public String readLine() {
336 line = (reader == null ? br.readLine() : reader.readNextLine());
340 if (line.startsWith("#"))
341 fileHeader.append(line).appendC('\n');
346 } catch (Exception e) {
352 * The work horse; a general reader for loop data. Fills columnData with
355 * @return false if EOF
359 public boolean getData() throws Exception {
360 // line is already present, and we leave with the next line to parse
362 for (int i = 0; i < columnCount; ++i)
363 if ((columnData[i] = getNextDataToken()) == null)
365 } else if (haveData) {
370 return (columnCount > 0);
375 * Skips all associated loop data. (Skips to next control word.)
380 public String skipLoop(boolean doReport) throws Exception {
382 SB ret = (doReport ? new SB() : null);
384 while ((str = (String) peekToken()) != null && str.charAt(0) == '_') {
386 ret.append(str).append("\n");
391 n = columnCount; // end-of-label-section skip
393 while ((str = (String) getNextDataToken()) != null) {
396 ret.append(str).append(" ");
400 return (ret == null ? null : ret.toString());
404 * Get a token as a String value (for the reader)
406 * @return the next token of any kind, or null
410 public String getNextToken() throws Exception {
412 return (String) getNextTokenProtected();
416 * Get the token as a Java Object
418 * @return the next token of any kind, or null
421 public Object getNextTokenObject() throws Exception {
423 return getNextTokenProtected();
428 * @return String from buffer.
431 protected Object getNextTokenProtected() throws Exception {
432 return (getNextLine() ? nextStrToken() : null);
437 * first checks to see if the next token is an unquoted
438 * control code, and if so, returns null
440 * @return next data token or null
444 public Object getNextDataToken() throws Exception {
445 Object o = peekToken();
448 if (wasUnquoted && o instanceof String) {
449 String str = (String) o;
450 if (str.charAt(0) == '_' || str.startsWith("loop_")
451 || str.startsWith("data_")
452 || str.startsWith("save_")
453 || str.startsWith("stop_")
454 || str.startsWith("global_"))
457 return getTokenPeeked();
461 * Just look at the next token. Saves it for retrieval
462 * using getTokenPeeked()
464 * @return next token or null if EOF
468 public Object peekToken() throws Exception {
472 strPeeked = nextStrToken();
479 * grab a new line if necessary and prepare it
480 * if it starts with ";"
482 * @return true when a token is ready in the buffer; false if no further line could be prepared
485 private boolean getNextLine() throws Exception {
486 while (!strHasMoreTokens())
487 if (prepareNextLine() == null)
494 * @return the token last acquired; may be null
497 public Object getTokenPeeked() {
503 * Used especially for data that might be multi-line data that
504 * might have unwanted white space at start or end.
507 * @return str without any leading/trailing white space, and no '\n'
510 public String fullTrim(String str) {
512 int pt1 = str.length();
513 while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) {
515 while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) {
517 return str.substring(pt0, pt1 + 1);
520 private final static String grABC =
521 "ABX\u0394E\u03A6\u0393H" // ABCDEFGH
522 + "I_K\u039BMNO\u03A0" // I_KLMNOP
523 + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ
524 private final static String grabc =
525 "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh
526 + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop
527 + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz
530 * Only translating the basic Greek set here, not all the other stuff. See
531 * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup
534 * @return cleaned string
537 public String toUnicode(String data) {
540 while ((pt = data.indexOf('\\')) >= 0) {
541 int c = data.charAt(pt + 1);
542 String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64)
543 : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_");
544 data = data.substring(0, pt) + ch + data.substring(pt + 2);
546 } catch (Exception e) {
554 * Process a data block, with or without a loop_.
556 * Passed an array of field names, this method fills two int[] arrays. The
557 * first, key2col, maps desired key values to actual order of appearance
558 * (column number) in the file; the second, col2key, is a reverse look-up for
559 * that, mapping column numbers to desired field indices.
561 * When called within a loop_ context, this.columnData will be created but not filled.
563 * Alternatively, if fields is null, then this.columnNames is
564 * filled, in order, with key data, and both key2col and col2key will be
565 * simply 0,1,2,... This array is used in cases such as matrices for which
566 * there are simply too many possibilities to list, and the key name itself
567 * contains information that we need.
569 * When not a loop_ context, keys are expected to be in the mmCIF form
570 * category.subkey and will be unique within a data block (see
571 * http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html).
572 * Keys and data will be read for all data in the same category, filling this.columnData.
575 * In this way, the calling class does not need to enumerate all possible
576 * category names, but instead can focus on just those of interest.
580 * list of normalized field names, such as
581 * "_pdbx_struct_assembly_gen_assembly_id" (with "_" instead of ".")
583 * null to indicate a loop_ construct, otherwise the initial category.subkey
586 * when not loop_ the initial data read, otherwise ignored
588 * map of desired keys to actual columns
590 * map of actual columns to desired keys
594 public void parseDataBlockParameters(String[] fields, String key,
595 String data, int[] key2col, int[] col2key) throws Exception {
596 isLoop = (key == null);
599 if (fields == null) {
600 // for reading full list of keys, as for matrices
601 columnNames = new String[KEY_MAX];
603 if (!htFields.containsKey(fields[0]))
604 for (int i = fields.length; --i >= 0;)
605 htFields.put(fields[i], Integer.valueOf(i));
606 for (int i = fields.length; --i >= 0;)
615 // we are PREMATURELY done; reset
619 // end of the loop is a new token not starting with underscore
620 if (!(o instanceof String) || ((String) o).charAt(0) != '_')
624 s = fixKey((String) getTokenPeeked());
625 if (fields == null) {
626 // just make a linear model, saving the list
627 columnNames[col2key[pt] = key2col[pt] = pt] = s;
630 Integer iField = htFields.get(s);
631 i = (iField == null ? NONE : iField.intValue());
632 if ((col2key[pt] = i) != NONE)
636 pt = key.indexOf(".");
637 String str0 = (pt < 0 ? key : key.substring(0, pt + 1));
639 // end of the loop is a new token not starting with underscore
642 key = (String) getTokenPeeked();
643 data = getNextToken();
645 Integer iField = htFields.get(fixKey(key));
646 i = (iField == null ? NONE : iField.intValue());
647 if ((col2key[pt] = i) != NONE)
648 columnData[key2col[i] = pt] = data;
649 if ((o = peekToken()) == null || !(o instanceof String) || !((String) o).startsWith(str0))
653 haveData = (columnCount > 0);
658 public String fixKey(String key) {
659 // PRELIMINARY -- BilBao _magnetic
660 // PRELIMINARY -- Jana2006
662 key.startsWith("_magnetic") ? key.substring(9)
663 : key.startsWith("_jana") ? key.substring(5)
664 : key).replace('.', '_').toLowerCase();
667 //////////////////// private methods ////////////////////
671 * sets global str and line to be parsed from the beginning
673 * \1 .... \1 indicates an embedded fully escaped data object
675 * @param str new data string
678 protected String setString(String str) {
679 this.str = line = str;
680 cch = (str == null ? 0 : str.length());
686 * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
688 * 17. The special sequence of end-of-line followed
689 * immediately by a semicolon in column one (denoted "<eol>;")
690 * may also be used as a delimiter at the beginning and end
691 * of a character string comprising a data value. The complete
692 * bounded string is called a text field, and may be used to
693 * convey multi-line values. The end-of-line associated with
694 * the closing semicolon does not form part of the data value.
695 * Within a multi-line text field, leading white space within
696 * text lines must be retained as part of the data value; trailing
697 * white space on a line may however be elided.
699 * 18. A text field delimited by the <eol>; digraph may not
700 * include a semicolon at the start of a line of text as
703 * 20. For example, the data value foo may be expressed
704 * equivalently as an unquoted string foo, as a quoted
705 * string 'foo' or as a text field
710 * By contrast the value of the text field
716 * is foo<eol> bar (where <eol> represents an end-of-line);
717 * the embedded space characters are significant.
720 * I (BH) note, however, that we sometimes have:
727 * so this should actually be
732 * for this, we use fullTrim();
738 * sets the string for parsing to be from the next line
739 * when the token buffer is empty, and if ';' is at the
740 * beginning of that line, extends the string to include
741 * that full multiline string. Uses \1 to indicate that
742 * this is a special quotation.
746 * @return the next line or null if EOF
749 protected String prepareNextLine() throws Exception {
750 setString(readLine());
751 if (line == null || line.length() == 0)
753 if (line.charAt(0) == ';')
754 return preprocessString();
755 if (str.startsWith("###non-st#"))
761 * Preprocess the string on a line starting with a semicolon
762 * to produce a string with a \1 ... \1 segment
763 * that will be picked up in the next round
765 * @return escaped part with attached extra data
768 protected String preprocessString() throws Exception {
769 return setString(preprocessSemiString());
773 * Encapsulate a multi-line ; .... ; string with \1 ... \1
775 * CIF 1.0 and CIF 2.0
777 * @return encapsulated string
780 protected String preprocessSemiString() throws Exception {
782 String str = '\1' + line.substring(1) + '\n';
783 while (readLine() != null) {
784 if (line.startsWith(";")) {
785 // remove trailing <eol> only, and attach rest of next line
786 str = str.substring(0, str.length() - 1)
787 + '\1' + line.substring(1);
796 * @return TRUE if there are more tokens in the line buffer
799 private boolean strHasMoreTokens() {
803 while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t'))
805 return (ich < cch && ch != '#');
809 * assume that strHasMoreTokens() has been called and that ich is pointing at a
810 * non-white character. Also sets boolean wasUnquoted, because we need to know
811 * if we should be checking for a control keyword. 'loop_' is different from
812 * just loop_ without the quotes.
814 * @return null if no more tokens, "\0" if '.' or '?', or next token
816 private Object nextStrToken() {
819 char ch = str.charAt(ich);
822 return getQuotedStringOrObject(ch);
826 while (ich < cch && !isTerminator(ch = str.charAt(ich)))
828 if (ich == ichStart + 1)
829 if (nullString != null
830 && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))
832 String s = str.substring(ichStart, ich);
837 * In CIF 2.0, this method turns a String into an Integer or Float
838 * In CIF 1.0 (here) just return the unchanged value.
839 * @param s unquoted string
840 * @return unchanged value
842 protected Object unquoted(String s) {
847 * The token terminator is space or tab in CIF 1.0,
848 * but it can be quoted strings in CIF 2.0.
851 * @return true if this character is a terminator
853 protected boolean isTerminator(char c) {
854 return c == ' ' || c == '\t' || c == cterm ;
858 * CIF 1.0 only; we handle various quote types here
860 * @return true if this character is a (starting) quote
862 protected boolean isQuote(char ch) {
876 * @param ch current character being pointed to
877 * @return a String data object
879 protected Object getQuotedStringOrObject(char ch) {
881 char chClosingQuote = ch;
882 boolean wasQuote = false;
883 while (++ich < cch) {
884 ch = str.charAt(ich);
885 // CIF 1.0 rules require that the closing ' or "" be followed by space or tab or EOL
886 if (wasQuote && (ch == ' ' || ch == '\t'))
888 wasQuote = (ch == chClosingQuote);
890 int pt1 = ichStart + 1;
892 if (ich == cch && !wasQuote) {
893 // reached the end of the string without finding closing '
894 // so take the whole thing. Probably a bad CIF file.
898 // throw away the last white character
901 return str.substring(pt1, pt2);