3 import java.io.BufferedReader;
\r
5 import java.util.Hashtable;
\r
7 import java.util.Map;
\r
9 import javajs.api.GenericCifDataParser;
\r
10 import javajs.api.GenericLineReader;
\r
14 public class CifDataParser implements GenericCifDataParser {
\r
17 * A special tokenizer class for dealing with quoted strings in CIF files.
\r
19 * Greek letters implemented in Jmol 13.3.9 and only for
\r
20 * titles and space groups. All other mark ups ignored.
\r
23 * regarding the treatment of single quotes vs. primes in
\r
24 * cif file, PMR wrote:
\r
27 * * There is a formal grammar for CIF
\r
28 * (see http://www.iucr.org/iucr-top/cif/index.html)
\r
29 * which confirms this. The textual explanation is
\r
32 * 14. Matching single or double quote characters (' or ") may
\r
33 * be used to bound a string representing a non-simple data value
\r
34 * provided the string does not extend over more than one line.
\r
37 * 15. Because data values are invariably separated from other
\r
38 * tokens in the file by white space, such a quote-delimited
\r
39 * character string may contain instances of the character used
\r
40 * to delimit the string provided they are not followed by white
\r
41 * space. For example, the data item
\r
43 * _example 'a dog's life'
\r
45 * is legal; the data value is a dog's life.
\r
48 * [PMR - the terminating character(s) are quote+whitespace.
\r
49 * That would mean that:
\r
51 * _example 'Jones' life'
\r
56 * The CIF format was developed in the late 1980s under the aegis of the
\r
57 * International Union of Crystallography (I am a consultant to the COMCIFs
\r
58 * committee). It was ratified by the Union and there have been several
\r
59 * workshops. mmCIF is an extension of CIF which includes a relational
\r
60 * structure. The formal publications are:
\r
63 * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data
\r
64 * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.
\r
65 * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic
\r
66 * Information File (CIF): A New Standard Archive File for Crystallography",
\r
67 * Acta Cryst., A47, 655-685.
\r
68 * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed
\r
69 * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.
\r
72 private GenericLineReader reader;
\r
73 private BufferedReader br;
\r
75 private String line;
\r
79 private boolean wasUnQuoted;
\r
80 private String strPeeked;
\r
81 private int ichPeeked;
\r
82 private int fieldCount;
\r
83 private String[] loopData;
\r
84 private SB fileHeader = new SB();
\r
85 private boolean isHeader = true;
\r
86 private String nullString = "\0";
\r
89 * Set the string value of what is returned for "." and "?"
\r
91 * @param nullString null here returns "." and "?"; default is "\0"
\r
94 public void setNullValue(String nullString) {
\r
95 this.nullString = nullString;
\r
99 * A global, static map that contains field information. The assumption is that
\r
100 * if we read a set of fields for, say, atom_site, once in a lifetime, then
\r
101 * that should be good forever. Those are static lists. Or should be....
\r
103 private static Map<String, Integer> htFields = new Hashtable<String, Integer>();
\r
105 ////////////////////////////////////////////////////////////////
\r
106 // special tokenizer class
\r
107 ////////////////////////////////////////////////////////////////
\r
109 public CifDataParser() {
\r
113 private String[] fields;
\r
116 public String getLoopData(int i) {
\r
117 return loopData[i];
\r
121 public int getFieldCount() {
\r
126 public String getField(int i) {
\r
131 * A Crystallographic Information File (CIF) data parser.
\r
133 * Should be called immediately upon construction.
\r
135 * Two options; one of reader or br should be null, or reader will be
\r
136 * ignored. Just simpler this way...
\r
138 * @param reader Anything that can deliver a line of text or null
\r
139 * @param br A standard BufferedReader.
\r
143 public CifDataParser set(GenericLineReader reader, BufferedReader br) {
\r
144 this.reader = reader;
\r
151 * @return commented-out section at the start of a CIF file.
\r
155 public String getFileHeader() {
\r
156 return fileHeader.toString();
\r
161 * Parses all CIF data for the reader supplied via set(reader, br)
\r
162 * into a standard Map structure and close the BufferedReader if
\r
165 * @return Hashtable of models Vector of Hashtable data
\r
168 public Map<String, Object> getAllCifData() {
\r
171 Map<String, Object> data = null;
\r
172 Map<String, Object> allData = new Hashtable<String, Object>();
\r
173 Lst<Map<String, Object>> models = new Lst<Map<String,Object>>();
\r
174 allData.put("models", models);
\r
176 while ((key = getNextToken()) != null) {
\r
177 if (key.startsWith("global_") || key.startsWith("data_")) {
\r
178 models.addLast(data = new Hashtable<String, Object>());
\r
179 data.put("name", key);
\r
182 if (key.startsWith("loop_")) {
\r
183 getAllCifLoopData(data);
\r
186 if (key.charAt(0) != '_') {
\r
187 System.out.println("CIF ERROR ? should be an underscore: " + key);
\r
189 String value = getNextToken();
\r
190 if (value == null) {
\r
191 System.out.println("CIF ERROR ? end of file; data missing: " + key);
\r
193 data.put(fixKey(key), value);
\r
197 } catch (Exception e) {
\r
203 } catch (Exception e) {
\r
210 * create our own list of keywords and for each one create a list
\r
211 * of data associated with that keyword. For example, a list of all
\r
212 * x coordinates, then a list of all y coordinates, etc.
\r
215 * @throws Exception
\r
217 @SuppressWarnings("unchecked")
\r
218 private void getAllCifLoopData(Map<String, Object> data) throws Exception {
\r
220 Lst<String> keyWords = new Lst<String>();
\r
221 while ((key = peekToken()) != null && key.charAt(0) == '_') {
\r
222 key = fixKey(getTokenPeeked());
\r
223 keyWords.addLast(key);
\r
224 data.put(key, new Lst<String>());
\r
226 fieldCount = keyWords.size();
\r
227 if (fieldCount == 0)
\r
229 loopData = new String[fieldCount];
\r
231 for (int i = 0; i < fieldCount; i++)
\r
232 ((Lst<String>)data.get(keyWords.get(i))).addLast(loopData[i]);
\r
236 public String readLine() {
\r
238 line = (reader == null ? br.readLine() : reader.readNextLine());
\r
242 if (line.startsWith("#"))
\r
243 fileHeader.append(line).appendC('\n');
\r
248 } catch (Exception e) {
\r
254 * The work horse; a general reader for loop data.
\r
255 * Fills loopData with fieldCount fields.
\r
257 * @return false if EOF
\r
258 * @throws Exception
\r
261 public boolean getData() throws Exception {
\r
262 // line is already present, and we leave with the next line to parse
\r
263 for (int i = 0; i < fieldCount; ++i)
\r
264 if ((loopData[i] = getNextDataToken()) == null)
\r
266 return (fieldCount > 0);
\r
271 * Skips all associated loop data. (Skips to next control word.)
\r
273 * @throws Exception
\r
276 public String skipLoop(boolean doReport) throws Exception {
\r
278 SB ret = (doReport ? new SB() : null);
\r
280 while ((str = peekToken()) != null && str.charAt(0) == '_') {
\r
282 ret.append(str).append("\n");
\r
287 while ((str = getNextDataToken()) != null) {
\r
290 ret.append(str).append(" ");
\r
291 if ((++m % n) == 0)
\r
294 return (ret == null ? null : ret.toString());
\r
299 * @return the next token of any kind, or null
\r
300 * @throws Exception
\r
303 public String getNextToken() throws Exception {
\r
304 while (!strHasMoreTokens())
\r
305 if (setStringNextLine() == null)
\r
307 return nextStrToken();
\r
312 * first checks to see if the next token is an unquoted
\r
313 * control code, and if so, returns null
\r
315 * @return next data token or null
\r
316 * @throws Exception
\r
319 public String getNextDataToken() throws Exception {
\r
320 String str = peekToken();
\r
324 if (str.charAt(0) == '_' || str.startsWith("loop_")
\r
325 || str.startsWith("data_")
\r
326 || str.startsWith("stop_")
\r
327 || str.startsWith("global_"))
\r
329 return getTokenPeeked();
\r
333 * Just look at the next token. Saves it for retrieval
\r
334 * using getTokenPeeked()
\r
336 * @return next token or null if EOF
\r
337 * @throws Exception
\r
340 public String peekToken() throws Exception {
\r
341 while (!strHasMoreTokens())
\r
342 if (setStringNextLine() == null)
\r
344 int ich = this.ich;
\r
345 strPeeked = nextStrToken();
\r
346 ichPeeked= this.ich;
\r
353 * @return the token last acquired; may be null
\r
356 public String getTokenPeeked() {
\r
362 * Used especially for data that might be multi-line data that
\r
363 * might have unwanted white space at start or end.
\r
366 * @return str without any leading/trailing white space, and no '\n'
\r
369 public String fullTrim(String str) {
\r
371 int pt1 = str.length();
\r
372 while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) {
\r
374 while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) {
\r
376 return str.substring(pt0, pt1 + 1);
\r
379 private final static String grABC =
\r
380 "ABX\u0394E\u03A6\u0393H" // ABCDEFGH
\r
381 + "I_K\u039BMNO\u03A0" // I_KLMNOP
\r
382 + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ
\r
383 private final static String grabc =
\r
384 "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh
\r
385 + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop
\r
386 + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz
\r
389 * Only translating the basic Greek set here, not all the other stuff. See
\r
390 * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup
\r
393 * @return cleaned string
\r
396 public String toUnicode(String data) {
\r
399 while ((pt = data.indexOf('\\')) >= 0) {
\r
400 int c = data.charAt(pt + 1);
\r
401 String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64)
\r
402 : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_");
\r
403 data = data.substring(0, pt) + ch + data.substring(pt + 2);
\r
405 } catch (Exception e) {
\r
413 * Passing an array of field names, this method fills two arrays.
\r
414 * The first, fieldOf, identifies
\r
415 * It does this by first creating a map of names to their indices in fields[].
\r
417 * Alternatively, if fields is null, then a private array is filled, in order,
\r
418 * with key data. This is used in cases such as matrices for which there are simply
\r
419 * too many possibilities to list, and the key name itself contains the x-y
\r
420 * information that we need.
\r
424 public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception {
\r
425 int propertyCount = 0;
\r
426 if (fields == null) {
\r
427 // for reading full list of keys, as for matrices
\r
428 this.fields = new String[100];
\r
430 if (!htFields.containsKey(fields[0]))
\r
431 for (int i = fields.length; --i >= 0;)
\r
432 htFields.put(fields[i], Integer.valueOf(i));
\r
433 for (int i = fields.length; --i >= 0;)
\r
435 propertyCount = fields.length;
\r
439 String str = peekToken();
\r
441 // we are PREMATURELY done; reset
\r
445 // end of the loop is a new token starting with underscore
\r
446 if (str.charAt(0) != '_')
\r
449 int pt = fieldCount++;
\r
450 str = fixKey(getTokenPeeked());
\r
451 if (fields == null) {
\r
452 // just make a linear model, saving the list
\r
453 this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str;
\r
456 Integer iField = htFields.get(str);
\r
457 int i = (iField == null ? NONE : iField.intValue());
\r
458 if ((propertyOf[pt] = i) != NONE)
\r
461 if (fieldCount > 0)
\r
462 loopData = new String[fieldCount];
\r
463 return propertyCount;
\r
467 public String fixKey(String key) {
\r
468 // PRELIMINARY -- Bilbao _magnetic
\r
469 // PRELIMINARY -- Jana2006
\r
471 key.startsWith("_magnetic") ? key.substring(9)
\r
472 : key.startsWith("_jana") ? key.substring(5)
\r
473 : key, ".", "_").toLowerCase());
\r
476 //////////////////// private methods ////////////////////
\r
480 * sets a string to be parsed from the beginning
\r
484 private void setString(String str) {
\r
485 this.str = line = str;
\r
486 cch = (str == null ? 0 : str.length());
\r
491 * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
\r
493 * 17. The special sequence of end-of-line followed
\r
494 * immediately by a semicolon in column one (denoted "<eol>;")
\r
495 * may also be used as a delimiter at the beginning and end
\r
496 * of a character string comprising a data value. The complete
\r
497 * bounded string is called a text field, and may be used to
\r
498 * convey multi-line values. The end-of-line associated with
\r
499 * the closing semicolon does not form part of the data value.
\r
500 * Within a multi-line text field, leading white space within
\r
501 * text lines must be retained as part of the data value; trailing
\r
502 * white space on a line may however be elided.
\r
504 * 18. A text field delimited by the <eol>; digraph may not
\r
505 * include a semicolon at the start of a line of text as
\r
506 * part of its value.
\r
508 * 20. For example, the data value foo may be expressed
\r
509 * equivalently as an unquoted string foo, as a quoted
\r
510 * string 'foo' or as a text field
\r
515 * By contrast the value of the text field
\r
521 * is foo<eol> bar (where <eol> represents an end-of-line);
\r
522 * the embedded space characters are significant.
\r
525 * I (BH) note, however, that we sometimes have:
\r
532 * so this should actually be
\r
537 * for this, we use fullTrim();
\r
543 * sets the string for parsing to be from the next line
\r
544 * when the token buffer is empty, and if ';' is at the
\r
545 * beginning of that line, extends the string to include
\r
546 * that full multiline string. Uses \1 to indicate that
\r
547 * this is a special quotation.
\r
549 * @return the next line or null if EOF
\r
550 * @throws Exception
\r
552 private String setStringNextLine() throws Exception {
\r
553 setString(readLine());
\r
554 if (line == null || line.length() == 0)
\r
556 if (line.charAt(0) != ';') {
\r
557 if (str.startsWith("###non-st#"))
\r
562 String str = '\1' + line.substring(1) + '\n';
\r
563 while (readLine() != null) {
\r
564 if (line.startsWith(";")) {
\r
565 // remove trailing <eol> only, and attach rest of next line
\r
566 str = str.substring(0, str.length() - 1)
\r
567 + '\1' + line.substring(1);
\r
570 str += line + '\n';
\r
577 * @return TRUE if there are more tokens in the line buffer
\r
580 private boolean strHasMoreTokens() {
\r
584 while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t'))
\r
586 return (ich < cch && ch != '#');
\r
590 * assume that hasMoreTokens() has been called and that
\r
591 * ich is pointing at a non-white character. Also sets
\r
592 * boolean wasUnQuoted, because we need to know if we should
\r
593 * be checking for a control keyword. 'loop_' is different from just
\r
594 * loop_ without the quotes.
\r
596 * @return null if no more tokens, "\0" if '.' or '?', or next token
\r
598 private String nextStrToken() {
\r
601 int ichStart = ich;
\r
602 char ch = str.charAt(ichStart);
\r
603 if (ch != '\'' && ch != '"' && ch != '\1') {
\r
604 wasUnQuoted = true;
\r
605 while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t')
\r
607 if (ich == ichStart + 1)
\r
608 if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))
\r
610 String s = str.substring(ichStart, ich);
\r
613 wasUnQuoted = false;
\r
614 char chOpeningQuote = ch;
\r
615 boolean previousCharacterWasQuote = false;
\r
616 while (++ich < cch) {
\r
617 ch = str.charAt(ich);
\r
618 if (previousCharacterWasQuote && (ch == ' ' || ch == '\t'))
\r
620 previousCharacterWasQuote = (ch == chOpeningQuote);
\r
623 if (previousCharacterWasQuote) // close quote was last char of string
\r
624 return str.substring(ichStart + 1, ich - 1);
\r
625 // reached the end of the string without finding closing '
\r
626 return str.substring(ichStart, ich);
\r
628 ++ich; // throw away the last white character
\r
629 return str.substring(ichStart + 1, ich - 2);
\r