-package javajs.util;\r
-\r
-import java.io.BufferedReader;\r
-\r
-import java.util.Hashtable;\r
-\r
-import java.util.Map;\r
-\r
-import javajs.api.GenericCifDataParser;\r
-import javajs.api.GenericLineReader;\r
-\r
-\r
-\r
-public class CifDataParser implements GenericCifDataParser {\r
- /**\r
- *\r
- * A special tokenizer class for dealing with quoted strings in CIF files.\r
- * \r
- * Greek letters implemented in Jmol 13.3.9 and only for \r
- * titles and space groups. All other mark ups ignored.\r
- * \r
- *<p>\r
- * regarding the treatment of single quotes vs. primes in\r
- * cif file, PMR wrote:\r
- *</p>\r
- *<p>\r
- * * There is a formal grammar for CIF\r
- * (see http://www.iucr.org/iucr-top/cif/index.html)\r
- * which confirms this. The textual explanation is\r
- *<p />\r
- *<p>\r
- * 14. Matching single or double quote characters (' or ") may\r
- * be used to bound a string representing a non-simple data value\r
- * provided the string does not extend over more than one line.\r
- *<p />\r
- *<p>\r
- * 15. Because data values are invariably separated from other\r
- * tokens in the file by white space, such a quote-delimited\r
- * character string may contain instances of the character used\r
- * to delimit the string provided they are not followed by white\r
- * space. For example, the data item\r
- *<code>\r
- * _example 'a dog's life'\r
- *</code>\r
- * is legal; the data value is a dog's life.\r
- *</p>\r
- *<p>\r
- * [PMR - the terminating character(s) are quote+whitespace.\r
- * That would mean that:\r
- *<code>\r
- * _example 'Jones' life'\r
- *</code>\r
- * would be an error\r
- *</p>\r
- *<p>\r
- * The CIF format was developed in that late 1980's under the aegis of the\r
- * International Union of Crystallography (I am a consultant to the COMCIFs \r
- * committee). It was ratified by the Union and there have been several \r
- * workshops. mmCIF is an extension of CIF which includes a relational \r
- * structure. The formal publications are:\r
- *</p>\r
- *<p>\r
- * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data \r
- * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.\r
- * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic\r
- * Information File (CIF): A New Standard Archive File for Crystallography",\r
- * Acta Cryst., A47, 655-685.\r
- * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed \r
- * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.\r
- *</p>\r
- */\r
- private GenericLineReader reader;\r
- private BufferedReader br;\r
-\r
- private String line; \r
- private String str;\r
- private int ich;\r
- private int cch;\r
- private boolean wasUnQuoted;\r
- private String strPeeked;\r
- private int ichPeeked;\r
- private int fieldCount;\r
- private String[] loopData;\r
- private SB fileHeader = new SB();\r
- private boolean isHeader = true;\r
- private String nullString = "\0";\r
-\r
- /**\r
- * Set the string value of what is returned for "." and "?"\r
- * \r
- * @param nullString null here returns "." and "?"; default is "\0"\r
- * \r
- */\r
- public void setNullValue(String nullString) {\r
- this.nullString = nullString; \r
- }\r
-\r
- /**\r
- * A global, static map that contains field information. The assumption is that\r
- * if we read a set of fields for, say, atom_site, once in a lifetime, then\r
- * that should be good forever. Those are static lists. Or should be....\r
- */\r
- private static Map<String, Integer> htFields = new Hashtable<String, Integer>();\r
- \r
- ////////////////////////////////////////////////////////////////\r
- // special tokenizer class\r
- ////////////////////////////////////////////////////////////////\r
-\r
- public CifDataParser() {\r
- // for reflection\r
- }\r
- \r
- private String[] fields;\r
-\r
- @Override\r
- public String getLoopData(int i) {\r
- return loopData[i];\r
- }\r
-\r
- @Override\r
- public int getFieldCount() {\r
- return fieldCount;\r
- }\r
-\r
- @Override\r
- public String getField(int i) {\r
- return fields[i];\r
- }\r
-\r
- /**\r
- * A Chemical Information File data parser.\r
- * \r
- * Should be called immediately upon construction.\r
- * \r
- * Two options; one of reader or br should be null, or reader will be\r
- * ignored. Just simpler this way...\r
- * \r
- * @param reader Anything that can deliver a line of text or null\r
- * @param br A standard BufferedReader.\r
- * \r
- */\r
- @Override\r
- public CifDataParser set(GenericLineReader reader, BufferedReader br) {\r
- this.reader = reader;\r
- this.br = br;\r
- return this;\r
- }\r
-\r
- /**\r
- * \r
- * @return commented-out section at the start of a CIF file.\r
- * \r
- */\r
- @Override\r
- public String getFileHeader() {\r
- return fileHeader.toString();\r
- }\r
- \r
- \r
- /**\r
- * Parses all CIF data for a reader defined in the constructor\r
- * into a standard Map structure and close the BufferedReader if\r
- * it exists. \r
- * \r
- * @return Hashtable of models Vector of Hashtable data\r
- */\r
- @Override\r
- public Map<String, Object> getAllCifData() {\r
- line = "";\r
- String key;\r
- Map<String, Object> data = null;\r
- Map<String, Object> allData = new Hashtable<String, Object>();\r
- Lst<Map<String, Object>> models = new Lst<Map<String,Object>>();\r
- allData.put("models", models);\r
- try {\r
- while ((key = getNextToken()) != null) {\r
- if (key.startsWith("global_") || key.startsWith("data_")) {\r
- models.addLast(data = new Hashtable<String, Object>());\r
- data.put("name", key);\r
- continue;\r
- }\r
- if (key.startsWith("loop_")) {\r
- getAllCifLoopData(data);\r
- continue;\r
- }\r
- if (key.charAt(0) != '_') {\r
- System.out.println("CIF ERROR ? should be an underscore: " + key);\r
- } else {\r
- String value = getNextToken();\r
- if (value == null) {\r
- System.out.println("CIF ERROR ? end of file; data missing: " + key);\r
- } else {\r
- data.put(fixKey(key), value);\r
- }\r
- }\r
- }\r
- } catch (Exception e) {\r
- // ?\r
- }\r
- try {\r
- if (br != null)\r
- br.close();\r
- } catch (Exception e) {\r
- // ?\r
- }\r
- return allData;\r
- }\r
-\r
- /**\r
- * create our own list of keywords and for each one create a list\r
- * of data associated with that keyword. For example, a list of all \r
- * x coordinates, then a list of all y coordinates, etc.\r
- * \r
- * @param data\r
- * @throws Exception\r
- */\r
- @SuppressWarnings("unchecked")\r
- private void getAllCifLoopData(Map<String, Object> data) throws Exception {\r
- String key;\r
- Lst<String> keyWords = new Lst<String>();\r
- while ((key = peekToken()) != null && key.charAt(0) == '_') {\r
- key = fixKey(getTokenPeeked());\r
- keyWords.addLast(key);\r
- data.put(key, new Lst<String>());\r
- }\r
- fieldCount = keyWords.size();\r
- if (fieldCount == 0)\r
- return;\r
- loopData = new String[fieldCount];\r
- while (getData())\r
- for (int i = 0; i < fieldCount; i++)\r
- ((Lst<String>)data.get(keyWords.get(i))).addLast(loopData[i]);\r
- }\r
-\r
- @Override\r
- public String readLine() {\r
- try {\r
- line = (reader == null ? br.readLine() : reader.readNextLine());\r
- if (line == null)\r
- return null;\r
- if (isHeader) {\r
- if (line.startsWith("#"))\r
- fileHeader.append(line).appendC('\n');\r
- else\r
- isHeader = false;\r
- }\r
- return line;\r
- } catch (Exception e) {\r
- return null;\r
- }\r
- }\r
- \r
- /**\r
- * The work horse; a general reader for loop data.\r
- * Fills loopData with fieldCount fields.\r
- * \r
- * @return false if EOF\r
- * @throws Exception\r
- */\r
- @Override\r
- public boolean getData() throws Exception {\r
- // line is already present, and we leave with the next line to parse\r
- for (int i = 0; i < fieldCount; ++i)\r
- if ((loopData[i] = getNextDataToken()) == null)\r
- return false;\r
- return (fieldCount > 0);\r
- }\r
-\r
- /**\r
- * \r
- * Skips all associated loop data. (Skips to next control word.)\r
- * \r
- * @throws Exception\r
- */\r
- @Override\r
- public String skipLoop(boolean doReport) throws Exception {\r
- String str;\r
- SB ret = (doReport ? new SB() : null);\r
- int n = 0;\r
- while ((str = peekToken()) != null && str.charAt(0) == '_') {\r
- if (ret != null)\r
- ret.append(str).append("\n");\r
- getTokenPeeked();\r
- n++;\r
- }\r
- int m = 0;\r
- while ((str = getNextDataToken()) != null) {\r
- if (ret == null)\r
- continue; \r
- ret.append(str).append(" ");\r
- if ((++m % n) == 0)\r
- ret.append("\n");\r
- }\r
- return (ret == null ? null : ret.toString());\r
- }\r
-\r
- /**\r
- * \r
- * @return the next token of any kind, or null\r
- * @throws Exception\r
- */\r
- @Override\r
- public String getNextToken() throws Exception {\r
- while (!strHasMoreTokens())\r
- if (setStringNextLine() == null)\r
- return null;\r
- return nextStrToken();\r
- }\r
-\r
- /**\r
- * \r
- * first checks to see if the next token is an unquoted\r
- * control code, and if so, returns null \r
- * \r
- * @return next data token or null\r
- * @throws Exception\r
- */\r
- @Override\r
- public String getNextDataToken() throws Exception { \r
- String str = peekToken();\r
- if (str == null)\r
- return null;\r
- if (wasUnQuoted)\r
- if (str.charAt(0) == '_' || str.startsWith("loop_")\r
- || str.startsWith("data_")\r
- || str.startsWith("stop_")\r
- || str.startsWith("global_"))\r
- return null;\r
- return getTokenPeeked();\r
- }\r
- \r
- /**\r
- * Just look at the next token. Saves it for retrieval \r
- * using getTokenPeeked()\r
- * \r
- * @return next token or null if EOF\r
- * @throws Exception\r
- */\r
- @Override\r
- public String peekToken() throws Exception {\r
- while (!strHasMoreTokens())\r
- if (setStringNextLine() == null)\r
- return null;\r
- int ich = this.ich;\r
- strPeeked = nextStrToken();\r
- ichPeeked= this.ich;\r
- this.ich = ich;\r
- return strPeeked;\r
- }\r
- \r
- /**\r
- * \r
- * @return the token last acquired; may be null\r
- */\r
- @Override\r
- public String getTokenPeeked() {\r
- ich = ichPeeked;\r
- return strPeeked;\r
- }\r
- \r
- /**\r
- * Used especially for data that might be multi-line data that\r
- * might have unwanted white space at start or end.\r
- * \r
- * @param str\r
- * @return str without any leading/trailing white space, and no '\n'\r
- */\r
- @Override\r
- public String fullTrim(String str) {\r
- int pt0 = -1;\r
- int pt1 = str.length();\r
- while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) {\r
- }\r
- while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) { \r
- }\r
- return str.substring(pt0, pt1 + 1);\r
- }\r
-\r
- private final static String grABC =\r
- "ABX\u0394E\u03A6\u0393H" // ABCDEFGH\r
- + "I_K\u039BMNO\u03A0" // I_KLMNOP\r
- + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ\r
- private final static String grabc =\r
- "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh\r
- + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop\r
- + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz\r
-\r
- /**\r
- * Only translating the basic Greek set here, not all the other stuff. See\r
- * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup\r
- * \r
- * @param data\r
- * @return cleaned string\r
- */\r
- @Override\r
- public String toUnicode(String data) {\r
- int pt;\r
- try {\r
- while ((pt = data.indexOf('\\')) >= 0) {\r
- int c = data.charAt(pt + 1);\r
- String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64)\r
- : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_");\r
- data = data.substring(0, pt) + ch + data.substring(pt + 2);\r
- }\r
- } catch (Exception e) {\r
- // ignore\r
- }\r
-\r
- return data;\r
- }\r
-\r
- /**\r
- * Passing an array of field names, this method fills two arrays. \r
- * The first, fieldOf, identifies \r
- * It does this by first creating a map of names to their indices in fields[].\r
- * \r
- * Alternatively, if fields is null, then a private array is filled, in order, \r
- * with key data. This is used in cases such as matrices for which there are simply\r
- * too many possibilities to list, and the key name itself contains the x-y \r
- * information that we need.\r
- * \r
- */\r
- @Override\r
- public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception {\r
- int propertyCount = 0;\r
- if (fields == null) {\r
- // for reading full list of keys, as for matrices\r
- this.fields = new String[100];\r
- } else {\r
- if (!htFields.containsKey(fields[0]))\r
- for (int i = fields.length; --i >= 0;)\r
- htFields.put(fields[i], Integer.valueOf(i));\r
- for (int i = fields.length; --i >= 0;)\r
- fieldOf[i] = NONE;\r
- propertyCount = fields.length;\r
- }\r
- fieldCount = 0;\r
- while (true) {\r
- String str = peekToken();\r
- if (str == null) {\r
- // we are PREMATURELY done; reset\r
- fieldCount = 0;\r
- break;\r
- }\r
- // end of the loop is a new token starting with underscore\r
- if (str.charAt(0) != '_')\r
- break;\r
- \r
- int pt = fieldCount++;\r
- str = fixKey(getTokenPeeked());\r
- if (fields == null) {\r
- // just make a linear model, saving the list\r
- this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str;\r
- continue;\r
- }\r
- Integer iField = htFields.get(str);\r
- int i = (iField == null ? NONE : iField.intValue());\r
- if ((propertyOf[pt] = i) != NONE)\r
- fieldOf[i] = pt;\r
- }\r
- if (fieldCount > 0)\r
- loopData = new String[fieldCount];\r
- return propertyCount;\r
- }\r
-\r
- @Override\r
- public String fixKey(String key) {\r
- // PRELIMINARY -- BilBao _magnetic\r
- // PRELIMINARY -- Jana2006\r
- return (PT.rep(\r
- key.startsWith("_magnetic") ? key.substring(9) \r
- : key.startsWith("_jana") ? key.substring(5) \r
- : key, ".", "_").toLowerCase());\r
- }\r
-\r
- //////////////////// private methods ////////////////////\r
- \r
- \r
- /**\r
- * sets a string to be parsed from the beginning\r
- * \r
- * @param str\r
- */\r
- private void setString(String str) {\r
- this.str = line = str;\r
- cch = (str == null ? 0 : str.length());\r
- ich = 0;\r
- }\r
-\r
- /*\r
- * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax\r
- * \r
- * 17. The special sequence of end-of-line followed \r
- * immediately by a semicolon in column one (denoted "<eol>;") \r
- * may also be used as a delimiter at the beginning and end \r
- * of a character string comprising a data value. The complete \r
- * bounded string is called a text field, and may be used to \r
- * convey multi-line values. The end-of-line associated with \r
- * the closing semicolon does not form part of the data value. \r
- * Within a multi-line text field, leading white space within \r
- * text lines must be retained as part of the data value; trailing \r
- * white space on a line may however be elided.\r
- * \r
- * 18. A text field delimited by the <eol>; digraph may not \r
- * include a semicolon at the start of a line of text as \r
- * part of its value.\r
- * \r
- * 20. For example, the data value foo may be expressed \r
- * equivalently as an unquoted string foo, as a quoted \r
- * string 'foo' or as a text field\r
- *\r
- *;foo\r
- *;\r
- *\r
- * By contrast the value of the text field\r
- *\r
- *; foo\r
- * bar\r
- *;\r
- *\r
- * is foo<eol> bar (where <eol> represents an end-of-line); \r
- * the embedded space characters are significant.\r
- * \r
- * \r
- * I (BH) note, however, that we sometimes have:\r
- * \r
- * _some_name\r
- * ;\r
- * the name here\r
- * ;\r
- * \r
- * so this should actually be\r
- * \r
- * ;the name here\r
- * ;\r
- * \r
- * for this, we use fullTrim();\r
- * \r
- */\r
- \r
- /**\r
- * \r
- * sets the string for parsing to be from the next line \r
- * when the token buffer is empty, and if ';' is at the \r
- * beginning of that line, extends the string to include\r
- * that full multiline string. Uses \1 to indicate that \r
- * this is a special quotation. \r
- * \r
- * @return the next line or null if EOF\r
- * @throws Exception\r
- */\r
- private String setStringNextLine() throws Exception {\r
- setString(readLine());\r
- if (line == null || line.length() == 0)\r
- return line;\r
- if (line.charAt(0) != ';') {\r
- if (str.startsWith("###non-st#"))\r
- ich = 10;\r
- return line;\r
- }\r
- ich = 1;\r
- String str = '\1' + line.substring(1) + '\n';\r
- while (readLine() != null) {\r
- if (line.startsWith(";")) {\r
- // remove trailing <eol> only, and attach rest of next line\r
- str = str.substring(0, str.length() - 1)\r
- + '\1' + line.substring(1);\r
- break;\r
- }\r
- str += line + '\n';\r
- }\r
- setString(str);\r
- return str;\r
- }\r
-\r
- /**\r
- * @return TRUE if there are more tokens in the line buffer\r
- * \r
- */\r
- private boolean strHasMoreTokens() {\r
- if (str == null)\r
- return false;\r
- char ch = '#';\r
- while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t'))\r
- ++ich;\r
- return (ich < cch && ch != '#');\r
- }\r
-\r
- /**\r
- * assume that hasMoreTokens() has been called and that\r
- * ich is pointing at a non-white character. Also sets\r
- * boolean wasUnQuoted, because we need to know if we should \r
- * be checking for a control keyword. 'loop_' is different from just \r
- * loop_ without the quotes.\r
- *\r
- * @return null if no more tokens, "\0" if '.' or '?', or next token \r
- */\r
- private String nextStrToken() {\r
- if (ich == cch)\r
- return null;\r
- int ichStart = ich;\r
- char ch = str.charAt(ichStart);\r
- if (ch != '\'' && ch != '"' && ch != '\1') {\r
- wasUnQuoted = true;\r
- while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t')\r
- ++ich;\r
- if (ich == ichStart + 1)\r
- if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))\r
- return nullString;\r
- String s = str.substring(ichStart, ich);\r
- return s;\r
- }\r
- wasUnQuoted = false;\r
- char chOpeningQuote = ch;\r
- boolean previousCharacterWasQuote = false;\r
- while (++ich < cch) {\r
- ch = str.charAt(ich);\r
- if (previousCharacterWasQuote && (ch == ' ' || ch == '\t'))\r
- break;\r
- previousCharacterWasQuote = (ch == chOpeningQuote);\r
- }\r
- if (ich == cch) {\r
- if (previousCharacterWasQuote) // close quote was last char of string\r
- return str.substring(ichStart + 1, ich - 1);\r
- // reached the end of the string without finding closing '\r
- return str.substring(ichStart, ich);\r
- }\r
- ++ich; // throw away the last white character\r
- return str.substring(ichStart + 1, ich - 2);\r
- }\r
-\r
- \r
+package javajs.util;
+
+import java.io.BufferedReader;
+
+import java.util.Hashtable;
+
+import java.util.Map;
+
+import javajs.api.GenericCifDataParser;
+import javajs.api.GenericLineReader;
+
+
+
+public class CifDataParser implements GenericCifDataParser {
+ /**
+ *
+ * A special tokenizer class for dealing with quoted strings in CIF files.
+ *
+ * Greek letters implemented in Jmol 13.3.9 and only for
+ * titles and space groups. All other mark ups ignored.
+ *
+ *<p>
+ * regarding the treatment of single quotes vs. primes in
+ * cif file, PMR wrote:
+ *</p>
+ *<p>
+ * * There is a formal grammar for CIF
+ * (see http://www.iucr.org/iucr-top/cif/index.html)
+ * which confirms this. The textual explanation is
+ *<p />
+ *<p>
+ * 14. Matching single or double quote characters (' or ") may
+ * be used to bound a string representing a non-simple data value
+ * provided the string does not extend over more than one line.
+ *<p />
+ *<p>
+ * 15. Because data values are invariably separated from other
+ * tokens in the file by white space, such a quote-delimited
+ * character string may contain instances of the character used
+ * to delimit the string provided they are not followed by white
+ * space. For example, the data item
+ *<code>
+ * _example 'a dog's life'
+ *</code>
+ * is legal; the data value is a dog's life.
+ *</p>
+ *<p>
+ * [PMR - the terminating character(s) are quote+whitespace.
+ * That would mean that:
+ *<code>
+ * _example 'Jones' life'
+ *</code>
+ * would be an error
+ *</p>
+ *<p>
+ * The CIF format was developed in the late 1980s under the aegis of the
+ * International Union of Crystallography (I am a consultant to the COMCIFs
+ * committee). It was ratified by the Union and there have been several
+ * workshops. mmCIF is an extension of CIF which includes a relational
+ * structure. The formal publications are:
+ *</p>
+ *<p>
+ * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data
+ * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333.
+ * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic
+ * Information File (CIF): A New Standard Archive File for Crystallography",
+ * Acta Cryst., A47, 655-685.
+ * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed
+ * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508.
+ *</p>
+ */
+ private GenericLineReader reader; // line source used by readLine() whenever it is non-null
+ private BufferedReader br; // line source used when reader is null; closed by getAllCifData()
+
+ private String line; // most recent line from readLine(); setString() also points it at the parse buffer
+ private String str; // the buffer currently being tokenized
+ private int ich; // index in str of the next character to examine
+ private int cch; // length of str (0 when str is null)
+ private boolean wasUnQuoted; // set by nextStrToken(): true if the last token had no quote delimiter
+ private String strPeeked; // token found by the last successful peekToken()
+ private int ichPeeked; // value ich takes when the peeked token is consumed
+ private int fieldCount; // number of key names read for the current loop
+ private String[] loopData; // one row of loop values, filled by getData()
+ private SB fileHeader = new SB(); // leading '#' lines, collected by readLine()
+ private boolean isHeader = true; // true until readLine() sees a line not starting with '#'
+ private String nullString = "\0"; // returned for a lone unquoted "." or "?" (see setNullValue)
+
+ /**
+ * Sets the value that nextStrToken() returns in place of an unquoted,
+ * single-character "." or "?" token.
+ * 
+ * @param nullString replacement value; null means "." and "?" are returned unchanged; default is "\0"
+ */
+ public void setNullValue(String nullString) {
+ this.nullString = nullString; 
+ }
+
+ /**
+ * A global, static map from field name to its index in the fields[] array given
+ * to parseLoopParameters(); shared by all instances and never cleared here. The
+ * assumption is that a set of fields, say for atom_site, once read is good forever.
+ */
+ private static Map<String, Integer> htFields = new Hashtable<String, Integer>();
+
+ ////////////////////////////////////////////////////////////////
+ // special tokenizer class
+ ////////////////////////////////////////////////////////////////
+
+ public CifDataParser() {
+ // public no-argument constructor, kept for reflection; the input source is given later via set()
+ }
+ 
+ private String[] fields; // key names saved by parseLoopParameters() when its fields argument is null
+
+ @Override
+ public String getLoopData(int i) {
+ return loopData[i]; // i-th value of the row most recently filled by getData()
+ }
+
+ @Override
+ public int getFieldCount() {
+ return fieldCount; // number of key names found by the last loop-header scan
+ }
+
+ @Override
+ public String getField(int i) {
+ return fields[i]; // fields[] exists only after parseLoopParameters() ran with a null fields argument
+ }
+
+ /**
+ * Supplies the input source for this Crystallographic Information File (CIF) parser.
+ * 
+ * Should be called immediately upon construction.
+ * 
+ * Normally exactly one of reader or br is non-null. If both are given, readLine()
+ * reads from reader, and br is touched only when getAllCifData() closes it.
+ * 
+ * @param reader Anything that can deliver a line of text or null
+ * @param br A standard BufferedReader, or null
+ * @return this parser, so that calls can be chained
+ */
+ @Override
+ public CifDataParser set(GenericLineReader reader, BufferedReader br) {
+ this.reader = reader;
+ this.br = br;
+ return this;
+ }
+
+ /**
+ * Returns the text collected by readLine() from the unbroken run of lines
+ * starting with '#' at the top of the input, each followed by '\n'.
+ * @return commented-out section at the start of a CIF file.
+ */
+ @Override
+ public String getFileHeader() {
+ return fileHeader.toString();
+ }
+
+
+ /**
+ * Parses all CIF data from the source given to set() into a standard Map
+ * structure, then closes the BufferedReader if one was supplied. Tokens seen
+ * before the first data_/global_ block are reported and ignored, not fatal.
+ * 
+ * @return Map whose "models" entry is a Lst holding one Map per data block
+ */
+ @Override
+ public Map<String, Object> getAllCifData() {
+   line = "";
+   String key;
+   Map<String, Object> data = null;
+   Map<String, Object> allData = new Hashtable<String, Object>();
+   Lst<Map<String, Object>> models = new Lst<Map<String,Object>>();
+   allData.put("models", models);
+   try {
+     while ((key = getNextToken()) != null) {
+       if (key.startsWith("global_") || key.startsWith("data_")) {
+         models.addLast(data = new Hashtable<String, Object>());
+         data.put("name", key);
+       } else if (data == null) {
+         // nothing to attach this token to yet: report it and keep looking for a block
+         System.out.println("CIF ERROR ? no data block yet; ignored: " + key);
+       } else if (key.startsWith("loop_")) {
+         getAllCifLoopData(data);
+       } else if (key.length() == 0 || key.charAt(0) != '_') {
+         System.out.println("CIF ERROR ? should be an underscore: " + key);
+       } else {
+         String value = getNextToken();
+         if (value == null)
+           System.out.println("CIF ERROR ? end of file; data missing: " + key);
+         else
+           data.put(fixKey(key), value);
+       }
+     }
+   } catch (Exception e) {
+     // keep whatever was parsed so far, but say why parsing stopped
+     System.out.println("CIF ERROR ? parsing stopped: " + e);
+   } finally {
+     try {
+       if (br != null)
+         br.close();
+     } catch (Exception e) {
+       // best effort: nothing useful can be done if close() fails
+     }
+   }
+   return allData;
+ }
+
+ /**
+ * Reads the key names of a loop, gives each key its own list in data, and
+ * then appends every complete row of values column by column. For example,
+ * a list of all x coordinates, then a list of all y coordinates, etc.
+ * @param data map for the current data block; gains one Lst of String per key
+ * @throws Exception passed on from the tokenizer
+ */
+ @SuppressWarnings("unchecked")
+ private void getAllCifLoopData(Map<String, Object> data) throws Exception {
+   Lst<String> names = new Lst<String>();
+   for (String tok = peekToken(); tok != null && tok.charAt(0) == '_'; tok = peekToken()) {
+     String name = fixKey(getTokenPeeked());
+     names.addLast(name);
+     data.put(name, new Lst<String>());
+   }
+   fieldCount = names.size();
+   if (fieldCount > 0) {
+     loopData = new String[fieldCount];
+     while (getData())
+       for (int col = 0; col < fieldCount; col++) {
+         Lst<String> column = (Lst<String>) data.get(names.get(col));
+         column.addLast(loopData[col]);
+       }
+   }
+ }
+
+ @Override
+ public String readLine() {
+   try { // any failure while reading is reported to the caller as end of input (null)
+     if (reader != null)
+       line = reader.readNextLine();
+     else
+       line = br.readLine();
+     if (line != null && isHeader) {
+       isHeader = line.startsWith("#"); // the first line without a leading '#' ends the header
+       if (isHeader)
+         fileHeader.append(line).appendC('\n');
+     }
+     return line;
+   } catch (Exception e) {
+     return null;
+   }
+ }
+
+ /**
+ * The work horse; a general reader for loop data.
+ * Fills loopData with fieldCount fields.
+ *
+ * @return false if EOF
+ * @throws Exception
+ */
+ @Override
+ public boolean getData() throws Exception {
+ // line is already present, and we leave with the next line to parse
+ for (int i = 0; i < fieldCount; ++i)
+ if ((loopData[i] = getNextDataToken()) == null)
+ return false;
+ return (fieldCount > 0);
+ }
+
+ /**
+ * Skips all key names and data of the current loop, up to the next unquoted control word/key or EOF.
+ * @param doReport true to collect the skipped text: one line per key, then one line per data row
+ * @return the skipped text if doReport is true, otherwise null
+ * @throws Exception passed on from the tokenizer
+ */
+ @Override
+ public String skipLoop(boolean doReport) throws Exception {
+   String str;
+   SB ret = (doReport ? new SB() : null);
+   int n = 0;
+   while ((str = peekToken()) != null && str.charAt(0) == '_') {
+     if (ret != null)
+       ret.append(str).append("\n");
+     getTokenPeeked();
+     n++;
+   }
+   int m = 0;
+   while ((str = getNextDataToken()) != null) {
+     if (ret == null)
+       continue;
+     ret.append(str).append(" ");
+     if (n > 0 && (++m % n) == 0) // n is 0 when no key names preceded the data; never divide by it
+       ret.append("\n");
+   }
+   return (ret == null ? null : ret.toString());
+ }
+
+ /**
+ *
+ * @return the next token of any kind, or null
+ * @throws Exception
+ */
+ @Override
+ public String getNextToken() throws Exception {
+ while (!strHasMoreTokens())
+ if (setStringNextLine() == null)
+ return null;
+ return nextStrToken();
+ }
+
+ /**
+ *
+ * first checks to see if the next token is an unquoted
+ * control code, and if so, returns null
+ *
+ * @return next data token or null
+ * @throws Exception
+ */
+ @Override
+ public String getNextDataToken() throws Exception {
+ String str = peekToken();
+ if (str == null)
+ return null;
+ if (wasUnQuoted)
+ if (str.charAt(0) == '_' || str.startsWith("loop_")
+ || str.startsWith("data_")
+ || str.startsWith("stop_")
+ || str.startsWith("global_"))
+ return null;
+ return getTokenPeeked();
+ }
+
+ /**
+ * Just look at the next token, reading further lines if the buffer is
+ * exhausted. Saves it for retrieval using getTokenPeeked().
+ * 
+ * @return next token or null if EOF
+ * @throws Exception passed on from the tokenizer
+ */
+ @Override
+ public String peekToken() throws Exception {
+   while (!strHasMoreTokens())
+     if (setStringNextLine() == null)
+       return null; // end of input: strPeeked and ichPeeked keep their previous values
+   final int ichSaved = ich;
+   strPeeked = nextStrToken();
+   ichPeeked = ich; // where parsing resumes once the token is actually taken
+   ich = ichSaved; // rewind, because peeking must not consume
+   return strPeeked;
+ }
+
+ /**
+ * Consumes the token found by the last successful peekToken() by moving the parse position past it.
+ * @return the token last acquired; may be null
+ */
+ @Override
+ public String getTokenPeeked() {
+ ich = ichPeeked;
+ return strPeeked;
+ }
+
+ /**
+ * Used especially for data that might be multi-line data that
+ * might have unwanted white space at start or end.
+ * 
+ * @param str the text to trim; must not be null
+ * @return str without leading/trailing characters that PT.isWhitespace() accepts; the interior is untouched
+ */
+ @Override
+ public String fullTrim(String str) {
+   int first = 0;
+   int last = str.length() - 1;
+   while (first <= last && PT.isWhitespace(str.charAt(first)))
+     first++;
+   while (last > first && PT.isWhitespace(str.charAt(last)))
+     last--;
+   return str.substring(first, last + 1);
+ }
+
+ private final static String grABC =
+   "ABX\u0394E\u03A6\u0393H" // ABCDEFGH
+   + "I_K\u039BMNO\u03A0" // I_KLMNOP
+   + "\u0398P\u03A3TY_\u03A9\u039E\u03A8Z"; // QRSTU_WXYZ
+ private final static String grabc =
+   "\u03B1\u03B2\u03C7\u03B4\u03B5\u03C6\u03B3\u03B7" // abcdefgh
+   + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop
+   + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C8\u03B6"; // qrstu_wxyz
+
+ /**
+ * Replaces each backslash-plus-letter code with the Greek letter defined for it
+ * in the IUCr CIF markup table; any other backslash code becomes "_". See
+ * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup
+ * 
+ * @param data text that may contain markup codes; may be null
+ * @return cleaned string (null for null input); a final lone backslash is kept
+ */
+ @Override
+ public String toUnicode(String data) {
+   if (data == null)
+     return null;
+   // A backslash in last position has no code letter, so the loop stops there. A
+   // replacement never contains a backslash, so the search resumes just past it.
+   int pt = data.indexOf('\\');
+   while (pt >= 0 && pt + 1 < data.length()) {
+     char c = data.charAt(pt + 1);
+     String ch = (c >= 'A' && c <= 'Z' ? grABC.substring(c - 'A', c - 'A' + 1)
+         : c >= 'a' && c <= 'z' ? grabc.substring(c - 'a', c - 'a' + 1) : "_");
+     data = data.substring(0, pt) + ch + data.substring(pt + 2);
+     pt = data.indexOf('\\', pt + 1);
+   }
+   return data;
+ }
+
+ /**
+ * Reads the key names of a loop and maps them onto a list of known fields.
+ * With fields non-null, on return fieldOf[i] is the loop column holding fields[i]
+ * (or NONE) and propertyOf[col] is the index in fields[] of the key in loop
+ * column col (or NONE); names are matched after fixKey() via the static htFields map.
+ * 
+ * Alternatively, if fields is null, a private array (at most 100 keys) is filled, in order,
+ * with key data, for cases such as matrices where there are too many possibilities
+ * to list and the key name itself contains the x-y information that we need.
+ * @return fields.length, or 0 if fields is null; see getFieldCount() for the number of keys read
+ */
+ @Override
+ public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception {
+ int propertyCount = 0;
+ if (fields == null) {
+ // for reading full list of keys, as for matrices
+ this.fields = new String[100];
+ } else {
+ if (!htFields.containsKey(fields[0])) // the shared map is filled only the first time this fields[0] is seen
+ for (int i = fields.length; --i >= 0;)
+ htFields.put(fields[i], Integer.valueOf(i));
+ for (int i = fields.length; --i >= 0;)
+ fieldOf[i] = NONE; // NONE is not declared in this class; presumably inherited from GenericCifDataParser
+ propertyCount = fields.length;
+ }
+ fieldCount = 0;
+ while (true) {
+ String str = peekToken();
+ if (str == null) {
+ // we are PREMATURELY done; reset
+ fieldCount = 0;
+ break;
+ }
+ // the list of key names ends at the first token that does not start with an underscore
+ if (str.charAt(0) != '_')
+ break;
+ 
+ int pt = fieldCount++;
+ str = fixKey(getTokenPeeked());
+ if (fields == null) {
+ // just make a linear model, saving the list
+ this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str;
+ continue;
+ }
+ Integer iField = htFields.get(str);
+ int i = (iField == null ? NONE : iField.intValue());
+ if ((propertyOf[pt] = i) != NONE)
+ fieldOf[i] = pt;
+ }
+ if (fieldCount > 0)
+ loopData = new String[fieldCount];
+ return propertyCount;
+ }
+
+ @Override
+ public String fixKey(String key) {
+ // PRELIMINARY -- BilBao _magnetic
+ // PRELIMINARY -- Jana2006
+ return (PT.rep(
+ key.startsWith("_magnetic") ? key.substring(9)
+ : key.startsWith("_jana") ? key.substring(5)
+ : key, ".", "_").toLowerCase());
+ }
+
+ //////////////////// private methods ////////////////////
+
+
+ /**
+ * Makes str the parse buffer and restarts tokenizing at its first character.
+ * @param str the new buffer; may be null at end of input
+ */
+ private void setString(String str) {
+   line = str;
+   this.str = str;
+   ich = 0;
+   cch = (str != null ? str.length() : 0);
+ }
+
+ /*
+ * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
+ *
+ * 17. The special sequence of end-of-line followed
+ * immediately by a semicolon in column one (denoted "<eol>;")
+ * may also be used as a delimiter at the beginning and end
+ * of a character string comprising a data value. The complete
+ * bounded string is called a text field, and may be used to
+ * convey multi-line values. The end-of-line associated with
+ * the closing semicolon does not form part of the data value.
+ * Within a multi-line text field, leading white space within
+ * text lines must be retained as part of the data value; trailing
+ * white space on a line may however be elided.
+ *
+ * 18. A text field delimited by the <eol>; digraph may not
+ * include a semicolon at the start of a line of text as
+ * part of its value.
+ *
+ * 20. For example, the data value foo may be expressed
+ * equivalently as an unquoted string foo, as a quoted
+ * string 'foo' or as a text field
+ *
+ *;foo
+ *;
+ *
+ * By contrast the value of the text field
+ *
+ *; foo
+ * bar
+ *;
+ *
+ * is foo<eol> bar (where <eol> represents an end-of-line);
+ * the embedded space characters are significant.
+ *
+ *
+ * I (BH) note, however, that we sometimes have:
+ *
+ * _some_name
+ * ;
+ * the name here
+ * ;
+ *
+ * so this should actually be
+ *
+ * ;the name here
+ * ;
+ *
+ * for this, we use fullTrim();
+ *
+ */
+
+ /**
+ * Sets the string for parsing to be the next line. If that line starts
+ * with ';', the buffer is extended to hold the whole multi-line text
+ * field, with \1 in place of the opening and closing ';' so that
+ * nextStrToken() handles it as a quoted string; whatever follows the
+ * closing ';' on its line stays in the buffer. A line starting with
+ * "###non-st#" is parsed from just after that prefix.
+ * 
+ * @return the new buffer (possibly empty), or null if EOF
+ * @throws Exception declared for callers; readLine() itself reports failures as EOF
+ */
+ private String setStringNextLine() throws Exception {
+   setString(readLine());
+   if (line == null || line.length() == 0)
+     return line;
+   if (line.charAt(0) != ';') {
+     if (str.startsWith("###non-st#"))
+       ich = 10;
+     return line;
+   }
+   // text field: collect it in a builder so that long fields are not re-copied on every line
+   StringBuilder sb = new StringBuilder().append('\1').append(line.substring(1)).append('\n');
+   while (readLine() != null) {
+     if (line.startsWith(";")) {
+       // remove trailing <eol> only, and attach rest of next line
+       sb.setLength(sb.length() - 1);
+       sb.append('\1').append(line.substring(1));
+       break;
+     }
+     sb.append(line).append('\n');
+   }
+   setString(sb.toString());
+   return str;
+ }
+
+ /**
+ * Skips spaces and tabs from the current position in the buffer.
+ * @return TRUE if there are more tokens in the line buffer, that is, if a
+ *         non-blank character remains and it is not '#'
+ */
+ private boolean strHasMoreTokens() {
+   if (str != null)
+     for (char c; ich < cch; ++ich)
+       if ((c = str.charAt(ich)) != ' ' && c != '\t')
+         return (c != '#'); // '#' means the rest of the buffer is a comment
+   return false;
+ }
+
+ /**
+ * Assumes that strHasMoreTokens() has been called and that
+ * ich is pointing at a non-white character. Also sets
+ * boolean wasUnQuoted, because we need to know if we should 
+ * be checking for a control keyword. 'loop_' is different from just 
+ * loop_ without the quotes. Quote delimiters are ', " and \1.
+ *
+ * @return null if no more tokens, nullString (unless it is null) for a lone '.' or '?', or next token 
+ */
+ private String nextStrToken() {
+ if (ich == cch)
+ return null;
+ int ichStart = ich;
+ char ch = str.charAt(ichStart);
+ if (ch != '\'' && ch != '"' && ch != '\1') {
+ wasUnQuoted = true;
+ while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t') // unquoted token runs to the next space or tab
+ ++ich;
+ if (ich == ichStart + 1) // single-character token: may be the '.' or '?' placeholder
+ if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?'))
+ return nullString;
+ String s = str.substring(ichStart, ich);
+ return s;
+ }
+ wasUnQuoted = false;
+ char chOpeningQuote = ch;
+ boolean previousCharacterWasQuote = false;
+ while (++ich < cch) {
+ ch = str.charAt(ich);
+ if (previousCharacterWasQuote && (ch == ' ' || ch == '\t')) // only quote + white space closes the string
+ break;
+ previousCharacterWasQuote = (ch == chOpeningQuote);
+ }
+ if (ich == cch) {
+ if (previousCharacterWasQuote) // close quote was last char of string
+ return str.substring(ichStart + 1, ich - 1);
+ // reached the end of the string without finding closing quote; the opening quote is kept in the token
+ return str.substring(ichStart, ich);
+ }
+ ++ich; // throw away the last white character
+ return str.substring(ichStart + 1, ich - 2); // ich - 2 is the index of the closing quote
+ }
+
+
}
\ No newline at end of file