X-Git-Url: http://source.jalview.org/gitweb/?p=jalviewjs.git;a=blobdiff_plain;f=src%2Fjavajs%2Futil%2FCifDataParser.java;fp=src%2Fjavajs%2Futil%2FCifDataParser.java;h=d0cb3fda9c8f9390a471c210edd49b710bb7ecfd;hp=39317c7fb6037ade9b0ec87734a37b8b0c55a54d;hb=b9b7a352eee79b7764c3b09c9d19663075061d8c;hpb=7301a2415adab88038b291fc54caeeb3a5a47a44 diff --git a/src/javajs/util/CifDataParser.java b/src/javajs/util/CifDataParser.java index 39317c7..d0cb3fd 100644 --- a/src/javajs/util/CifDataParser.java +++ b/src/javajs/util/CifDataParser.java @@ -1,633 +1,633 @@ -package javajs.util; - -import java.io.BufferedReader; - -import java.util.Hashtable; - -import java.util.Map; - -import javajs.api.GenericCifDataParser; -import javajs.api.GenericLineReader; - - - -public class CifDataParser implements GenericCifDataParser { - /** - * - * A special tokenizer class for dealing with quoted strings in CIF files. - * - * Greek letters implemented in Jmol 13.3.9 and only for - * titles and space groups. All other mark ups ignored. - * - *

- * regarding the treatment of single quotes vs. primes in - * cif file, PMR wrote: - *

- *

- * * There is a formal grammar for CIF - * (see http://www.iucr.org/iucr-top/cif/index.html) - * which confirms this. The textual explanation is - *

- *

- * 14. Matching single or double quote characters (' or ") may - * be used to bound a string representing a non-simple data value - * provided the string does not extend over more than one line. - *

- *

- * 15. Because data values are invariably separated from other - * tokens in the file by white space, such a quote-delimited - * character string may contain instances of the character used - * to delimit the string provided they are not followed by white - * space. For example, the data item - * - * _example 'a dog's life' - * - * is legal; the data value is a dog's life. - *

- *

- * [PMR - the terminating character(s) are quote+whitespace. - * That would mean that: - * - * _example 'Jones' life' - * - * would be an error - *

- *

- * The CIF format was developed in that late 1980's under the aegis of the - * International Union of Crystallography (I am a consultant to the COMCIFs - * committee). It was ratified by the Union and there have been several - * workshops. mmCIF is an extension of CIF which includes a relational - * structure. The formal publications are: - *

- *

- * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data - * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333. - * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic - * Information File (CIF): A New Standard Archive File for Crystallography", - * Acta Cryst., A47, 655-685. - * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed - * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508. - *

- */ - private GenericLineReader reader; - private BufferedReader br; - - private String line; - private String str; - private int ich; - private int cch; - private boolean wasUnQuoted; - private String strPeeked; - private int ichPeeked; - private int fieldCount; - private String[] loopData; - private SB fileHeader = new SB(); - private boolean isHeader = true; - private String nullString = "\0"; - - /** - * Set the string value of what is returned for "." and "?" - * - * @param nullString null here returns "." and "?"; default is "\0" - * - */ - public void setNullValue(String nullString) { - this.nullString = nullString; - } - - /** - * A global, static map that contains field information. The assumption is that - * if we read a set of fields for, say, atom_site, once in a lifetime, then - * that should be good forever. Those are static lists. Or should be.... - */ - private static Map htFields = new Hashtable(); - - //////////////////////////////////////////////////////////////// - // special tokenizer class - //////////////////////////////////////////////////////////////// - - public CifDataParser() { - // for reflection - } - - private String[] fields; - - @Override - public String getLoopData(int i) { - return loopData[i]; - } - - @Override - public int getFieldCount() { - return fieldCount; - } - - @Override - public String getField(int i) { - return fields[i]; - } - - /** - * A Chemical Information File data parser. - * - * Should be called immediately upon construction. - * - * Two options; one of reader or br should be null, or reader will be - * ignored. Just simpler this way... - * - * @param reader Anything that can deliver a line of text or null - * @param br A standard BufferedReader. - * - */ - @Override - public CifDataParser set(GenericLineReader reader, BufferedReader br) { - this.reader = reader; - this.br = br; - return this; - } - - /** - * - * @return commented-out section at the start of a CIF file. - * - */ - @Override - public String getFileHeader() { - return fileHeader.toString(); - } - - - /** - * Parses all CIF data for a reader defined in the constructor - * into a standard Map structure and close the BufferedReader if - * it exists. - * - * @return Hashtable of models Vector of Hashtable data - */ - @Override - public Map getAllCifData() { - line = ""; - String key; - Map data = null; - Map allData = new Hashtable(); - Lst> models = new Lst>(); - allData.put("models", models); - try { - while ((key = getNextToken()) != null) { - if (key.startsWith("global_") || key.startsWith("data_")) { - models.addLast(data = new Hashtable()); - data.put("name", key); - continue; - } - if (key.startsWith("loop_")) { - getAllCifLoopData(data); - continue; - } - if (key.charAt(0) != '_') { - System.out.println("CIF ERROR ? should be an underscore: " + key); - } else { - String value = getNextToken(); - if (value == null) { - System.out.println("CIF ERROR ? end of file; data missing: " + key); - } else { - data.put(fixKey(key), value); - } - } - } - } catch (Exception e) { - // ? - } - try { - if (br != null) - br.close(); - } catch (Exception e) { - // ? - } - return allData; - } - - /** - * create our own list of keywords and for each one create a list - * of data associated with that keyword. For example, a list of all - * x coordinates, then a list of all y coordinates, etc. - * - * @param data - * @throws Exception - */ - @SuppressWarnings("unchecked") - private void getAllCifLoopData(Map data) throws Exception { - String key; - Lst keyWords = new Lst(); - while ((key = peekToken()) != null && key.charAt(0) == '_') { - key = fixKey(getTokenPeeked()); - keyWords.addLast(key); - data.put(key, new Lst()); - } - fieldCount = keyWords.size(); - if (fieldCount == 0) - return; - loopData = new String[fieldCount]; - while (getData()) - for (int i = 0; i < fieldCount; i++) - ((Lst)data.get(keyWords.get(i))).addLast(loopData[i]); - } - - @Override - public String readLine() { - try { - line = (reader == null ? br.readLine() : reader.readNextLine()); - if (line == null) - return null; - if (isHeader) { - if (line.startsWith("#")) - fileHeader.append(line).appendC('\n'); - else - isHeader = false; - } - return line; - } catch (Exception e) { - return null; - } - } - - /** - * The work horse; a general reader for loop data. - * Fills loopData with fieldCount fields. - * - * @return false if EOF - * @throws Exception - */ - @Override - public boolean getData() throws Exception { - // line is already present, and we leave with the next line to parse - for (int i = 0; i < fieldCount; ++i) - if ((loopData[i] = getNextDataToken()) == null) - return false; - return (fieldCount > 0); - } - - /** - * - * Skips all associated loop data. (Skips to next control word.) - * - * @throws Exception - */ - @Override - public String skipLoop(boolean doReport) throws Exception { - String str; - SB ret = (doReport ? new SB() : null); - int n = 0; - while ((str = peekToken()) != null && str.charAt(0) == '_') { - if (ret != null) - ret.append(str).append("\n"); - getTokenPeeked(); - n++; - } - int m = 0; - while ((str = getNextDataToken()) != null) { - if (ret == null) - continue; - ret.append(str).append(" "); - if ((++m % n) == 0) - ret.append("\n"); - } - return (ret == null ? null : ret.toString()); - } - - /** - * - * @return the next token of any kind, or null - * @throws Exception - */ - @Override - public String getNextToken() throws Exception { - while (!strHasMoreTokens()) - if (setStringNextLine() == null) - return null; - return nextStrToken(); - } - - /** - * - * first checks to see if the next token is an unquoted - * control code, and if so, returns null - * - * @return next data token or null - * @throws Exception - */ - @Override - public String getNextDataToken() throws Exception { - String str = peekToken(); - if (str == null) - return null; - if (wasUnQuoted) - if (str.charAt(0) == '_' || str.startsWith("loop_") - || str.startsWith("data_") - || str.startsWith("stop_") - || str.startsWith("global_")) - return null; - return getTokenPeeked(); - } - - /** - * Just look at the next token. Saves it for retrieval - * using getTokenPeeked() - * - * @return next token or null if EOF - * @throws Exception - */ - @Override - public String peekToken() throws Exception { - while (!strHasMoreTokens()) - if (setStringNextLine() == null) - return null; - int ich = this.ich; - strPeeked = nextStrToken(); - ichPeeked= this.ich; - this.ich = ich; - return strPeeked; - } - - /** - * - * @return the token last acquired; may be null - */ - @Override - public String getTokenPeeked() { - ich = ichPeeked; - return strPeeked; - } - - /** - * Used especially for data that might be multi-line data that - * might have unwanted white space at start or end. - * - * @param str - * @return str without any leading/trailing white space, and no '\n' - */ - @Override - public String fullTrim(String str) { - int pt0 = -1; - int pt1 = str.length(); - while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) { - } - while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) { - } - return str.substring(pt0, pt1 + 1); - } - - private final static String grABC = - "ABX\u0394E\u03A6\u0393H" // ABCDEFGH - + "I_K\u039BMNO\u03A0" // I_KLMNOP - + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ - private final static String grabc = - "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh - + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop - + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz - - /** - * Only translating the basic Greek set here, not all the other stuff. See - * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup - * - * @param data - * @return cleaned string - */ - @Override - public String toUnicode(String data) { - int pt; - try { - while ((pt = data.indexOf('\\')) >= 0) { - int c = data.charAt(pt + 1); - String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64) - : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_"); - data = data.substring(0, pt) + ch + data.substring(pt + 2); - } - } catch (Exception e) { - // ignore - } - - return data; - } - - /** - * Passing an array of field names, this method fills two arrays. - * The first, fieldOf, identifies - * It does this by first creating a map of names to their indices in fields[]. - * - * Alternatively, if fields is null, then a private array is filled, in order, - * with key data. This is used in cases such as matrices for which there are simply - * too many possibilities to list, and the key name itself contains the x-y - * information that we need. - * - */ - @Override - public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception { - int propertyCount = 0; - if (fields == null) { - // for reading full list of keys, as for matrices - this.fields = new String[100]; - } else { - if (!htFields.containsKey(fields[0])) - for (int i = fields.length; --i >= 0;) - htFields.put(fields[i], Integer.valueOf(i)); - for (int i = fields.length; --i >= 0;) - fieldOf[i] = NONE; - propertyCount = fields.length; - } - fieldCount = 0; - while (true) { - String str = peekToken(); - if (str == null) { - // we are PREMATURELY done; reset - fieldCount = 0; - break; - } - // end of the loop is a new token starting with underscore - if (str.charAt(0) != '_') - break; - - int pt = fieldCount++; - str = fixKey(getTokenPeeked()); - if (fields == null) { - // just make a linear model, saving the list - this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str; - continue; - } - Integer iField = htFields.get(str); - int i = (iField == null ? NONE : iField.intValue()); - if ((propertyOf[pt] = i) != NONE) - fieldOf[i] = pt; - } - if (fieldCount > 0) - loopData = new String[fieldCount]; - return propertyCount; - } - - @Override - public String fixKey(String key) { - // PRELIMINARY -- BilBao _magnetic - // PRELIMINARY -- Jana2006 - return (PT.rep( - key.startsWith("_magnetic") ? key.substring(9) - : key.startsWith("_jana") ? key.substring(5) - : key, ".", "_").toLowerCase()); - } - - //////////////////// private methods //////////////////// - - - /** - * sets a string to be parsed from the beginning - * - * @param str - */ - private void setString(String str) { - this.str = line = str; - cch = (str == null ? 0 : str.length()); - ich = 0; - } - - /* - * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax - * - * 17. The special sequence of end-of-line followed - * immediately by a semicolon in column one (denoted ";") - * may also be used as a delimiter at the beginning and end - * of a character string comprising a data value. The complete - * bounded string is called a text field, and may be used to - * convey multi-line values. The end-of-line associated with - * the closing semicolon does not form part of the data value. - * Within a multi-line text field, leading white space within - * text lines must be retained as part of the data value; trailing - * white space on a line may however be elided. - * - * 18. A text field delimited by the ; digraph may not - * include a semicolon at the start of a line of text as - * part of its value. - * - * 20. For example, the data value foo may be expressed - * equivalently as an unquoted string foo, as a quoted - * string 'foo' or as a text field - * - *;foo - *; - * - * By contrast the value of the text field - * - *; foo - * bar - *; - * - * is foo bar (where represents an end-of-line); - * the embedded space characters are significant. - * - * - * I (BH) note, however, that we sometimes have: - * - * _some_name - * ; - * the name here - * ; - * - * so this should actually be - * - * ;the name here - * ; - * - * for this, we use fullTrim(); - * - */ - - /** - * - * sets the string for parsing to be from the next line - * when the token buffer is empty, and if ';' is at the - * beginning of that line, extends the string to include - * that full multiline string. Uses \1 to indicate that - * this is a special quotation. - * - * @return the next line or null if EOF - * @throws Exception - */ - private String setStringNextLine() throws Exception { - setString(readLine()); - if (line == null || line.length() == 0) - return line; - if (line.charAt(0) != ';') { - if (str.startsWith("###non-st#")) - ich = 10; - return line; - } - ich = 1; - String str = '\1' + line.substring(1) + '\n'; - while (readLine() != null) { - if (line.startsWith(";")) { - // remove trailing only, and attach rest of next line - str = str.substring(0, str.length() - 1) - + '\1' + line.substring(1); - break; - } - str += line + '\n'; - } - setString(str); - return str; - } - - /** - * @return TRUE if there are more tokens in the line buffer - * - */ - private boolean strHasMoreTokens() { - if (str == null) - return false; - char ch = '#'; - while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t')) - ++ich; - return (ich < cch && ch != '#'); - } - - /** - * assume that hasMoreTokens() has been called and that - * ich is pointing at a non-white character. Also sets - * boolean wasUnQuoted, because we need to know if we should - * be checking for a control keyword. 'loop_' is different from just - * loop_ without the quotes. - * - * @return null if no more tokens, "\0" if '.' or '?', or next token - */ - private String nextStrToken() { - if (ich == cch) - return null; - int ichStart = ich; - char ch = str.charAt(ichStart); - if (ch != '\'' && ch != '"' && ch != '\1') { - wasUnQuoted = true; - while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t') - ++ich; - if (ich == ichStart + 1) - if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?')) - return nullString; - String s = str.substring(ichStart, ich); - return s; - } - wasUnQuoted = false; - char chOpeningQuote = ch; - boolean previousCharacterWasQuote = false; - while (++ich < cch) { - ch = str.charAt(ich); - if (previousCharacterWasQuote && (ch == ' ' || ch == '\t')) - break; - previousCharacterWasQuote = (ch == chOpeningQuote); - } - if (ich == cch) { - if (previousCharacterWasQuote) // close quote was last char of string - return str.substring(ichStart + 1, ich - 1); - // reached the end of the string without finding closing ' - return str.substring(ichStart, ich); - } - ++ich; // throw away the last white character - return str.substring(ichStart + 1, ich - 2); - } - - +package javajs.util; + +import java.io.BufferedReader; + +import java.util.Hashtable; + +import java.util.Map; + +import javajs.api.GenericCifDataParser; +import javajs.api.GenericLineReader; + + + +public class CifDataParser implements GenericCifDataParser { + /** + * + * A special tokenizer class for dealing with quoted strings in CIF files. + * + * Greek letters implemented in Jmol 13.3.9 and only for + * titles and space groups. All other mark ups ignored. + * + *

+ * regarding the treatment of single quotes vs. primes in + * cif file, PMR wrote: + *

+ *

+ * * There is a formal grammar for CIF + * (see http://www.iucr.org/iucr-top/cif/index.html) + * which confirms this. The textual explanation is + *

+ *

+ * 14. Matching single or double quote characters (' or ") may + * be used to bound a string representing a non-simple data value + * provided the string does not extend over more than one line. + *

+ *

+ * 15. Because data values are invariably separated from other + * tokens in the file by white space, such a quote-delimited + * character string may contain instances of the character used + * to delimit the string provided they are not followed by white + * space. For example, the data item + * + * _example 'a dog's life' + * + * is legal; the data value is a dog's life. + *

+ *

+ * [PMR - the terminating character(s) are quote+whitespace. + * That would mean that: + * + * _example 'Jones' life' + * + * would be an error + *

+ *

+ * The CIF format was developed in that late 1980's under the aegis of the + * International Union of Crystallography (I am a consultant to the COMCIFs + * committee). It was ratified by the Union and there have been several + * workshops. mmCIF is an extension of CIF which includes a relational + * structure. The formal publications are: + *

+ *

+ * Hall, S. R. (1991). "The STAR File: A New Format for Electronic Data + * Transfer and Archiving", J. Chem. Inform. Comp. Sci., 31, 326-333. + * Hall, S. R., Allen, F. H. and Brown, I. D. (1991). "The Crystallographic + * Information File (CIF): A New Standard Archive File for Crystallography", + * Acta Cryst., A47, 655-685. + * Hall, S.R. & Spadaccini, N. (1994). "The STAR File: Detailed + * Specifications," J. Chem. Info. Comp. Sci., 34, 505-508. + *

+ */ + private GenericLineReader reader; + private BufferedReader br; + + private String line; + private String str; + private int ich; + private int cch; + private boolean wasUnQuoted; + private String strPeeked; + private int ichPeeked; + private int fieldCount; + private String[] loopData; + private SB fileHeader = new SB(); + private boolean isHeader = true; + private String nullString = "\0"; + + /** + * Set the string value of what is returned for "." and "?" + * + * @param nullString null here returns "." and "?"; default is "\0" + * + */ + public void setNullValue(String nullString) { + this.nullString = nullString; + } + + /** + * A global, static map that contains field information. The assumption is that + * if we read a set of fields for, say, atom_site, once in a lifetime, then + * that should be good forever. Those are static lists. Or should be.... + */ + private static Map htFields = new Hashtable(); + + //////////////////////////////////////////////////////////////// + // special tokenizer class + //////////////////////////////////////////////////////////////// + + public CifDataParser() { + // for reflection + } + + private String[] fields; + + @Override + public String getLoopData(int i) { + return loopData[i]; + } + + @Override + public int getFieldCount() { + return fieldCount; + } + + @Override + public String getField(int i) { + return fields[i]; + } + + /** + * A Chemical Information File data parser. + * + * Should be called immediately upon construction. + * + * Two options; one of reader or br should be null, or reader will be + * ignored. Just simpler this way... + * + * @param reader Anything that can deliver a line of text or null + * @param br A standard BufferedReader. + * + */ + @Override + public CifDataParser set(GenericLineReader reader, BufferedReader br) { + this.reader = reader; + this.br = br; + return this; + } + + /** + * + * @return commented-out section at the start of a CIF file. + * + */ + @Override + public String getFileHeader() { + return fileHeader.toString(); + } + + + /** + * Parses all CIF data for a reader defined in the constructor + * into a standard Map structure and close the BufferedReader if + * it exists. + * + * @return Hashtable of models Vector of Hashtable data + */ + @Override + public Map getAllCifData() { + line = ""; + String key; + Map data = null; + Map allData = new Hashtable(); + Lst> models = new Lst>(); + allData.put("models", models); + try { + while ((key = getNextToken()) != null) { + if (key.startsWith("global_") || key.startsWith("data_")) { + models.addLast(data = new Hashtable()); + data.put("name", key); + continue; + } + if (key.startsWith("loop_")) { + getAllCifLoopData(data); + continue; + } + if (key.charAt(0) != '_') { + System.out.println("CIF ERROR ? should be an underscore: " + key); + } else { + String value = getNextToken(); + if (value == null) { + System.out.println("CIF ERROR ? end of file; data missing: " + key); + } else { + data.put(fixKey(key), value); + } + } + } + } catch (Exception e) { + // ? + } + try { + if (br != null) + br.close(); + } catch (Exception e) { + // ? + } + return allData; + } + + /** + * create our own list of keywords and for each one create a list + * of data associated with that keyword. For example, a list of all + * x coordinates, then a list of all y coordinates, etc. + * + * @param data + * @throws Exception + */ + @SuppressWarnings("unchecked") + private void getAllCifLoopData(Map data) throws Exception { + String key; + Lst keyWords = new Lst(); + while ((key = peekToken()) != null && key.charAt(0) == '_') { + key = fixKey(getTokenPeeked()); + keyWords.addLast(key); + data.put(key, new Lst()); + } + fieldCount = keyWords.size(); + if (fieldCount == 0) + return; + loopData = new String[fieldCount]; + while (getData()) + for (int i = 0; i < fieldCount; i++) + ((Lst)data.get(keyWords.get(i))).addLast(loopData[i]); + } + + @Override + public String readLine() { + try { + line = (reader == null ? br.readLine() : reader.readNextLine()); + if (line == null) + return null; + if (isHeader) { + if (line.startsWith("#")) + fileHeader.append(line).appendC('\n'); + else + isHeader = false; + } + return line; + } catch (Exception e) { + return null; + } + } + + /** + * The work horse; a general reader for loop data. + * Fills loopData with fieldCount fields. + * + * @return false if EOF + * @throws Exception + */ + @Override + public boolean getData() throws Exception { + // line is already present, and we leave with the next line to parse + for (int i = 0; i < fieldCount; ++i) + if ((loopData[i] = getNextDataToken()) == null) + return false; + return (fieldCount > 0); + } + + /** + * + * Skips all associated loop data. (Skips to next control word.) + * + * @throws Exception + */ + @Override + public String skipLoop(boolean doReport) throws Exception { + String str; + SB ret = (doReport ? new SB() : null); + int n = 0; + while ((str = peekToken()) != null && str.charAt(0) == '_') { + if (ret != null) + ret.append(str).append("\n"); + getTokenPeeked(); + n++; + } + int m = 0; + while ((str = getNextDataToken()) != null) { + if (ret == null) + continue; + ret.append(str).append(" "); + if ((++m % n) == 0) + ret.append("\n"); + } + return (ret == null ? null : ret.toString()); + } + + /** + * + * @return the next token of any kind, or null + * @throws Exception + */ + @Override + public String getNextToken() throws Exception { + while (!strHasMoreTokens()) + if (setStringNextLine() == null) + return null; + return nextStrToken(); + } + + /** + * + * first checks to see if the next token is an unquoted + * control code, and if so, returns null + * + * @return next data token or null + * @throws Exception + */ + @Override + public String getNextDataToken() throws Exception { + String str = peekToken(); + if (str == null) + return null; + if (wasUnQuoted) + if (str.charAt(0) == '_' || str.startsWith("loop_") + || str.startsWith("data_") + || str.startsWith("stop_") + || str.startsWith("global_")) + return null; + return getTokenPeeked(); + } + + /** + * Just look at the next token. Saves it for retrieval + * using getTokenPeeked() + * + * @return next token or null if EOF + * @throws Exception + */ + @Override + public String peekToken() throws Exception { + while (!strHasMoreTokens()) + if (setStringNextLine() == null) + return null; + int ich = this.ich; + strPeeked = nextStrToken(); + ichPeeked= this.ich; + this.ich = ich; + return strPeeked; + } + + /** + * + * @return the token last acquired; may be null + */ + @Override + public String getTokenPeeked() { + ich = ichPeeked; + return strPeeked; + } + + /** + * Used especially for data that might be multi-line data that + * might have unwanted white space at start or end. + * + * @param str + * @return str without any leading/trailing white space, and no '\n' + */ + @Override + public String fullTrim(String str) { + int pt0 = -1; + int pt1 = str.length(); + while (++pt0 < pt1 && PT.isWhitespace(str.charAt(pt0))) { + } + while (--pt1 > pt0 && PT.isWhitespace(str.charAt(pt1))) { + } + return str.substring(pt0, pt1 + 1); + } + + private final static String grABC = + "ABX\u0394E\u03A6\u0393H" // ABCDEFGH + + "I_K\u039BMNO\u03A0" // I_KLMNOP + + "\u0398P\u03A3TY_\u03A9\u039E\u03A5Z"; // QRSTU_WXYZ + private final static String grabc = + "\u03B1\u03B2\u03C7\u03A4\u03A5\u03C6\u03B3\u03B7" // abcdefgh + + "\u03B9_\u03BA\u03BB\u03BC\u03BD\u03BF\u03C0" // i_klmnop + + "\u03B8\u03C1\u03C3\u03C4\u03C5_\u03C9\u03BE\u03C5\u03B6"; // qrstu_wxyz + + /** + * Only translating the basic Greek set here, not all the other stuff. See + * http://www.iucr.org/resources/cif/spec/version1.1/semantics#markup + * + * @param data + * @return cleaned string + */ + @Override + public String toUnicode(String data) { + int pt; + try { + while ((pt = data.indexOf('\\')) >= 0) { + int c = data.charAt(pt + 1); + String ch = (c >= 65 && c <= 90 ? grABC.substring(c - 65, c - 64) + : c >= 97 && c <= 122 ? grabc.substring(c - 97, c - 96) : "_"); + data = data.substring(0, pt) + ch + data.substring(pt + 2); + } + } catch (Exception e) { + // ignore + } + + return data; + } + + /** + * Passing an array of field names, this method fills two arrays. + * The first, fieldOf, identifies + * It does this by first creating a map of names to their indices in fields[]. + * + * Alternatively, if fields is null, then a private array is filled, in order, + * with key data. This is used in cases such as matrices for which there are simply + * too many possibilities to list, and the key name itself contains the x-y + * information that we need. + * + */ + @Override + public int parseLoopParameters(String[] fields, int[] fieldOf, int[] propertyOf) throws Exception { + int propertyCount = 0; + if (fields == null) { + // for reading full list of keys, as for matrices + this.fields = new String[100]; + } else { + if (!htFields.containsKey(fields[0])) + for (int i = fields.length; --i >= 0;) + htFields.put(fields[i], Integer.valueOf(i)); + for (int i = fields.length; --i >= 0;) + fieldOf[i] = NONE; + propertyCount = fields.length; + } + fieldCount = 0; + while (true) { + String str = peekToken(); + if (str == null) { + // we are PREMATURELY done; reset + fieldCount = 0; + break; + } + // end of the loop is a new token starting with underscore + if (str.charAt(0) != '_') + break; + + int pt = fieldCount++; + str = fixKey(getTokenPeeked()); + if (fields == null) { + // just make a linear model, saving the list + this.fields[propertyOf[pt] = fieldOf[pt] = pt] = str; + continue; + } + Integer iField = htFields.get(str); + int i = (iField == null ? NONE : iField.intValue()); + if ((propertyOf[pt] = i) != NONE) + fieldOf[i] = pt; + } + if (fieldCount > 0) + loopData = new String[fieldCount]; + return propertyCount; + } + + @Override + public String fixKey(String key) { + // PRELIMINARY -- BilBao _magnetic + // PRELIMINARY -- Jana2006 + return (PT.rep( + key.startsWith("_magnetic") ? key.substring(9) + : key.startsWith("_jana") ? key.substring(5) + : key, ".", "_").toLowerCase()); + } + + //////////////////// private methods //////////////////// + + + /** + * sets a string to be parsed from the beginning + * + * @param str + */ + private void setString(String str) { + this.str = line = str; + cch = (str == null ? 0 : str.length()); + ich = 0; + } + + /* + * http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax + * + * 17. The special sequence of end-of-line followed + * immediately by a semicolon in column one (denoted ";") + * may also be used as a delimiter at the beginning and end + * of a character string comprising a data value. The complete + * bounded string is called a text field, and may be used to + * convey multi-line values. The end-of-line associated with + * the closing semicolon does not form part of the data value. + * Within a multi-line text field, leading white space within + * text lines must be retained as part of the data value; trailing + * white space on a line may however be elided. + * + * 18. A text field delimited by the ; digraph may not + * include a semicolon at the start of a line of text as + * part of its value. + * + * 20. For example, the data value foo may be expressed + * equivalently as an unquoted string foo, as a quoted + * string 'foo' or as a text field + * + *;foo + *; + * + * By contrast the value of the text field + * + *; foo + * bar + *; + * + * is foo bar (where represents an end-of-line); + * the embedded space characters are significant. + * + * + * I (BH) note, however, that we sometimes have: + * + * _some_name + * ; + * the name here + * ; + * + * so this should actually be + * + * ;the name here + * ; + * + * for this, we use fullTrim(); + * + */ + + /** + * + * sets the string for parsing to be from the next line + * when the token buffer is empty, and if ';' is at the + * beginning of that line, extends the string to include + * that full multiline string. Uses \1 to indicate that + * this is a special quotation. + * + * @return the next line or null if EOF + * @throws Exception + */ + private String setStringNextLine() throws Exception { + setString(readLine()); + if (line == null || line.length() == 0) + return line; + if (line.charAt(0) != ';') { + if (str.startsWith("###non-st#")) + ich = 10; + return line; + } + ich = 1; + String str = '\1' + line.substring(1) + '\n'; + while (readLine() != null) { + if (line.startsWith(";")) { + // remove trailing only, and attach rest of next line + str = str.substring(0, str.length() - 1) + + '\1' + line.substring(1); + break; + } + str += line + '\n'; + } + setString(str); + return str; + } + + /** + * @return TRUE if there are more tokens in the line buffer + * + */ + private boolean strHasMoreTokens() { + if (str == null) + return false; + char ch = '#'; + while (ich < cch && ((ch = str.charAt(ich)) == ' ' || ch == '\t')) + ++ich; + return (ich < cch && ch != '#'); + } + + /** + * assume that hasMoreTokens() has been called and that + * ich is pointing at a non-white character. Also sets + * boolean wasUnQuoted, because we need to know if we should + * be checking for a control keyword. 'loop_' is different from just + * loop_ without the quotes. + * + * @return null if no more tokens, "\0" if '.' or '?', or next token + */ + private String nextStrToken() { + if (ich == cch) + return null; + int ichStart = ich; + char ch = str.charAt(ichStart); + if (ch != '\'' && ch != '"' && ch != '\1') { + wasUnQuoted = true; + while (ich < cch && (ch = str.charAt(ich)) != ' ' && ch != '\t') + ++ich; + if (ich == ichStart + 1) + if (nullString != null && (str.charAt(ichStart) == '.' || str.charAt(ichStart) == '?')) + return nullString; + String s = str.substring(ichStart, ich); + return s; + } + wasUnQuoted = false; + char chOpeningQuote = ch; + boolean previousCharacterWasQuote = false; + while (++ich < cch) { + ch = str.charAt(ich); + if (previousCharacterWasQuote && (ch == ' ' || ch == '\t')) + break; + previousCharacterWasQuote = (ch == chOpeningQuote); + } + if (ich == cch) { + if (previousCharacterWasQuote) // close quote was last char of string + return str.substring(ichStart + 1, ich - 1); + // reached the end of the string without finding closing ' + return str.substring(ichStart, ich); + } + ++ich; // throw away the last white character + return str.substring(ichStart + 1, ich - 2); + } + + } \ No newline at end of file