From 69f23b44e8ce05b33fca23ffb3774c285a23dd9b Mon Sep 17 00:00:00 2001
From: jprocter
Date: Thu, 12 Jan 2006 15:07:50 +0000
Subject: [PATCH] PIR file IO now preserves and parses Modeller style colon-separated fields

---
Review notes (this section is ignored when the patch is applied):
 * Re-flowed from a whitespace-collapsed copy. Indentation and the position
   of blank context lines are reconstructed; use --ignore-whitespace if the
   PIRFile.java hunks do not apply cleanly. Blob hashes are not recomputed.
 * ModellerDescription: start/end accessors and updateSequenceI() tolerate a
   missing start/end code instead of throwing NullPointerException.
 * ModellerDescription(SequenceI): the end code is now checked against the
   end code (was the start code).
 * parseDescription(): the tail field no longer repeats the resolution field.

 src/jalview/io/ModellerDescription.java |  424 +++++++++++++++++++++++++++++++
 src/jalview/io/PIRFile.java             |   38 +++-
 2 files changed, 452 insertions(+), 10 deletions(-)
 create mode 100755 src/jalview/io/ModellerDescription.java

diff --git a/src/jalview/io/ModellerDescription.java b/src/jalview/io/ModellerDescription.java
new file mode 100755
index 0000000..457a6ae
--- /dev/null
+++ b/src/jalview/io/ModellerDescription.java
@@ -0,0 +1,424 @@
+package jalview.io;
+
+import jalview.datamodel.SequenceI;
+import java.util.Vector;
+import com.stevesoft.pat.Regex;
+
+/**
+ * Translates between a String containing a set of colon-separated values
+ * on a single line, and sequence start/end and other properties.
+ * See PIRFile IO for its use.
+ */
+public class ModellerDescription
+{
+  /** Object types accepted in the first colon-separated field. */
+  final String[] seqTypes =
+      {
+      "sequence", "structure", "structureX", "structureN"};
+
+  /** Hashtable keys for the fields, indexed by the constants below. */
+  final String[] Fields =
+      {
+      "objectType", "objectId",
+      "startField", "startCode",
+      "endField", "endCode",
+      "description1", "description2",
+      "resolutionField", "tailField"};
+  final int TYPE = 0;
+  final int LOCALID = 1;
+  final int START = 2;
+  final int START_CHAIN = 3;
+  final int END = 4;
+  final int END_CHAIN = 5;
+  final int DESCRIPTION1 = 6;
+  final int DESCRIPTION2 = 7;
+  final int RESOLUTION = 8;
+  final int TAIL = 9;
+
+  /**
+   * 0 is free text or empty
+   * 1 is something that parses to an integer, or \@
+   */
+  final int Types[] =
+      {
+      0, 0, 1, 0, 1, 0, 0, 0, 0, 0
+  };
+
+  /** Placeholder written by getDescriptionLine() for each empty field. */
+  final char Padding[] =
+      {
+      ' ', ' ', ' ', '.', ' ', '.', '.', '.', '.', '.'
+  };
+
+  java.util.Hashtable fields = new java.util.Hashtable();
+
+  /** Creates a description holding only an empty tail field. */
+  ModellerDescription()
+  {
+    fields.put(Fields[TAIL], "");
+  }
+
+  /**
+   * A start or end residue code: the field text, plus its integer value
+   * when the text is numeric (val is null otherwise).
+   */
+  class resCode
+  {
+    Integer val;
+    String field;
+    resCode(String f, Integer v)
+    {
+      val = v;
+      field = f;
+    }
+
+    resCode(int v)
+    {
+      val = new Integer(v);
+      field = val.toString();
+    }
+  }
+
+  /**
+   * Checks a start/end field against the residue code pattern.
+   * @param field String
+   * @return null if the field does not match; otherwise a resCode whose val
+   * is null when the matched text is not an integer
+   */
+  private resCode validResidueCode(String field)
+  {
+    Integer val = null;
+    com.stevesoft.pat.Regex r = new Regex("\\s*((([-0-9]+).?)|FIRST|LAST|@)");
+
+    if (!r.search(field))
+    {
+      return null; // invalid
+    }
+    String value = r.stringMatched(3);
+    if (value == null)
+    {
+      value = r.stringMatched(1);
+    }
+    jalview.bin.Cache.log.debug("from '" + field + "' matched '" + value +
+                                "'");
+    try
+    {
+      val = Integer.valueOf(value);
+      return new resCode(field, val); // successful numeric extraction
+    }
+    catch (NumberFormatException e)
+    {
+      // matched text is not an integer (e.g. FIRST, LAST, @): keep the
+      // field text and report it without a numeric value
+    }
+    return new resCode(field, null);
+  }
+
+  /**
+   * Splits a description on ':' into a table of named fields.
+   * @param desc String
+   * @return the parsed fields if the first field is a known object type and
+   * the start/end fields are valid; otherwise a table holding only desc as
+   * the tail field
+   */
+  private java.util.Hashtable parseDescription(String desc)
+  {
+    java.util.Hashtable parsed = new java.util.Hashtable();
+    java.util.StringTokenizer st = new java.util.StringTokenizer(desc, ":");
+    String field;
+    int type = -1;
+    if (st.countTokens() > 0)
+    {
+      // parse colon-fields
+      int i = 0;
+      field = st.nextToken(":");
+      do
+      {
+        if (seqTypes[i].compareToIgnoreCase(field) == 0)
+        {
+          break;
+        }
+      }
+      while (++i < seqTypes.length);
+
+      if (i < seqTypes.length)
+      {
+        // valid seqType for modeller
+        type = i;
+        i = 1; // continue parsing fields
+        while (i < TAIL && st.hasMoreTokens())
+        {
+          field = st.nextToken(":");
+          // validate residue field value
+          if (Types[i] == 1)
+          {
+            resCode val = validResidueCode(field);
+            if (val != null)
+            {
+              parsed.put(Fields[i] + "num", val);
+            }
+            else
+            {
+              jalview.bin.Cache.log.debug(
+                  "Ignoring non-Modeller description: invalid integer-like field '" + field + "'");
+              type = -1; /* invalid field! - throw the FieldSet away */
+            }
+          }
+          parsed.put(Fields[i++], field);
+        }
+        if (i == TAIL && st.hasMoreTokens())
+        {
+          // slurp remaining fields, starting from the first unread token:
+          // the last token read above is already stored under its own key
+          StringBuffer tail = new StringBuffer(st.nextToken(":"));
+          while (st.hasMoreTokens())
+          {
+            tail.append(":").append(st.nextToken(":"));
+          }
+          parsed.put(Fields[TAIL], tail.toString());
+        }
+      }
+    }
+    if (type == -1)
+    {
+      // object is not a proper ModellerPIR object
+      parsed = new java.util.Hashtable();
+      parsed.put(Fields[TAIL], desc);
+    }
+    else
+    {
+      parsed.put(Fields[TYPE], seqTypes[type]);
+    }
+    return parsed;
+  }
+
+  /**
+   * Parses a description line; a null description is treated as empty.
+   * @param desc String
+   */
+  ModellerDescription(String desc)
+  {
+    if (desc == null)
+    {
+      desc = "";
+    }
+    fields = parseDescription(desc);
+  }
+
+  void setStartCode(int v)
+  {
+    resCode r;
+    fields.put(Fields[START] + "num", r = new resCode(v));
+    fields.put(Fields[START], r.field);
+  }
+
+  void setEndCode(int v)
+  {
+    resCode r;
+    fields.put(Fields[END] + "num", r = new resCode(v));
+    fields.put(Fields[END], r.field);
+  }
+
+  /**
+   * make a possibly updated modeller field line for the sequence object
+   * @param seq SequenceI
+   */
+  ModellerDescription(SequenceI seq)
+  {
+
+    if (seq.getDescription() != null)
+    {
+      fields = parseDescription(seq.getDescription());
+    }
+
+    if (isModellerFieldset())
+    {
+      // Set start and end before we update the type (in the case of a synthesized field set)
+      // Only a numeric code is rewritten; a missing field or a non-numeric
+      // code is left exactly as it was read.
+      resCode startCode = getStartCode();
+      if (startCode != null && startCode.val != null &&
+          startCode.val.intValue() != seq.getStart())
+      {
+        setStartCode(seq.getStart());
+      }
+
+      resCode endCode = getEndCode();
+      if (endCode != null && endCode.val != null &&
+          endCode.val.intValue() != seq.getEnd())
+      {
+        setEndCode(seq.getEnd());
+      }
+    }
+    else
+    {
+      // synthesize fields
+      setStartCode(seq.getStart());
+      setEndCode(seq.getEnd());
+      fields.put(Fields[LOCALID], seq.getName()); // this may be overwritten below...
+      // type - decide based on evidence of PDB database references - this also sets the local reference field
+      int t = 0; // sequence
+      if (seq.getDatasetSequence() != null &&
+          seq.getDatasetSequence().getDBRef() != null)
+      {
+        Vector dbr = seq.getDatasetSequence().getDBRef();
+        int i, j;
+        for (i = 0, j = dbr.size(); i < j; i++)
+        {
+          jalview.datamodel.DBRefEntry dref = (jalview.datamodel.DBRefEntry)
+              dbr.get(i);
+          if (dref != null)
+          {
+            // JBPNote PDB dbRefEntry needs properties to propagate onto ModellerField
+            // JBPNote Need to get info from the user about whether the sequence is the one being modelled, or if it is a template.
+            if ("PDB".equals(dref.getSource()))
+            {
+              fields.put(Fields[LOCALID], dref.getAccessionId());
+              t = 2;
+              break;
+            }
+          }
+        }
+      }
+      fields.put(Fields[TYPE], seqTypes[t]);
+    }
+
+  }
+
+  /**
+   * Indicate if fields parsed to a modeller-like colon-separated value line
+   * @return boolean
+   */
+  boolean isModellerFieldset()
+  {
+    return (fields.containsKey(Fields[TYPE]));
+  }
+
+  /**
+   * Builds the description line from the stored fields.
+   * @return String
+   */
+  String getDescriptionLine()
+  {
+    String desc = "";
+    int lastfield = Fields.length - 1;
+
+    if (isModellerFieldset())
+    {
+      String value;
+      // try to write a minimal modeller field set, so..
+
+      // find the last valid field in the entry
+
+      for (; lastfield > 6; lastfield--)
+      {
+        if (fields.containsKey(Fields[lastfield]))
+        {
+          break;
+        }
+      }
+
+      for (int i = 0; i < lastfield; i++)
+      {
+        value = (String) fields.get(Fields[i]);
+        if (value != null && value.length() > 0)
+        {
+          desc += ( (String) fields.get(Fields[i])) + ":";
+        }
+        else
+        {
+          desc += Padding[i] + ":";
+        }
+      }
+    }
+    // just return the last field if no others were defined.
+    if (fields.containsKey(Fields[lastfield]))
+    {
+      desc += (String) fields.get(Fields[lastfield]);
+    }
+    else
+    {
+      desc += ".";
+    }
+    return desc;
+  }
+
+  /**
+   * @return the numeric start, or 0 if there is no numeric start code
+   */
+  int getStartNum()
+  {
+    resCode val = getStartCode();
+    if (val != null && val.val != null)
+    {
+      return val.val.intValue();
+    }
+    return 0;
+  }
+
+  resCode getStartCode()
+  {
+    if (isModellerFieldset() && fields.containsKey(Fields[START] + "num"))
+    {
+      return (resCode) fields.get(Fields[START] + "num");
+    }
+    return null;
+  }
+
+  resCode getEndCode()
+  {
+    if (isModellerFieldset() && fields.containsKey(Fields[END] + "num"))
+    {
+      return (resCode) fields.get(Fields[END] + "num");
+    }
+    return null;
+  }
+
+  /**
+   * @return the numeric end, or 0 if there is no numeric end code
+   */
+  int getEndNum()
+  {
+    resCode val = getEndCode();
+    if (val != null && val.val != null)
+    {
+      return val.val.intValue();
+    }
+    return 0;
+  }
+
+  /**
+   * returns true if sequence object was modified with a valid modellerField set
+   * @param newSeq SequenceI
+   * @return boolean
+   */
+  boolean updateSequenceI(SequenceI newSeq)
+  {
+    if (isModellerFieldset())
+    {
+      resCode startCode = getStartCode();
+      if (startCode != null && startCode.val != null)
+      {
+        newSeq.setStart(startCode.val.intValue());
+      }
+      else
+      {
+        newSeq.setStart(1);
+      }
+      resCode endCode = getEndCode();
+      if (endCode != null && endCode.val != null)
+      {
+        newSeq.setEnd(endCode.val.intValue());
+      }
+      else
+      {
+        // NOTE(review): PIRFile counts residues as end - start + 1, so this
+        // looks one past the last position - confirm before changing
+        newSeq.setEnd(newSeq.getStart() + newSeq.getLength());
+      }
+      return true;
+    }
+    return false;
+  }
+}
+
diff --git a/src/jalview/io/PIRFile.java b/src/jalview/io/PIRFile.java
index d4b818b..724fea8 100755
--- a/src/jalview/io/PIRFile.java
+++ b/src/jalview/io/PIRFile.java
@@ -53,12 +53,15 @@ public class PIRFile
 
     while ( (line = nextLine()) != null)
     {
-      if(line.length()==0)
+      if (line.length() == 0)
       {
         //System.out.println("blank line");
         continue;
       }
-
+      if (line.indexOf("C;") == 0 || line.indexOf("#") == 0)
+      {
+        continue;
+      }
       Sequence newSeq = parseId(line.substring(line.indexOf(";") + 1));
 
       sequence = new StringBuffer();
@@ -81,12 +84,14 @@ public class PIRFile
           }
         }
 
-        if(sequence.length()>0)
+        if (sequence.length() > 0)
         {
           sequence.setLength(sequence.length() - 1);
-
           newSeq.setSequence(sequence.toString());
           seqs.addElement(newSeq);
+          ModellerDescription md = new ModellerDescription(newSeq.
+              getDescription());
+          md.updateSequenceI(newSeq);
         }
       }
     }
@@ -113,14 +118,27 @@ public class PIRFile
       String seq = s[i].getSequence();
       seq = seq + "*";
 
-      out.append(">P1;" + printId(s[i]) + "\n");
-
-      if(s[i].getDescription()!=null)
-        out.append(s[i].getDescription()+"\n");
+      if (is_NA)
+      {
+        // modeller doesn't really do nucleotides, so we don't do anything fancy
+        // Nucleotide sequence tags should have a >DL; prefix
+        out.append(">P1;" + s[i].getName() + "\n"); // JBPNote Should change >P to >N
+        if (s[i].getDescription() == null)
+        {
+          out.append(s[i].getName() + " " +
+                     (s[i].getEnd() - s[i].getStart() + 1));
+          out.append(is_NA ? " bases\n" : " residues\n");
+        }
+        else
+        {
+          out.append(s[i].getDescription()+"\n");
+        }
+      }
       else
       {
-        out.append(s[i].getName()+" "+ (s[i].getEnd() - s[i].getStart() + 1));
-        out.append( is_NA ? " bases\n" : " residues\n");
+        out.append(">P1;" + s[i].getName() + "\n");
+        ModellerDescription md = new ModellerDescription(s[i]);
+        out.append(md.getDescriptionLine() + "\n");
       }
 
       int nochunks = (seq.length() / len) + 1;
-- 
1.7.10.2