From d8d34047253ad28418f5478d9185a9b840114072 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Wed, 2 Dec 2015 16:52:41 +0000 Subject: [PATCH] JAL-1827 refactored to handle missing residues better --- src/jalview/ext/jmol/PDBFileWithJmol.java | 616 ++++++++++++++++-------- test/jalview/ext/jmol/PDBFileWithJmolTest.java | 77 ++- 2 files changed, 486 insertions(+), 207 deletions(-) diff --git a/src/jalview/ext/jmol/PDBFileWithJmol.java b/src/jalview/ext/jmol/PDBFileWithJmol.java index 8d76afc..49cb6d5 100644 --- a/src/jalview/ext/jmol/PDBFileWithJmol.java +++ b/src/jalview/ext/jmol/PDBFileWithJmol.java @@ -28,10 +28,13 @@ import jalview.datamodel.SequenceI; import jalview.io.AlignFile; import jalview.io.FileParse; import jalview.schemes.ResidueProperties; +import jalview.util.Comparison; import jalview.util.MessageManager; import java.io.IOException; +import java.util.ArrayList; import java.util.Hashtable; +import java.util.List; import java.util.Map; import javajs.awt.Dimension; @@ -39,11 +42,13 @@ import javajs.awt.Dimension; import org.jmol.api.JmolStatusListener; import org.jmol.api.JmolViewer; import org.jmol.c.CBK; +import org.jmol.c.STR; import org.jmol.modelset.Group; import org.jmol.modelset.Model; import org.jmol.modelset.ModelSet; import org.jmol.modelsetbio.BioModel; import org.jmol.modelsetbio.BioPolymer; +import org.jmol.modelsetbio.Monomer; import org.jmol.viewer.Viewer; /** @@ -84,6 +89,8 @@ public class PDBFileWithJmol extends AlignFile implements { viewer = (Viewer) JmolViewer.allocateViewer(null, null, null, null, null, "-x -o -n", this); + // ensure the 'new' (DSSP) not 'old' (Ramachandran) SS method is used + viewer.setBooleanProperty("defaultStructureDSSP", true); } catch (ClassCastException x) { throw new Error(MessageManager.formatMessage( @@ -108,229 +115,95 @@ public class PDBFileWithJmol extends AlignFile implements } } - /* - * (non-Javadoc) + /** + * Convert Jmol's secondary structure code to Jalview's, and stored it in the + * secondary structure arrays at the given sequence position * - * @see jalview.io.AlignFile#parse() + * @param proteinStructureSubType + * @param pos + * @param secstr + * @param secstrcode */ - @Override - public void parse() throws IOException + protected void setSecondaryStructure(STR proteinStructureSubType, + int pos, char[] secstr, char[] secstrcode) { - Viewer jmd = getJmolData(); - jmd.openReader(getDataName(), getDataName(), getReader()); - waitForScript(jmd); - - if (jmd.ms.mc > 0) + switch (proteinStructureSubType) { - ModelSet ms = jmd.ms; - // Jmol 14.2 added third argument doReport = false - ms.calculateStructures(null, true, false, false, true); - // System.out.println("Structs\n"+structs); - Group group = null; - int modelIndex = -1; - for (Model model : ms.am) + case HELIX310: + if (secstr[pos] == 0) { - modelIndex++; - for (BioPolymer bp : ((BioModel) model).bioPolymers) - { - int lastChainId = 0; // int value of character e.g. 65 for A - String lastChainIdAlpha = ""; - - int[] groups = bp.getLeadAtomIndices(); - char seq[] = new char[groups.length], secstr[] = new char[groups.length], secstrcode[] = new char[groups.length]; - int groupc = 0, len = 0, firstrnum = 1, lastrnum = 0; - - do - { - if (groupc >= groups.length - || ms.at[groups[groupc]].group.chain.chainID != lastChainId) - { - /* - * on change of chain (or at end), construct the sequence and - * secondary structure annotation for the last chain - */ - if (len > 0) - { - boolean isNa = bp.isNucleic(); - // normalise sequence from Jmol to jalview - int[] cinds = isNa ? ResidueProperties.nucleotideIndex - : ResidueProperties.aaIndex; - int nonGap = isNa ? ResidueProperties.maxNucleotideIndex - : ResidueProperties.maxProteinIndex; - char ngc = 'X'; - char newseq[] = new char[len]; - Annotation asecstr[] = new Annotation[len + firstrnum - 1]; - for (int p = 0; p < len; p++) - { - newseq[p] = cinds[seq[p]] == nonGap ? ngc : seq[p]; - if (secstr[p] >= 'A' && secstr[p] <= 'z') - { - try - { - asecstr[p] = new Annotation("" + secstr[p], null, - secstrcode[p], Float.NaN); - } catch (ArrayIndexOutOfBoundsException e) - { - // skip - patch for JAL-1836 - } - } - } - String modelTitle = (String) ms - .getInfo(modelIndex, "title"); - SequenceI sq = new Sequence("" + getDataName() + "|" - + modelTitle + "|" + lastChainIdAlpha, newseq, - firstrnum, lastrnum); - PDBEntry pdbe = new PDBEntry(); - pdbe.setFile(getDataName()); - pdbe.setId(getDataName()); - pdbe.setProperty(new Hashtable()); - // pdbe.getProperty().put("CHAIN", "" + _lastChainId); - pdbe.setChainCode(lastChainIdAlpha); - sq.addPDBId(pdbe); - // JAL-1533 - // Need to put the number of models for this polymer somewhere - // for Chimera/others to grab - // pdbe.getProperty().put("PDBMODELS", biopoly.) - seqs.add(sq); - if (!isNa) - { - String mt = modelTitle == null ? getDataName() - : modelTitle; - if (lastChainId >= ' ') - { - mt += lastChainIdAlpha; - } - AlignmentAnnotation ann = new AlignmentAnnotation( - "Secondary Structure", "Secondary Structure for " - + mt, asecstr); - ann.belowAlignment = true; - ann.visible = true; - ann.autoCalculated = false; - ann.setCalcId(getClass().getName()); - sq.addAlignmentAnnotation(ann); - ann.adjustForAlignment(); - ann.validateRangeAndDisplay(); - annotations.add(ann); - } - } - len = 0; - firstrnum = 1; - lastrnum = 0; - } - if (groupc < groups.length) - { - group = ms.at[groups[groupc]].group; - if (len == 0) - { - firstrnum = group.getResno(); - lastChainId = group.chain.chainID; - lastChainIdAlpha = group.chain.getIDStr(); - } - else - { - lastrnum = group.getResno(); - } - seq[len] = group.getGroup1(); - - /* - * JAL-1828 replace a modified amino acid with its standard - * equivalent (e.g. MSE with MET->M) to maximise sequence matching - */ - String threeLetterCode = group.getGroup3(); - String canonical = ResidueProperties - .getCanonicalAminoAcid(threeLetterCode); - if (canonical != null - && !canonical.equalsIgnoreCase(threeLetterCode)) - { - seq[len] = ResidueProperties - .getSingleCharacterCode(canonical); - } - switch (group.getProteinStructureSubType()) - { - case HELIX310: - if (secstr[len] == 0) - { - secstr[len] = '3'; - } - case HELIXALPHA: - if (secstr[len] == 0) - { - secstr[len] = 'H'; - } - case HELIXPI: - if (secstr[len] == 0) - { - secstr[len] = 'P'; - } - case HELIX: - if (secstr[len] == 0) - { - secstr[len] = 'H'; - } - secstrcode[len] = 'H'; - break; - case SHEET: - secstr[len] = 'E'; - secstrcode[len] = 'E'; - break; - default: - secstr[len] = 0; - secstrcode[len] = 0; - } - len++; - } - } while (groupc++ < groups.length); - } + secstr[pos] = '3'; } - - /* - * lastScriptTermination = -9465; String dsspOut = - * jmd.evalString("calculate STRUCTURE"); if (dsspOut.equals("pending")) { - * while (lastScriptTermination == -9465) { try { Thread.sleep(50); } - * catch (Exception x) { } ; } } System.out.println(lastConsoleEcho); - */ + case HELIXALPHA: + if (secstr[pos] == 0) + { + secstr[pos] = 'H'; + } + case HELIXPI: + if (secstr[pos] == 0) + { + secstr[pos] = 'P'; + } + case HELIX: + if (secstr[pos] == 0) + { + secstr[pos] = 'H'; + } + secstrcode[pos] = 'H'; + break; + case SHEET: + secstr[pos] = 'E'; + secstrcode[pos] = 'E'; + break; + default: + secstr[pos] = 0; + secstrcode[pos] = 0; } } - /* - * (non-Javadoc) + /** + * Convert any non-standard peptide codes to their standard code table + * equivalent. (Initial version only does Selenomethionine MSE->MET.) * - * @see jalview.io.AlignFile#print() + * @param threeLetterCode + * @param seq + * @param pos + */ + protected void replaceNonCanonicalResidue(String threeLetterCode, + char[] seq, int pos) + { + String canonical = ResidueProperties + .getCanonicalAminoAcid(threeLetterCode); + if (canonical != null && !canonical.equalsIgnoreCase(threeLetterCode)) + { + seq[pos] = ResidueProperties.getSingleCharacterCode(canonical); + } + } + + /** + * Not implemented - returns null */ @Override public String print() { - // TODO Auto-generated method stub return null; } + /** + * Not implemented + */ @Override public void setCallbackFunction(String callbackType, String callbackFunction) { - // TODO Auto-generated method stub - } - /* - * @Override public void notifyCallback(EnumCallback type, Object[] data) { - * try { switch (type) { case ERROR: case SCRIPT: - * notifyScriptTermination((String) data[2], ((Integer) data[3]).intValue()); - * break; case MESSAGE: sendConsoleMessage((data == null) ? ((String) null) : - * (String) data[1]); break; case LOADSTRUCT: notifyFileLoaded((String) - * data[1], (String) data[2], (String) data[3], (String) data[4], ((Integer) - * data[5]).intValue()); - * - * break; default: // System.err.println("Unhandled callback " + type + " " // - * + data[1].toString()); break; } } catch (Exception e) { - * System.err.println("Squashed Jmol callback handler error:"); - * e.printStackTrace(); } } - */ - public void notifyCallback(CBK type, Object[] data) + @Override + public void notifyCallback(CBK cbType, Object[] data) { String strInfo = (data == null || data[1] == null ? null : data[1] .toString()); - switch (type) + switch (cbType) { case ECHO: sendConsoleEcho(strInfo); @@ -407,49 +280,63 @@ public class PDBFileWithJmol extends AlignFile implements } } + /** + * Not implemented - returns null + */ @Override public String eval(String strEval) { - // TODO Auto-generated method stub return null; } + /** + * Not implemented - returns null + */ @Override public float[][] functionXY(String functionName, int x, int y) { - // TODO Auto-generated method stub return null; } + /** + * Not implemented - returns null + */ @Override public float[][][] functionXYZ(String functionName, int nx, int ny, int nz) { - // TODO Auto-generated method stub return null; } + /** + * Not implemented - returns null + */ @Override - public String createImage(String fileName, String type, + public String createImage(String fileName, String imageType, Object text_or_bytes, int quality) { - // TODO Auto-generated method stub return null; } + /** + * Not implemented - returns null + */ @Override public Map getRegistryInfo() { - // TODO Auto-generated method stub return null; } + /** + * Not implemented + */ @Override public void showUrl(String url) { - // TODO Auto-generated method stub - } + /** + * Not implemented - returns null + */ @Override public Dimension resizeInnerPanel(String data) { @@ -462,4 +349,323 @@ public class PDBFileWithJmol extends AlignFile implements return null; } + /** + * Calls the Jmol library to parse the PDB file, and then inspects the + * resulting object model to generate Jalview-style sequences, with secondary + * structure annotation added where available (i.e. where it has been computed + * by Jmol using DSSP). + * + * @see jalview.io.AlignFile#parse() + */ + @Override + public void parse() throws IOException + { + Viewer jmolModel = getJmolData(); + jmolModel.openReader(getDataName(), getDataName(), getReader()); + waitForScript(jmolModel); + + /* + * Convert one or more Jmol Model objects to Jalview objects. + */ + if (jmolModel.ms.mc > 0) + { + parseBiopolymers(jmolModel.ms); + } + } + + /** + * Process the Jmol BioPolymer array and generate a Jalview sequence for each + * chain found (including any secondary structure annotation from DSSP) + * + * @param ms + * @throws IOException + */ + public void parseBiopolymers(ModelSet ms) throws IOException + { + int modelIndex = -1; + for (Model model : ms.am) + { + modelIndex++; + String modelTitle = (String) ms.getInfo(modelIndex, "title"); + + /* + * as chains can span BioPolymers, we first make a flattened list, + * and then work out the lengths of chains present + */ + List monomers = getMonomers(ms, (BioModel) model); + List chainLengths = getChainLengths(monomers); + + /* + * now chop up the Monomer list to make Jalview Sequences + */ + int from = 0; + for (int length : chainLengths) + { + buildSequenceFromChain(monomers.subList(from, from + length), modelTitle); + from += length; + } + } + } + + /** + * Helper method to construct a sequence for one chain and add it to the seqs + * list + * + * @param monomers + * a list of all monomers in the chain + * @param modelTitle + */ + protected void buildSequenceFromChain(List monomers, String modelTitle) + { + final int length = monomers.size(); + + /* + * arrays to hold sequence and secondary structure + */ + char[] seq = new char[length]; + char[] secstr = new char[length]; + char[] secstrcode = new char[length]; + + /* + * populate the sequence and secondary structure arrays + */ + extractJmolChainData(monomers, seq, secstr, secstrcode); + + /* + * grab chain code and start position from first residue; + */ + String chainId = monomers.get(0).chain.getIDStr(); + int firstResNum = monomers.get(0).getResno(); + if (firstResNum < 1) + { + // Jalview doesn't like residue < 1, so force this to 1 + System.err.println("Converting chain " + chainId + " first RESNUM (" + + firstResNum + ") to 1"); + firstResNum = 1; + } + + /* + * convert any non-gap unknown residues to 'X' + */ + convertNonGapCharacters(seq); + + /* + * construct and add the Jalview sequence + */ + SequenceI sq = new Sequence("" + getDataName() + "|" + modelTitle + "|" + + chainId, seq, firstResNum, firstResNum + length - 1); + seqs.add(sq); + + /* + * add secondary structure predictions (if any) + */ + addSecondaryStructureAnnotation(modelTitle, sq, secstr, secstrcode, + chainId, firstResNum); + + /* + * record the PDB id for the sequence + */ + addPdbid(sq, chainId); + } + + /** + * Scans the list of (Jmol) Monomer objects, and adds the residue for each to + * the sequence array, and any converted secondary structure prediction to the + * secondary structure arrays + * + * @param monomers + * @param seq + * @param secstr + * @param secstrcode + */ + protected void extractJmolChainData(List monomers, char[] seq, + char[] secstr, char[] secstrcode) + { + int pos = 0; + for (Monomer monomer : monomers) + { + seq[pos] = monomer.getGroup1(); + + /* + * JAL-1828 replace a modified amino acid with its standard + * equivalent (e.g. MSE with MET->M) to maximise sequence matching + */ + replaceNonCanonicalResidue(monomer.getGroup3(), seq, pos); + + /* + * if Jmol has derived a secondary structure prediction for + * this position, convert it to Jalview equivalent and save it + */ + setSecondaryStructure(monomer.getProteinStructureSubType(), pos, + secstr, secstrcode); + pos++; + } + } + + /** + * Helper method that adds an AlignmentAnnotation for secondary structure to + * the sequence, provided at least one secondary structure prediction has been + * made + * + * @param modelTitle + * @param seq + * @param secstr + * @param secstrcode + * @param chainId + * @param firstResNum + * @return + */ + protected void addSecondaryStructureAnnotation(String modelTitle, + SequenceI sq, char[] secstr, char[] secstrcode, + String chainId, int firstResNum) + { + char[] seq = sq.getSequence(); + boolean ssFound = false; + Annotation asecstr[] = new Annotation[seq.length + firstResNum - 1]; + for (int p = 0; p < seq.length; p++) + { + if (secstr[p] >= 'A' && secstr[p] <= 'z') + { + asecstr[p] = new Annotation(String.valueOf(secstr[p]), null, + secstrcode[p], Float.NaN); + ssFound = true; + } + } + + if (ssFound) + { + String mt = modelTitle == null ? getDataName() : modelTitle; + mt += chainId; + AlignmentAnnotation ann = new AlignmentAnnotation( + "Secondary Structure", "Secondary Structure for " + mt, + asecstr); + ann.belowAlignment = true; + ann.visible = true; + ann.autoCalculated = false; + ann.setCalcId(getClass().getName()); + ann.adjustForAlignment(); + ann.validateRangeAndDisplay(); + annotations.add(ann); + sq.addAlignmentAnnotation(ann); + } + } + + /** + * Replace any non-gap miscellaneous characters with 'X' + * + * @param seq + * @return + */ + protected void convertNonGapCharacters(char[] seq) + { + boolean isNa = Comparison.areNucleotide(new char[][] { seq }); + int[] cinds = isNa ? ResidueProperties.nucleotideIndex + : ResidueProperties.aaIndex; + int nonGap = isNa ? ResidueProperties.maxNucleotideIndex + : ResidueProperties.maxProteinIndex; + + for (int p = 0; p < seq.length; p++) + { + if (cinds[seq[p]] == nonGap) + { + seq[p] = 'X'; + } + } + } + + /** + * @param sq + * @param chainId + */ + protected void addPdbid(SequenceI sq, String chainId) + { + PDBEntry pdbe = new PDBEntry(); + pdbe.setFile(getDataName()); + pdbe.setId(getDataName()); + pdbe.setProperty(new Hashtable()); + // pdbe.getProperty().put("CHAIN", "" + _lastChainId); + pdbe.setChainCode(chainId); + sq.addPDBId(pdbe); + } + + /** + * Scans the list of Monomers (residue models), inspecting the chain id for + * each, and returns an array whose length is the number of chains, and values + * the length of each chain + * + * @param monomers + * @return + */ + protected List getChainLengths(List monomers) + { + List chainLengths = new ArrayList(); + int lastChainId = -1; + int length = 0; + + for (Monomer monomer : monomers) + { + int chainId = monomer.chain.chainID; + if (chainId != lastChainId && length > 0) + { + /* + * change of chain - record the length of the last one + */ + chainLengths.add(length); + length = 0; + } + lastChainId = chainId; + length++; + } + if (length > 0) + { + /* + * record the length of the final chain + */ + chainLengths.add(length); + } + + return chainLengths; + } + + /** + * Returns a flattened list of Monomer (residue) in order, across all + * BioPolymers in the model. This simplifies assembling chains which span + * BioPolymers. The result does not include any alternate residues reported + * for the same sequence position (RESNUM value). + * + * @param ms + * @param model + * @return + */ + protected List getMonomers(ModelSet ms, BioModel model) + { + List result = new ArrayList(); + String lastSeqCode = ""; + + for (BioPolymer bp : model.bioPolymers) { + for (int groupLeadAtoms : bp.getLeadAtomIndices()) + { + Group group = ms.at[groupLeadAtoms].group; + if (group instanceof Monomer) + { + /* + * ignore alternate residue at same position + * example: 1ejg has residues A:LEU, B:ILE, C:ILE at RESNUM=25 + */ + String seqcodeString = group.getSeqcodeString(); + if (!lastSeqCode.equals(seqcodeString)) + { + result.add((Monomer) group); + } + else + { + System.out.println("skipping"); + } + lastSeqCode = seqcodeString; + } + } + } + return result; + } + } diff --git a/test/jalview/ext/jmol/PDBFileWithJmolTest.java b/test/jalview/ext/jmol/PDBFileWithJmolTest.java index 4ccb6e3..fd02d00 100644 --- a/test/jalview/ext/jmol/PDBFileWithJmolTest.java +++ b/test/jalview/ext/jmol/PDBFileWithJmolTest.java @@ -44,11 +44,40 @@ import MCview.PDBfile; */ public class PDBFileWithJmolTest { + /* + * 1GAQ has been reduced to alpha carbons only + * 1QCF is the full PDB file including headers, HETATM etc + */ String[] testFile = new String[] { "./examples/1GAQ.txt", "./test/jalview/ext/jmol/1QCF.pdb" }; // , - // "./examples/DNMT1_MOUSE.pdb" - // }; + //@formatter:off + // a modified and very cut-down extract of 4UJ4 + String pdbWithChainBreak = + "HEADER TRANSPORT PROTEIN 08-APR-15 4UJ4\n" + + // chain B has missing residues; these should all go in the same sequence: + "ATOM 1909 CA VAL B 358 21.329 -19.739 -67.740 1.00201.05 C\n" + + "ATOM 1916 CA GLY B 359 21.694 -23.563 -67.661 1.00198.09 C\n" + + "ATOM 1920 CA LYS B 367 32.471 -12.135 -77.100 1.00257.97 C\n" + + "ATOM 1925 CA ALA B 368 31.032 -9.324 -74.946 1.00276.01 C\n" + + // switch to chain C; should be a separate sequence + "ATOM 1930 CA SER C 369 32.589 -7.517 -71.978 1.00265.44 C\n" + + "ATOM 1936 CA ALA C 370 31.650 -6.849 -68.346 1.00249.48 C\n"; + //@formatter:on + + //@formatter:off + // a very cut-down extract of 1ejg + String pdbWithAltLoc = + "HEADER TRANSPORT PROTEIN 08-APR-15 1EJG\n" + + "ATOM 448 CA ALA A 24 6.619 16.195 1.970 1.00 1.65 C\n" + + "ATOM 458 CA ALEU A 25 3.048 14.822 1.781 0.57 1.48 C\n" + + // alternative residue 25 entries (with ILE instead of LEU) should be ignored: + "ATOM 478 CA BILE A 25 3.048 14.822 1.781 0.21 1.48 C\n" + + // including the next altloc causes the unit test to fail but it works with the full file + // not sure why! + // "ATOM 479 CA CILE A 25 3.048 14.822 1.781 0.22 1.48 C\n" + + "ATOM 512 CA CYS A 26 4.137 11.461 3.154 1.00 1.52 C\n"; + //@formatter:on @BeforeMethod(alwaysRun = true) public void setUp() @@ -133,4 +162,48 @@ public class PDBFileWithJmolTest "Secondary structure not associated for sequence " + sq.getName(), sq.getAnnotation()[0].sequenceRef == sq); } + + /** + * Test parsing a chain with missing residues + * + * @throws Exception + */ + @Test(groups = { "Functional" }) + public void testParse_missingResidues() throws Exception + { + PDBfile mctest = new PDBfile(false, false, false, pdbWithChainBreak, + AppletFormatAdapter.PASTE); + PDBFileWithJmol jtest = new PDBFileWithJmol(pdbWithChainBreak, + jalview.io.AppletFormatAdapter.PASTE); + Vector seqs = jtest.getSeqs(); + Vector mcseqs = mctest.getSeqs(); + + assertEquals("Failed to find 2 sequences\n", 2, seqs.size()); + assertEquals("Failed to find 2 sequences\n", 2, mcseqs.size()); + assertEquals("VGKA", seqs.get(0).getSequenceAsString()); + assertEquals("VGKA", mcseqs.get(0).getSequenceAsString()); + assertEquals("SA", seqs.get(1).getSequenceAsString()); + assertEquals("SA", mcseqs.get(1).getSequenceAsString()); + } + + /** + * Test parsing a chain with 'altloc' residues + * + * @throws Exception + */ + @Test(groups = { "Functional" }) + public void testParse_alternativeResidues() throws Exception + { + PDBfile mctest = new PDBfile(false, false, false, pdbWithAltLoc, + AppletFormatAdapter.PASTE); + PDBFileWithJmol jtest = new PDBFileWithJmol(pdbWithAltLoc, + jalview.io.AppletFormatAdapter.PASTE); + Vector seqs = jtest.getSeqs(); + Vector mcseqs = mctest.getSeqs(); + + assertEquals("Failed to find 1 sequence\n", 1, seqs.size()); + assertEquals("Failed to find 1 sequence\n", 1, mcseqs.size()); + assertEquals("ALC", seqs.get(0).getSequenceAsString()); + assertEquals("ALC", mcseqs.get(0).getSequenceAsString()); + } } -- 1.7.10.2