From def36f0f6cfc58b9a96af7a099677662d3463174 Mon Sep 17 00:00:00 2001 From: gmungoc Date: Mon, 26 Feb 2018 11:10:48 +0000 Subject: [PATCH] JAL-2599 test coverage and bug fixes for 'minimal file' parsing --- src/jalview/datamodel/HiddenMarkovModel.java | 6 +- src/jalview/datamodel/Sequence.java | 3 +- src/jalview/io/HMMFile.java | 176 ++++++++++++++------------ test/jalview/hmmer/HMMERTest.java | 2 +- test/jalview/io/HMMFileTest.java | 77 ++++++++--- 5 files changed, 158 insertions(+), 106 deletions(-) diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index c6545e2..581f481 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -195,15 +195,15 @@ public class HiddenMarkovModel } /** - * Returns the length of the hidden Markov model. + * Returns the length of the hidden Markov model, or 0 if not known * * @return */ - public Integer getLength() + public int getLength() { if (fileProperties.get(HMMFile.LENGTH) == null) { - return null; + return 0; } return Integer.parseInt(fileProperties.get(HMMFile.LENGTH)); } diff --git a/src/jalview/datamodel/Sequence.java b/src/jalview/datamodel/Sequence.java index e798525..9398c0b 100755 --- a/src/jalview/datamodel/Sequence.java +++ b/src/jalview/datamodel/Sequence.java @@ -1918,8 +1918,7 @@ public class Sequence extends ASequence implements SequenceI hmm.clearNodeLookup(); for (int i = 0; i < getLength(); i++) { - if (rf.annotations[i].displayCharacter.equals("x") - || rf.annotations[i].displayCharacter.equals("X")) + if (rf.annotations[i].displayCharacter.equalsIgnoreCase("x")) { if (i < hmm.getNodeAlignmentColumn(node)) { diff --git a/src/jalview/io/HMMFile.java b/src/jalview/io/HMMFile.java index 2733fb4..a05425c 100644 --- a/src/jalview/io/HMMFile.java +++ b/src/jalview/io/HMMFile.java @@ -23,10 +23,12 @@ import java.util.Scanner; public class HMMFile extends AlignFile implements AlignmentFileReaderI, AlignmentFileWriterI { + private static final String TERMINATOR = "//"; + /* * keys to data in HMM file, used to store as properties of the HiddenMarkovModel */ - private static final String HMM = "HMM"; + public static final String HMM = "HMM"; public static final String NAME = "NAME"; @@ -40,18 +42,6 @@ public class HMMFile extends AlignFile public static final String ALPHABET = "ALPH"; - private static final String ALPH_AMINO = "amino"; - - private static final String ALPH_DNA = "DNA"; - - private static final String ALPH_RNA = "RNA"; - - private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY"; - - private static final String ALPHABET_DNA = "ACGT"; - - private static final String ALPHABET_RNA = "ACGU"; - public static final String DATE = "DATE"; public static final String COMMAND_LOG = "COM"; @@ -88,6 +78,18 @@ public class HMMFile extends AlignFile public static final String MASKED_VALUE = "MM"; + private static final String ALPH_AMINO = "amino"; + + private static final String ALPH_DNA = "DNA"; + + private static final String ALPH_RNA = "RNA"; + + private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY"; + + private static final String ALPHABET_DNA = "ACGT"; + + private static final String ALPHABET_RNA = "ACGU"; + private static final int NUMBER_OF_TRANSITIONS = 7; private static final String SPACE = " "; @@ -240,53 +242,75 @@ public class HMMFile extends AlignFile } /** - * Parses the model data from the HMMER3 file + * Parses the model data from the HMMER3 file. The input buffer should be + * positioned at the (optional) COMPO line if there is one, else at the insert + * emissions line for the BEGIN node of the model. * * @param input * @throws IOException */ void parseModel(BufferedReader input) throws IOException { - boolean first = true; - // specification says there must always be an HMM header - // and one more header which is skipped here + /* + * specification says there must always be an HMM header (already read) + * and one more header (guide headings) which is skipped here + */ + int nodeNo = 0; String line = input.readLine(); - while (!"//".equals(line)) + while (line != null && !TERMINATOR.equals(line)) { HMMNode node = new HMMNode(); hmm.addNode(node); - Scanner matchReader = new Scanner(line); - String next = matchReader.next(); - if (next.equals(COMPO) || !first) + Scanner scanner = new Scanner(line); + String next = scanner.next(); + + /* + * expect COMPO (optional) for average match emissions + * or a node number followed by node's match emissions + */ + if (COMPO.equals(next) || nodeNo > 0) { - // stores match emission line in list - double[] matches = parseDoubles(matchReader, numberOfSymbols); + /* + * parse match emissions + */ + double[] matches = parseDoubles(scanner, numberOfSymbols); node.setMatchEmissions(matches); - if (!first) + if (!COMPO.equals(next)) { - // TODO handle files with no column map (make our own) - int column = parseAnnotations(matchReader, node); - hmm.setAlignmentColumn(node, column - 1); + int column = parseAnnotations(scanner, node); + if (column == 0) + { + /* + * no MAP annotation provided, just number off from 0 (begin node) + */ + column = nodeNo; + } + hmm.setAlignmentColumn(node, column - 1); // node 1 <==> column 0 } + line = input.readLine(); } - matchReader.close(); - // stores insert emission line in list - line = input.readLine(); - Scanner insertReader = new Scanner(line); - double[] inserts = parseDoubles(insertReader, numberOfSymbols); + scanner.close(); + + /* + * parse insert emissions + */ + scanner = new Scanner(line); + double[] inserts = parseDoubles(scanner, numberOfSymbols); node.setInsertEmissions(inserts); - insertReader.close(); + scanner.close(); - // stores state transition line in list + /* + * parse state transitions + */ line = input.readLine(); - Scanner transitionReader = new Scanner(line); - double[] transitions = parseDoubles(transitionReader, + scanner = new Scanner(line); + double[] transitions = parseDoubles(scanner, NUMBER_OF_TRANSITIONS); node.setStateTransitions(transitions); - transitionReader.close(); + scanner.close(); line = input.readLine(); - first = false; + nodeNo++; } } @@ -306,54 +330,53 @@ public class HMMFile extends AlignFile * HMM counts columns from 1, convert to base 0 for Jalview */ int column = 0; - if (hmm.getBooleanProperty(MAP) && scanner.hasNext()) - { - column = scanner.nextInt(); - node.setAlignmentColumn(column - 1); - } - else + String value; + if (scanner.hasNext()) { - scanner.next(); + value = scanner.next(); + if (!"-".equals(value)) + { + try + { + column = Integer.parseInt(value); + node.setAlignmentColumn(column - 1); + } catch (NumberFormatException e) + { + // ignore + } + } } /* - * hmm consensus residue if provided, else - + * hmm consensus residue if provided, else '-' */ if (scanner.hasNext()) { - char consensusR; - consensusR = charValue(scanner.next()); - node.setConsensusResidue(consensusR); + node.setConsensusResidue(scanner.next().charAt(0)); } /* - * RF reference annotation, if provided, else - + * RF reference annotation, if provided, else '-' */ if (scanner.hasNext()) { - char reference; - reference = charValue(scanner.next()); - node.setReferenceAnnotation(reference); + node.setReferenceAnnotation(scanner.next().charAt(0)); } /* - * 'm' for masked position, if provided, else - + * 'm' for masked position, if provided, else '-' */ if (scanner.hasNext()) { - char value; - value = charValue(scanner.next()); - node.setMaskValue(value); + node.setMaskValue(scanner.next().charAt(0)); } /* - * structure consensus symbol, if provided, else - + * structure consensus symbol, if provided, else '-' */ if (scanner.hasNext()) { - char consensusS; - consensusS = charValue(scanner.next()); - node.setConsensusStructure(consensusS); + node.setConsensusStructure(scanner.next().charAt(0)); } return column; @@ -487,7 +510,7 @@ public class HMMFile extends AlignFile for (int nodeNo = 0; nodeNo <= length; nodeNo++) { String matchLine = String.format("%7s", - nodeNo == 0 ? "COMPO" : Integer.toString(nodeNo)); + nodeNo == 0 ? COMPO : Integer.toString(nodeNo)); double[] doubleMatches = convertToLogSpace( hmm.getNode(nodeNo).getMatchEmissions()); @@ -559,13 +582,13 @@ public class HMMFile extends AlignFile if (hmm.getMSV() != null) { - output.append(String.format("%n%-19s %18s", "STATS LOCAL MSV", - hmm.getMSV())); + format = "%n%-19s %18s"; + output.append(String.format(format, "STATS LOCAL MSV", hmm.getMSV())); - output.append(String.format("%n%-19s %18s", "STATS LOCAL VITERBI", + output.append(String.format(format, "STATS LOCAL VITERBI", hmm.getViterbi())); - output.append(String.format("%n%-19s %18s", "STATS LOCAL FORWARD", + output.append(String.format(format, "STATS LOCAL FORWARD", hmm.getForward())); } } @@ -603,25 +626,12 @@ public class HMMFile extends AlignFile } } - /** - * Returns the char value of a single lettered String. - * - * @param string - * @return - */ - char charValue(String string) - { - char character; - character = string.charAt(0); - return character; - } - @Override - public String print(SequenceI[] seqs, boolean jvsuffix) + public String print(SequenceI[] sequences, boolean jvsuffix) { - if (seqs[0].getHMM() != null) + if (sequences[0].getHMM() != null) { - hmm = seqs[0].getHMM(); + hmm = sequences[0].getHMM(); } return print(); } @@ -637,7 +647,7 @@ public class HMMFile extends AlignFile appendProperties(output); output.append(NL); appendModelAsString(output); - output.append(NL + "//"); + output.append(NL).append(TERMINATOR).append(NL); return output.toString(); } diff --git a/test/jalview/hmmer/HMMERTest.java b/test/jalview/hmmer/HMMERTest.java index d084d77..4720dc2 100644 --- a/test/jalview/hmmer/HMMERTest.java +++ b/test/jalview/hmmer/HMMERTest.java @@ -82,7 +82,7 @@ public class HMMERTest { HiddenMarkovModel hmm = seq.getHMM(); assertNotNull(hmm); - assertEquals(hmm.getLength().intValue(), 148); + assertEquals(hmm.getLength(), 148); assertEquals(hmm.getAlphabetType(), "amino"); assertEquals(hmm.getName(), "Alignment"); assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), diff --git a/test/jalview/io/HMMFileTest.java b/test/jalview/io/HMMFileTest.java index 387a915..c2afcb3 100644 --- a/test/jalview/io/HMMFileTest.java +++ b/test/jalview/io/HMMFileTest.java @@ -2,9 +2,9 @@ package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; -import static org.testng.Assert.fail; import jalview.datamodel.HMMNode; import jalview.datamodel.HiddenMarkovModel; @@ -52,7 +52,7 @@ public class HMMFileTest { assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00069.17"); assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), "Protein kinase domain"); - assertEquals(hmm.getLength().intValue(), 260); + assertEquals(hmm.getLength(), 260); assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); assertEquals(hmm.getAlphabetType(), "amino"); assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); @@ -114,6 +114,59 @@ public class HMMFileTest { assertEquals(hmm.getMaskedValue(183), '-'); assertEquals(hmm.getConsensusStructure(240), 'H'); } + + /** + * Test that Jalview can parse an HMM file even with a bunch of 'mandatory' + * fields missing (including no MAP annotation or // terminator line) + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testParse_minimalFile() throws IOException + { + /* + * ALPH is absent, alphabet inferred from HMM header line + * Optional COMPO line is absent + * first line after HMM is a guide line for readability + * next line is BEGIN node insert emissions + * next line is BEGIN node transitions + * next line is first sequence node match emissions 1.1 1.2 1.3 + * next line is first sequence node insert emissions 1.4 1.5 1.6 + * last line is first sequence node transitions + */ + //@formatter:off + String hmmData = + "HMMER3\n" + + "HMM P M J\n" + + // both spec and parser require a line after the HMM line + " m->m m->i m->d i->m i->i d->m d->d\n" + + " 0.1 0.2 0.3\n" + + " 0.4 0.5 0.6 0.7 0.8 0.9 0.95\n" + + " 1 1.1 1.2 1.3 - - - - -\n" + + " 1.4 1.5 1.6\n" + + " 1.7 1.8 1.9 2.0 2.1 2.2 2.3\n" + + " 2 1.01 1.02 1.03 - - - - -\n" + + " 1.04 1.05 1.06\n" + + " 1.7 1.8 1.9 2.0 2.1 2.2 2.3\n"; + //@formatter:on + HMMFile parser = new HMMFile(hmmData, DataSourceType.PASTE); + HiddenMarkovModel hmm = parser.getHMM(); + assertNotNull(hmm); + assertEquals(hmm.getSymbols(), "PMJ"); + assertEquals(hmm.getLength(), 0); // no LENG property :-( + + // node 1 (implicitly mapped to column 0) + double prob = hmm.getMatchEmissionProbability(0, 'p'); + assertEquals(prob, Math.pow(Math.E, -1.1)); + prob = hmm.getInsertEmissionProbability(0, 'J'); + assertEquals(prob, Math.pow(Math.E, -1.6)); + + // node 2 (implicitly mapped to column 1) + prob = hmm.getMatchEmissionProbability(1, 'M'); + assertEquals(prob, Math.pow(Math.E, -1.02)); + prob = hmm.getInsertEmissionProbability(1, 'm'); + assertEquals(prob, Math.pow(Math.E, -1.05)); + } @Test(groups = "Functional") public void testParseHeaderLines_amino() throws IOException @@ -244,10 +297,12 @@ public class HMMFileTest { new File("test/jalview/io/test_MADE1_hmm.txt")); BufferedReader br = new BufferedReader(fr); HiddenMarkovModel testHMM = new HiddenMarkovModel(); - for (int i = 0; i < 24; i++) + String line = null; + do { - br.readLine(); - } + line = br.readLine(); // skip header lines up to HMM plus one + } while (!line.startsWith("HMM ")); + br.readLine(); made1.parseModel(br); testHMM = made1.getHMM(); @@ -287,18 +342,6 @@ public class HMMFileTest { Double.NEGATIVE_INFINITY); } - /** - * Test that if no mapping of nodes to aligned columns is provided by the HMM - * file, we construct one - * - * @throws IOException - */ - @Test(groups = "Functional") - public void testParseModel_noMap() throws IOException - { - fail("test to be written"); - } - @Test(groups = "Functional") public void testParseAnnotations() { -- 1.7.10.2