package jalview.io; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; import jalview.datamodel.HMMNode; import jalview.datamodel.HiddenMarkovModel; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Scanner; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import junit.extensions.PA; public class HMMFileTest { HMMFile fn3; HMMFile pKinase; HMMFile made1; @BeforeClass(alwaysRun = true) public void setUp() throws IOException { fn3 = new HMMFile("test/jalview/io/test_fn3_hmm.txt", DataSourceType.FILE); pKinase = new HMMFile("test/jalview/io/test_PKinase_hmm.txt", DataSourceType.FILE); made1 = new HMMFile("test/jalview/io/test_MADE1_hmm.txt", DataSourceType.FILE); } @Test(groups = "Functional") public void testParse() throws IOException { HiddenMarkovModel hmm = pKinase.getHMM(); assertEquals(hmm.getName(), "Pkinase"); assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00069.17"); assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), "Protein kinase domain"); assertEquals(hmm.getLength(), 260); assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); assertEquals(hmm.getAlphabetType(), "amino"); assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); assertEquals(hmm.getProperty(HMMFile.DATE), "Thu Jun 16 11:44:06 2011"); assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "54"); assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), "3.358521"); assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3106786190"); assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), "70.30 70.30"); assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "70.30 70.30"); assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "70.20 70.20"); assertEquals(hmm.getSymbols(), "ACDEFGHIKLMNPQRSTVWY"); assertEquals(hmm.getMatchEmissionProbability(0, 'Y'), 0.16102, 0.001d); assertEquals(hmm.getMatchEmissionProbability(11, 'P'), 0.0130, 0.001d); assertEquals(hmm.getMatchEmissionProbability(24, 'I'), 0.02583, 0.001d); assertEquals(hmm.getMatchEmissionProbability(83, 'C'), 0.008549, 0.001d); assertEquals(hmm.getMatchEmissionProbability(332, 'E'), 0.07998, 0.001d); assertEquals(hmm.getMatchEmissionProbability(381, 'D'), 0.014465, 0.001d); assertEquals(hmm.getMatchEmissionProbability(475, 'Y'), 0.02213, 0.001d); assertEquals(hmm.getInsertEmissionProbability(1, 'C'), 0.012, 0.001d); assertEquals(hmm.getInsertEmissionProbability(14, 'H'), 0.02411, 0.001d); assertEquals(hmm.getInsertEmissionProbability(23, 'L'), 0.06764, 0.001d); assertEquals(hmm.getInsertEmissionProbability(90, 'D'), 0.0623, 0.001d); assertEquals(hmm.getInsertEmissionProbability(374, 'T'), 0.0623, 0.001d); assertEquals(hmm.getInsertEmissionProbability(470, 'P'), 0.0647, 0.001d); assertEquals(hmm.getStateTransitionProbability(2, 6), 0.3848, 0.001d); assertEquals(hmm.getStateTransitionProbability(38, 3), 0.5382, 0.001d); assertEquals(hmm.getStateTransitionProbability(305, 3), 0.2916, 0.001d); assertEquals(hmm.getStateTransitionProbability(380, 0), 0.99, 0.001d); assertEquals(hmm.getStateTransitionProbability(453, 1), 0.0066, 0.001d); assertEquals(hmm.getNodeMapPosition(3), 3); assertEquals(hmm.getReferenceAnnotation(7), '-'); assertEquals(hmm.getConsensusResidue(23), 't'); assertEquals(hmm.getMaskedValue(30), '-'); assertEquals(hmm.getConsensusStructure(56), 'S'); assertEquals(hmm.getNodeMapPosition(78), 136); assertEquals(hmm.getReferenceAnnotation(93), '-'); assertEquals(hmm.getConsensusResidue(145), 'a'); assertEquals(hmm.getMaskedValue(183), '-'); assertEquals(hmm.getConsensusStructure(240), 'H'); } /** * Test that Jalview can parse an HMM file even with a bunch of 'mandatory' * fields missing (including no MAP annotation or // terminator line) * * @throws IOException */ @Test(groups = "Functional") public void testParse_minimalFile() throws IOException { /* * ALPH is absent, alphabet inferred from HMM header line * Optional COMPO line is absent * first line after HMM is a guide line for readability * next line is BEGIN node insert emissions * next line is BEGIN node transitions * next line is first sequence node match emissions 1.1 1.2 1.3 * next line is first sequence node insert emissions 1.4 1.5 1.6 * last line is first sequence node transitions */ //@formatter:off String hmmData = "HMMER3\n" + "HMM P M J\n" + // both spec and parser require a line after the HMM line " m->m m->i m->d i->m i->i d->m d->d\n" + " 0.1 0.2 0.3\n" + " 0.4 0.5 0.6 0.7 0.8 0.9 0.95\n" + " 1 1.1 1.2 1.3 - - - - -\n" + " 1.4 1.5 1.6\n" + " 1.7 1.8 1.9 2.0 2.1 2.2 2.3\n" + " 2 1.01 1.02 1.03 - - - - -\n" + " 1.04 1.05 1.06\n" + " 1.7 1.8 1.9 2.0 2.1 2.2 2.3\n"; //@formatter:on HMMFile parser = new HMMFile(hmmData, DataSourceType.PASTE); HiddenMarkovModel hmm = parser.getHMM(); assertNotNull(hmm); assertEquals(hmm.getSymbols(), "PMJ"); // no LENG property: this should return node count excluding BEGIN node assertEquals(hmm.getLength(), 2); // node 1 (implicitly mapped to column 0) double prob = hmm.getMatchEmissionProbability(0, 'p'); assertEquals(prob, Math.pow(Math.E, -1.1)); prob = hmm.getInsertEmissionProbability(0, 'J'); assertEquals(prob, Math.pow(Math.E, -1.6)); // node 2 (implicitly mapped to column 1) prob = hmm.getMatchEmissionProbability(1, 'M'); assertEquals(prob, Math.pow(Math.E, -1.02)); prob = hmm.getInsertEmissionProbability(1, 'm'); assertEquals(prob, Math.pow(Math.E, -1.05)); } @Test(groups = "Functional") public void testParseHeaderLines_amino() throws IOException { FileReader fr = new FileReader( new File("test/jalview/io/test_fn3_hmm.txt")); BufferedReader br = new BufferedReader(fr); HiddenMarkovModel hmm = new HiddenMarkovModel(); HMMFile testee = new HMMFile(); PA.setValue(testee, "hmm", hmm); testee.parseHeaderLines(br); br.close(); fr.close(); assertEquals(hmm.getName(), "fn3"); assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00041.13"); assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), "Fibronectin type III domain"); assertEquals(hmm.getProperty(HMMFile.LENGTH), "86"); assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); assertEquals(hmm.getAlphabetType(), "amino"); assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); assertEquals(hmm.getProperty(HMMFile.DATE), "Fri Jun 20 08:22:31 2014"); assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "106"); assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), "11.415833"); assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3564431818"); assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), "8.00 7.20"); assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "8.00 7.20"); assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "7.90 7.90"); assertEquals(hmm.getViterbi(), "-9.7737 0.71847"); assertEquals(hmm.getMSV(), "-9.4043 0.71847"); assertEquals(hmm.getForward(), "-3.8341 0.71847"); } @Test(groups = "Functional") public void testParseHeaderLines_dna() throws IOException { FileReader fr = new FileReader( new File("test/jalview/io/test_MADE1_hmm.txt")); BufferedReader br = new BufferedReader(fr); HiddenMarkovModel hmm = new HiddenMarkovModel(); HMMFile testee = new HMMFile(); PA.setValue(testee, "hmm", hmm); testee.parseHeaderLines(br); br.close(); fr.close(); assertEquals(hmm.getName(), "MADE1"); assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "DF0000629.2"); assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), "MADE1 (MAriner Derived Element 1), a TcMar-Mariner DNA transposon"); assertEquals(hmm.getProperty(HMMFile.LENGTH), "80"); assertEquals(hmm.getProperty(HMMFile.MAX_LENGTH), "426"); assertEquals(hmm.getAlphabetType(), "DNA"); assertTrue(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); assertFalse(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); assertEquals(hmm.getProperty(HMMFile.DATE), "Tue Feb 19 20:33:41 2013"); assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "1997"); assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), "3.911818"); assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3015610723"); assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), "2.324 4.234"); assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "2.343 1.212"); assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "2.354 5.456"); assertEquals(hmm.getViterbi(), "-9.3632 0.71858"); assertEquals(hmm.getMSV(), "-8.5786 0.71858"); assertEquals(hmm.getForward(), "-3.4823 0.71858"); } @Test(groups = "Functional") public void testFillList() throws IOException { Scanner scanner1 = new Scanner("1.3 2.4 5.3 3.9 9.8 4.7 4.3 2.3 6.9"); ArrayList filledArray = new ArrayList<>(); filledArray.add(0.27253); filledArray.add(0.0907); filledArray.add(0.00499); filledArray.add(0.02024); filledArray.add(0.00005); filledArray.add(0.00909); filledArray.add(0.01357); filledArray.add(0.10026); filledArray.add(0.001); double[] testList = HMMFile.parseDoubles(scanner1, 9); for (int i = 0; i < 9; i++) { assertEquals(testList[i], filledArray.get(i), 0.001d); } filledArray.clear(); scanner1.close(); Scanner scanner2 = new Scanner( "1.346 5.554 35.345 5.64 1.4"); filledArray.add(0.2603); filledArray.add(0.00387); filledArray.add(0d); filledArray.add(0.00355); filledArray.add(0.2466); testList = HMMFile.parseDoubles(scanner2, 5); for (int i = 0; i < 5; i++) { assertEquals(testList[i], filledArray.get(i), 0.001d); } } @Test(groups = "Functional") public void testParseModel() throws IOException { FileReader fr = new FileReader( new File("test/jalview/io/test_MADE1_hmm.txt")); BufferedReader br = new BufferedReader(fr); HiddenMarkovModel testHMM = new HiddenMarkovModel(); String line = null; do { line = br.readLine(); // skip header lines up to HMM plus one } while (!line.startsWith("HMM ")); br.readLine(); made1.parseModel(br); testHMM = made1.getHMM(); br.close(); fr.close(); assertEquals(testHMM.getMatchEmissionProbability(1, 'C'), 0.09267, 0.001d); assertEquals(testHMM.getMatchEmissionProbability(25, 'G'), 0.07327, 0.001d); assertEquals(testHMM.getMatchEmissionProbability(1092, 'C'), 0.04184, 0.001d); assertEquals(testHMM.getMatchEmissionProbability(1107, 'G'), 0.07, 0.001d); assertEquals(testHMM.getInsertEmissionProbability(0, 'G'), 0.25, 0.001d); assertEquals(testHMM.getInsertEmissionProbability(247, 'T'), 0.2776, 0.001d); assertEquals(testHMM.getInsertEmissionProbability(1096, 'T'), 0.25, 0.001d); assertEquals(testHMM.getInsertEmissionProbability(1111, 'T'), 0.25, 0.001d); assertEquals(testHMM.getStateTransitionProbability(1, 0), 0.9634, 0.001d); assertEquals(testHMM.getStateTransitionProbability(5, 1), 0.0203, 0.001d); assertEquals(testHMM.getStateTransitionProbability(14, 3), 0.2515, 0.001d); assertEquals(testHMM.getStateTransitionProbability(65, 4), 0.78808, 0.001d); assertEquals(testHMM.getStateTransitionProbability(1080, 2), 0.01845, 0.001d); assertEquals(testHMM.getStateTransitionProbability(1111, 6), Double.NEGATIVE_INFINITY); } @Test(groups = "Functional") public void testParseAnnotations() { HMMFile testFile = new HMMFile(); HiddenMarkovModel hmm = new HiddenMarkovModel(); PA.setValue(testFile, "hmm", hmm); List nodes = new ArrayList<>(); nodes.add(new HMMNode()); // BEGIN node hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "yes"); hmm.setProperty(HMMFile.MAP, "yes"); hmm.setProperty(HMMFile.REFERENCE_ANNOTATION, "yes"); hmm.setProperty(HMMFile.CONSENSUS_STRUCTURE, "yes"); hmm.setProperty(HMMFile.MASKED_VALUE, "yes"); Scanner scanner = new Scanner("1345 t t t t"); HMMNode node = new HMMNode(); nodes.add(node); testFile.parseAnnotations(scanner, node); hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "yes"); hmm.setProperty(HMMFile.MAP, "no"); hmm.setProperty(HMMFile.REFERENCE_ANNOTATION, "yes"); hmm.setProperty(HMMFile.CONSENSUS_STRUCTURE, "no"); hmm.setProperty(HMMFile.MASKED_VALUE, "no"); Scanner scanner2 = new Scanner("- y x - -"); node = new HMMNode(); nodes.add(node); testFile.parseAnnotations(scanner2, node); hmm.setNodes(nodes); assertEquals(hmm.getNodeMapPosition(1), 1345); assertEquals(hmm.getConsensusResidue(1), 't'); assertEquals(hmm.getReferenceAnnotation(1), 't'); assertEquals(hmm.getMaskedValue(1), 't'); assertEquals(hmm.getConsensusStructure(1), 't'); scanner.close(); } /** * tests to see if file produced by the output matches the file from the input * * @throws IOException */ @Test(groups = "Functional") public void testPrint_roundTrip() throws IOException { String output = pKinase.print(); HMMFile pKinaseClone = new HMMFile( new FileParse(output, DataSourceType.PASTE)); HiddenMarkovModel pKinaseHMM = pKinase.getHMM(); HiddenMarkovModel pKinaseCloneHMM = pKinaseClone.getHMM(); checkModelsMatch(pKinaseHMM, pKinaseCloneHMM); } /** * A helper method to check two HMM models have the same values * * @param model1 * @param model2 */ protected void checkModelsMatch(HiddenMarkovModel model1, HiddenMarkovModel model2) { assertEquals(model1.getLength(), model2.getLength()); for (int i = 0; i < model1.getLength(); i++) { String msg = "For Node" + i; assertEquals(model1.getNode(i).getMatchEmissions(), model2.getNode(i).getMatchEmissions(), msg); assertEquals(model1.getNode(i).getInsertEmissions(), model2.getNode(i).getInsertEmissions(), msg); assertEquals(model1.getNode(i).getStateTransitions(), model2.getNode(i).getStateTransitions(), msg); if (i > 0) { assertEquals(model1.getNodeMapPosition(i), model2.getNodeMapPosition(i), msg); assertEquals(model1.getReferenceAnnotation(i), model2.getReferenceAnnotation(i), msg); assertEquals(model1.getConsensusResidue(i), model2.getConsensusResidue(i), msg); } } } @Test(groups = "Functional") public void testAppendProperties() throws FileNotFoundException { StringBuilder sb = new StringBuilder(); fn3.appendProperties(sb); Scanner testScanner = new Scanner(sb.toString()); String[] expected = new String[] { "HMMER3/f [3.1b1 | May 2013]", "NAME fn3", "ACC PF00041.13", "DESC Fibronectin type III domain", "LENG 86", "ALPH amino", "RF no", "MM no", "CONS yes", "CS yes", "MAP yes", "DATE Fri Jun 20 08:22:31 2014", "NSEQ 106", "EFFN 11.415833", "CKSUM 3564431818", "GA 8.00 7.20", "TC 8.00 7.20", "NC 7.90 7.90", "STATS LOCAL MSV -9.4043 0.71847", "STATS LOCAL VITERBI -9.7737 0.71847", "STATS LOCAL FORWARD -3.8341 0.71847" }; for (String value : expected) { assertEquals(testScanner.nextLine(), value); } testScanner.close(); } @Test(groups = "Functional") public void testAppendModelAsString() throws FileNotFoundException { StringBuilder sb = new StringBuilder(); fn3.appendModelAsString(sb); String string = sb.toString(); assertEquals(findValue(2, 2, 2, string), "4.42225"); assertEquals(findValue(12, 14, 1, string), "2.79307"); assertEquals(findValue(6, 24, 3, string), "0.48576"); assertEquals(findValue(19, 33, 2, string), "4.58477"); assertEquals(findValue(20, 64, 2, string), "3.61505"); assertEquals(findValue(3, 72, 3, string), "6.81068"); assertEquals(findValue(10, 80, 2, string), "2.69355"); assertEquals(findValue(16, 65, 1, string), "2.81003"); assertEquals(findValue(14, 3, 1, string), "2.69012"); assertEquals(findValue(11, 32, 1, string), "4.34805"); } /** * A helper method to find a token in the model string * * @param symbolIndex * index of symbol being searched. First symbol has index 1. * @param nodeIndex * index of node being searched. Begin node has index 0. First node * has index 1. * @param line * index of line being searched in node. First line has index 1. * @param model * string model being searched * @return value at specified position */ private String findValue(int symbolIndex, int nodeIndex, int line, String model) { String value = ""; Scanner scanner = new Scanner(model); scanner.nextLine(); scanner.nextLine(); for (int lineIndex = 0; lineIndex < line - 1; lineIndex++) { scanner.nextLine(); } for (int node = 0; node < nodeIndex; node++) { scanner.nextLine(); scanner.nextLine(); scanner.nextLine(); } for (int symbol = 0; symbol < symbolIndex; symbol++) { value = scanner.next(); if ("COMPO".equals(value)) { scanner.next(); } else if (value.length() < 7) { scanner.next(); } } scanner.close(); return value; } }