X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=test%2Fjalview%2Fio%2FHMMFileTest.java;fp=test%2Fjalview%2Fio%2FHMMFileTest.java;h=cf74f5581c78f1cf3c179a53dd1292044b332e87;hb=6398c0217546a4d6943f63f670c2684c7c7274be;hp=0000000000000000000000000000000000000000;hpb=9b85e1552fa57f02cf6cd312cfbd7efdfd079ea3;p=jalview.git diff --git a/test/jalview/io/HMMFileTest.java b/test/jalview/io/HMMFileTest.java new file mode 100644 index 0000000..cf74f55 --- /dev/null +++ b/test/jalview/io/HMMFileTest.java @@ -0,0 +1,531 @@ +package jalview.io; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +import jalview.datamodel.HMMNode; +import jalview.datamodel.HiddenMarkovModel; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Scanner; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import junit.extensions.PA; + +public class HMMFileTest { + + HMMFile fn3; + + HMMFile pKinase; + + HMMFile made1; + + @BeforeClass(alwaysRun = true) + public void setUp() throws IOException + { + fn3 = new HMMFile("test/jalview/io/test_fn3_hmm.txt", + DataSourceType.FILE); + + pKinase = new HMMFile("test/jalview/io/test_PKinase_hmm.txt", + DataSourceType.FILE); + + made1 = new HMMFile("test/jalview/io/test_MADE1_hmm.txt", + DataSourceType.FILE); + } + + @Test(groups = "Functional") + public void testParse() throws IOException + { + HiddenMarkovModel hmm = pKinase.getHMM(); + assertEquals(hmm.getName(), "Pkinase"); + assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00069.17"); + assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), + "Protein kinase domain"); + assertEquals(hmm.getLength(), 260); + assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); + assertEquals(hmm.getAlphabetType(), "amino"); + assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); + assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); + assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); + assertEquals(hmm.getProperty(HMMFile.DATE), "Thu Jun 16 11:44:06 2011"); + assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); + assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "54"); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), + "3.358521"); + assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3106786190"); + assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), + "70.30 70.30"); + assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "70.30 70.30"); + assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "70.20 70.20"); + + assertEquals(hmm.getSymbols(), "ACDEFGHIKLMNPQRSTVWY"); + + assertEquals(hmm.getMatchEmissionProbability(0, 'Y'), 0.16102, 0.001d); + assertEquals(hmm.getMatchEmissionProbability(11, 'P'), 0.0130, 0.001d); + assertEquals(hmm.getMatchEmissionProbability(24, 'I'), 0.02583, 0.001d); + assertEquals(hmm.getMatchEmissionProbability(83, 'C'), 0.008549, + 0.001d); + assertEquals(hmm.getMatchEmissionProbability(332, 'E'), 0.07998, + 0.001d); + assertEquals(hmm.getMatchEmissionProbability(381, 'D'), 0.014465, + 0.001d); + assertEquals(hmm.getMatchEmissionProbability(475, 'Y'), 0.02213, + 0.001d); + + assertEquals(hmm.getInsertEmissionProbability(1, 'C'), 0.012, 0.001d); + assertEquals(hmm.getInsertEmissionProbability(14, 'H'), 0.02411, + 0.001d); + assertEquals(hmm.getInsertEmissionProbability(23, 'L'), 0.06764, + 0.001d); + assertEquals(hmm.getInsertEmissionProbability(90, 'D'), 0.0623, 0.001d); + assertEquals(hmm.getInsertEmissionProbability(374, 'T'), 0.0623, + 0.001d); + assertEquals(hmm.getInsertEmissionProbability(470, 'P'), 0.0647, + 0.001d); + + assertEquals(hmm.getStateTransitionProbability(2, 6), 0.3848, 0.001d); + assertEquals(hmm.getStateTransitionProbability(38, 3), 0.5382, 0.001d); + assertEquals(hmm.getStateTransitionProbability(305, 3), 0.2916, 0.001d); + assertEquals(hmm.getStateTransitionProbability(380, 0), 0.99, 0.001d); + assertEquals(hmm.getStateTransitionProbability(453, 1), 0.0066, 0.001d); + + assertEquals(hmm.getNodeMapPosition(3), 3); + assertEquals(hmm.getReferenceAnnotation(7), '-'); + assertEquals(hmm.getConsensusResidue(23), 't'); + assertEquals(hmm.getMaskedValue(30), '-'); + assertEquals(hmm.getConsensusStructure(56), 'S'); + + assertEquals(hmm.getNodeMapPosition(78), 136); + assertEquals(hmm.getReferenceAnnotation(93), '-'); + assertEquals(hmm.getConsensusResidue(145), 'a'); + assertEquals(hmm.getMaskedValue(183), '-'); + assertEquals(hmm.getConsensusStructure(240), 'H'); + } + + /** + * Test that Jalview can parse an HMM file even with a bunch of 'mandatory' + * fields missing (including no MAP annotation or // terminator line) + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testParse_minimalFile() throws IOException + { + /* + * ALPH is absent, alphabet inferred from HMM header line + * Optional COMPO line is absent + * first line after HMM is a guide line for readability + * next line is BEGIN node insert emissions + * next line is BEGIN node transitions + * next line is first sequence node match emissions 1.1 1.2 1.3 + * next line is first sequence node insert emissions 1.4 1.5 1.6 + * last line is first sequence node transitions + */ + //@formatter:off + String hmmData = + "HMMER3\n" + + "HMM P M J\n" + + // both spec and parser require a line after the HMM line + " m->m m->i m->d i->m i->i d->m d->d\n" + + " 0.1 0.2 0.3\n" + + " 0.4 0.5 0.6 0.7 0.8 0.9 0.95\n" + + " 1 1.1 1.2 1.3 - - - - -\n" + + " 1.4 1.5 1.6\n" + + " 1.7 1.8 1.9 2.0 2.1 2.2 2.3\n" + + " 2 1.01 1.02 1.03 - - - - -\n" + + " 1.04 1.05 1.06\n" + + " 1.7 1.8 1.9 2.0 2.1 2.2 2.3\n"; + //@formatter:on + HMMFile parser = new HMMFile(hmmData, DataSourceType.PASTE); + HiddenMarkovModel hmm = parser.getHMM(); + assertNotNull(hmm); + assertEquals(hmm.getSymbols(), "PMJ"); + // no LENG property: this should return node count excluding BEGIN node + assertEquals(hmm.getLength(), 2); + + // node 1 (implicitly mapped to column 0) + double prob = hmm.getMatchEmissionProbability(0, 'p'); + assertEquals(prob, Math.pow(Math.E, -1.1)); + prob = hmm.getInsertEmissionProbability(0, 'J'); + assertEquals(prob, Math.pow(Math.E, -1.6)); + + // node 2 (implicitly mapped to column 1) + prob = hmm.getMatchEmissionProbability(1, 'M'); + assertEquals(prob, Math.pow(Math.E, -1.02)); + prob = hmm.getInsertEmissionProbability(1, 'm'); + assertEquals(prob, Math.pow(Math.E, -1.05)); + } + + @Test(groups = "Functional") + public void testParseHeaderLines_amino() throws IOException + { + FileReader fr = new FileReader( + new File("test/jalview/io/test_fn3_hmm.txt")); + BufferedReader br = new BufferedReader(fr); + HiddenMarkovModel hmm = new HiddenMarkovModel(); + HMMFile testee = new HMMFile(); + PA.setValue(testee, "hmm", hmm); + testee.parseHeaderLines(br); + br.close(); + fr.close(); + + assertEquals(hmm.getName(), "fn3"); + assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00041.13"); + assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), + "Fibronectin type III domain"); + assertEquals(hmm.getProperty(HMMFile.LENGTH), "86"); + assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); + assertEquals(hmm.getAlphabetType(), "amino"); + assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); + assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); + + assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); + assertEquals(hmm.getProperty(HMMFile.DATE), "Fri Jun 20 08:22:31 2014"); + assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); + assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "106"); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), + "11.415833"); + assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3564431818"); + assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), "8.00 7.20"); + assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "8.00 7.20"); + assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "7.90 7.90"); + assertEquals(hmm.getViterbi(), "-9.7737 0.71847"); + assertEquals(hmm.getMSV(), "-9.4043 0.71847"); + assertEquals(hmm.getForward(), "-3.8341 0.71847"); + } + + @Test(groups = "Functional") + public void testParseHeaderLines_dna() throws IOException + { + FileReader fr = new FileReader( + new File("test/jalview/io/test_MADE1_hmm.txt")); + BufferedReader br = new BufferedReader(fr); + HiddenMarkovModel hmm = new HiddenMarkovModel(); + HMMFile testee = new HMMFile(); + PA.setValue(testee, "hmm", hmm); + testee.parseHeaderLines(br); + br.close(); + fr.close(); + + assertEquals(hmm.getName(), "MADE1"); + assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), + "DF0000629.2"); + assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), + "MADE1 (MAriner Derived Element 1), a TcMar-Mariner DNA transposon"); + assertEquals(hmm.getProperty(HMMFile.LENGTH), "80"); + assertEquals(hmm.getProperty(HMMFile.MAX_LENGTH), "426"); + assertEquals(hmm.getAlphabetType(), "DNA"); + assertTrue(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); + assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); + assertFalse(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); + assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); + assertEquals(hmm.getProperty(HMMFile.DATE), "Tue Feb 19 20:33:41 2013"); + assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); + assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "1997"); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), "3.911818"); + assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3015610723"); + assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), + "2.324 4.234"); + assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "2.343 1.212"); + assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "2.354 5.456"); + assertEquals(hmm.getViterbi(), "-9.3632 0.71858"); + assertEquals(hmm.getMSV(), "-8.5786 0.71858"); + assertEquals(hmm.getForward(), "-3.4823 0.71858"); + } + + @Test(groups = "Functional") + public void testFillList() throws IOException + { + Scanner scanner1 = new Scanner("1.3 2.4 5.3 3.9 9.8 4.7 4.3 2.3 6.9"); + ArrayList filledArray = new ArrayList<>(); + + filledArray.add(0.27253); + filledArray.add(0.0907); + filledArray.add(0.00499); + filledArray.add(0.02024); + filledArray.add(0.00005); + filledArray.add(0.00909); + filledArray.add(0.01357); + filledArray.add(0.10026); + filledArray.add(0.001); + + double[] testList = HMMFile.parseDoubles(scanner1, 9); + + for (int i = 0; i < 9; i++) + { + assertEquals(testList[i], filledArray.get(i), 0.001d); + } + + filledArray.clear(); + scanner1.close(); + + Scanner scanner2 = new Scanner( + "1.346 5.554 35.345 5.64 1.4"); + filledArray.add(0.2603); + filledArray.add(0.00387); + filledArray.add(0d); + filledArray.add(0.00355); + filledArray.add(0.2466); + + testList = HMMFile.parseDoubles(scanner2, 5); + + for (int i = 0; i < 5; i++) + { + assertEquals(testList[i], filledArray.get(i), 0.001d); + } + } + + @Test(groups = "Functional") + public void testParseModel() throws IOException + { + FileReader fr = new FileReader( + new File("test/jalview/io/test_MADE1_hmm.txt")); + BufferedReader br = new BufferedReader(fr); + HiddenMarkovModel testHMM = new HiddenMarkovModel(); + String line = null; + do + { + line = br.readLine(); // skip header lines up to HMM plus one + } while (!line.startsWith("HMM ")); + br.readLine(); + + made1.parseModel(br); + testHMM = made1.getHMM(); + + br.close(); + fr.close(); + + assertEquals(testHMM.getMatchEmissionProbability(1, 'C'), 0.09267, + 0.001d); + assertEquals(testHMM.getMatchEmissionProbability(25, 'G'), 0.07327, + 0.001d); + assertEquals(testHMM.getMatchEmissionProbability(1092, 'C'), 0.04184, + 0.001d); + assertEquals(testHMM.getMatchEmissionProbability(1107, 'G'), 0.07, + 0.001d); + + assertEquals(testHMM.getInsertEmissionProbability(0, 'G'), 0.25, + 0.001d); + assertEquals(testHMM.getInsertEmissionProbability(247, 'T'), 0.2776, + 0.001d); + assertEquals(testHMM.getInsertEmissionProbability(1096, 'T'), 0.25, + 0.001d); + assertEquals(testHMM.getInsertEmissionProbability(1111, 'T'), 0.25, + 0.001d); + + assertEquals(testHMM.getStateTransitionProbability(1, 0), 0.9634, + 0.001d); + assertEquals(testHMM.getStateTransitionProbability(5, 1), 0.0203, + 0.001d); + assertEquals(testHMM.getStateTransitionProbability(14, 3), 0.2515, + 0.001d); + assertEquals(testHMM.getStateTransitionProbability(65, 4), 0.78808, + 0.001d); + assertEquals(testHMM.getStateTransitionProbability(1080, 2), 0.01845, + 0.001d); + assertEquals(testHMM.getStateTransitionProbability(1111, 6), + Double.NEGATIVE_INFINITY); + } + + @Test(groups = "Functional") + public void testParseAnnotations() + { + HMMFile testFile = new HMMFile(); + HiddenMarkovModel hmm = new HiddenMarkovModel(); + PA.setValue(testFile, "hmm", hmm); + + List nodes = new ArrayList<>(); + nodes.add(new HMMNode()); // BEGIN node + + hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "yes"); + hmm.setProperty(HMMFile.MAP, "yes"); + hmm.setProperty(HMMFile.REFERENCE_ANNOTATION, "yes"); + hmm.setProperty(HMMFile.CONSENSUS_STRUCTURE, "yes"); + hmm.setProperty(HMMFile.MASKED_VALUE, "yes"); + Scanner scanner = new Scanner("1345 t t t t"); + HMMNode node = new HMMNode(); + nodes.add(node); + testFile.parseAnnotations(scanner, node); + + hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "yes"); + hmm.setProperty(HMMFile.MAP, "no"); + hmm.setProperty(HMMFile.REFERENCE_ANNOTATION, "yes"); + hmm.setProperty(HMMFile.CONSENSUS_STRUCTURE, "no"); + hmm.setProperty(HMMFile.MASKED_VALUE, "no"); + Scanner scanner2 = new Scanner("- y x - -"); + node = new HMMNode(); + nodes.add(node); + testFile.parseAnnotations(scanner2, node); + + hmm.setNodes(nodes); + + assertEquals(hmm.getNodeMapPosition(1), 1345); + assertEquals(hmm.getConsensusResidue(1), 't'); + assertEquals(hmm.getReferenceAnnotation(1), 't'); + assertEquals(hmm.getMaskedValue(1), 't'); + assertEquals(hmm.getConsensusStructure(1), 't'); + + scanner.close(); + } + + /** + * tests to see if file produced by the output matches the file from the input + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testPrint_roundTrip() throws IOException + { + String output = pKinase.print(); + HMMFile pKinaseClone = new HMMFile( + new FileParse(output, DataSourceType.PASTE)); + HiddenMarkovModel pKinaseHMM = pKinase.getHMM(); + HiddenMarkovModel pKinaseCloneHMM = pKinaseClone.getHMM(); + + checkModelsMatch(pKinaseHMM, pKinaseCloneHMM); + } + + /** + * A helper method to check two HMM models have the same values + * + * @param model1 + * @param model2 + */ + protected void checkModelsMatch(HiddenMarkovModel model1, + HiddenMarkovModel model2) + { + assertEquals(model1.getLength(), model2.getLength()); + + for (int i = 0; i < model1.getLength(); i++) + { + String msg = "For Node" + i; + assertEquals(model1.getNode(i).getMatchEmissions(), + model2.getNode(i).getMatchEmissions(), msg); + assertEquals(model1.getNode(i).getInsertEmissions(), + model2.getNode(i).getInsertEmissions(), msg); + assertEquals(model1.getNode(i).getStateTransitions(), + model2.getNode(i).getStateTransitions(), msg); + + if (i > 0) + { + assertEquals(model1.getNodeMapPosition(i), + model2.getNodeMapPosition(i), msg); + assertEquals(model1.getReferenceAnnotation(i), + model2.getReferenceAnnotation(i), msg); + assertEquals(model1.getConsensusResidue(i), + model2.getConsensusResidue(i), msg); + } + } + } + + @Test(groups = "Functional") + public void testAppendProperties() throws FileNotFoundException + { + StringBuilder sb = new StringBuilder(); + fn3.appendProperties(sb); + + Scanner testScanner = new Scanner(sb.toString()); + + String[] expected = new String[] { "HMMER3/f [3.1b1 | May 2013]", + "NAME fn3", "ACC PF00041.13", + "DESC Fibronectin type III domain", "LENG 86", "ALPH amino", + "RF no", "MM no", "CONS yes", "CS yes", "MAP yes", + "DATE Fri Jun 20 08:22:31 2014", "NSEQ 106", "EFFN 11.415833", + "CKSUM 3564431818", "GA 8.00 7.20", "TC 8.00 7.20", + "NC 7.90 7.90", "STATS LOCAL MSV -9.4043 0.71847", + "STATS LOCAL VITERBI -9.7737 0.71847", + "STATS LOCAL FORWARD -3.8341 0.71847" }; + + for (String value : expected) + { + assertEquals(testScanner.nextLine(), value); + } + + testScanner.close(); + } + + @Test(groups = "Functional") + public void testAppendModelAsString() throws FileNotFoundException + { + StringBuilder sb = new StringBuilder(); + fn3.appendModelAsString(sb); + String string = sb.toString(); + + assertEquals(findValue(2, 2, 2, string), "4.42225"); + assertEquals(findValue(12, 14, 1, string), "2.79307"); + assertEquals(findValue(6, 24, 3, string), "0.48576"); + assertEquals(findValue(19, 33, 2, string), "4.58477"); + assertEquals(findValue(20, 64, 2, string), "3.61505"); + assertEquals(findValue(3, 72, 3, string), "6.81068"); + assertEquals(findValue(10, 80, 2, string), "2.69355"); + assertEquals(findValue(16, 65, 1, string), "2.81003"); + assertEquals(findValue(14, 3, 1, string), "2.69012"); + assertEquals(findValue(11, 32, 1, string), "4.34805"); + } + + /** + * A helper method to find a token in the model string + * + * @param symbolIndex + * index of symbol being searched. First symbol has index 1. + * @param nodeIndex + * index of node being searched. Begin node has index 0. First node + * has index 1. + * @param line + * index of line being searched in node. First line has index 1. + * @param model + * string model being searched + * @return value at specified position + */ + private String findValue(int symbolIndex, int nodeIndex, int line, + String model) + { + String value = ""; + Scanner scanner = new Scanner(model); + scanner.nextLine(); + scanner.nextLine(); + + for (int lineIndex = 0; lineIndex < line - 1; lineIndex++) + { + scanner.nextLine(); + } + for (int node = 0; node < nodeIndex; node++) + { + scanner.nextLine(); + scanner.nextLine(); + scanner.nextLine(); + } + + for (int symbol = 0; symbol < symbolIndex; symbol++) + { + value = scanner.next(); + if ("COMPO".equals(value)) + { + scanner.next(); + } + else if (value.length() < 7) + { + scanner.next(); + } + } + scanner.close(); + return value; + } +} +