package jalview.datamodel; import jalview.io.HMMFile; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; /** * Data structure which stores a hidden Markov model * * @author TZVanaalten * */ public class HiddenMarkovModel { public final static String YES = "yes"; public final static String NO = "no"; public static final int MATCHTOMATCH = 0; public static final int MATCHTOINSERT = 1; public static final int MATCHTODELETE = 2; public static final int INSERTTOMATCH = 3; public static final int INSERTTOINSERT = 4; public static final int DELETETOMATCH = 5; public static final int DELETETODELETE = 6; private static final double LOG2 = Math.log(2); /* * properties read from HMM file header lines */ Map fileProperties = new HashMap<>(); String fileHeader; /* * the symbols used in this model e.g. "ACGT" */ String alphabet; /* * symbol lookup index into the alphabet for 'A' to 'Z' */ int[] symbolIndexLookup = new int['Z' - 'A' + 1]; /* * Nodes in the model. The begin node is at index 0, and contains * average emission probabilities for each symbol. */ List nodes = new ArrayList<>(); /* * lookup of the HMM node for each alignment column (from 0) */ Map nodeLookup = new HashMap<>(); /** * Constructor */ public HiddenMarkovModel() { } public HiddenMarkovModel(HiddenMarkovModel hmm) { super(); this.fileProperties = new HashMap<>(hmm.fileProperties); this.alphabet = hmm.alphabet; this.nodes = new ArrayList<>(hmm.nodes); this.nodeLookup = new HashMap<>(hmm.nodeLookup); this.symbolIndexLookup = hmm.symbolIndexLookup; this.fileHeader = new String(hmm.fileHeader); } /** * Returns the information content at a specified column, calculated as the * sum (over possible symbols) of the log ratio * *

   *  log(emission probability / background probability) / log(2)
   *

* * @param column * column position (base 0) * @return */ public float getInformationContent(int column) { float informationContent = 0f; for (char symbol : getSymbols().toCharArray()) { float freq = ResidueProperties.backgroundFrequencies .get(getAlphabetType()).get(symbol); float prob = (float) getMatchEmissionProbability(column, symbol); informationContent += prob * Math.log(prob / freq); } informationContent = informationContent / (float) LOG2; return informationContent; } /** * Gets the file header of the .hmm file this model came from * * @return */ public String getFileHeader() { return fileHeader; } /** * Sets the file header of this model. * * @param header */ public void setFileHeader(String header) { fileHeader = header; } /** * Returns the symbols used in this hidden Markov model * * @return */ public String getSymbols() { return alphabet; } /** * Gets the node in the hidden Markov model at the specified position. * * @param nodeIndex * The index of the node requested. Node 0 optionally contains the * average match emission probabilities across the entire model, and * always contains the insert emission probabilities and state * transition probabilities for the begin node. Node 1 contains the * first node in the HMM that can correspond to a column in the * alignment. * @return */ public HMMNode getNode(int nodeIndex) { return nodes.get(nodeIndex); } /** * Returns the name of the sequence alignment on which the HMM is based. * * @return */ public String getName() { return fileProperties.get(HMMFile.NAME); } /** * Answers the string value of the property (parsed from an HMM file) for the * given key, or null if the property is not present * * @param key * @return */ public String getProperty(String key) { return fileProperties.get(key); } /** * Answers true if the property with the given key is present with a value of * "yes" (not case-sensitive), else false * * @param key * @return */ public boolean getBooleanProperty(String key) { return YES.equalsIgnoreCase(fileProperties.get(key)); } /** * Returns the length of the hidden Markov model, or 0 if not known * * @return */ public int getLength() { if (fileProperties.get(HMMFile.LENGTH) == null) { return 0; } return Integer.parseInt(fileProperties.get(HMMFile.LENGTH)); } /** * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are * the options. Other alphabets may be added. * * @return */ public String getAlphabetType() { return fileProperties.get(HMMFile.ALPHABET); } /** * Sets the model alphabet to the symbols in the given string (ignoring any * whitespace), and returns the number of symbols * * @param symbols */ public int setAlphabet(String symbols) { String trimmed = symbols.toUpperCase().replaceAll("\\s", ""); int count = trimmed.length(); alphabet = trimmed; symbolIndexLookup = new int['Z' - 'A' + 1]; Arrays.fill(symbolIndexLookup, -1); int ignored = 0; /* * save the symbols in order, and a quick lookup of symbol position */ for (short i = 0; i < count; i++) { char symbol = trimmed.charAt(i); if (symbol >= 'A' && symbol <= 'Z' && symbolIndexLookup[symbol - 'A'] == -1) { symbolIndexLookup[symbol - 'A'] = i; } else { System.err .println( "Unexpected or duplicated character in HMM ALPHabet: " + symbol); ignored++; } } return count - ignored; } /** * Sets the list of nodes in this HMM to the given list. * * @param nodes * The list of nodes to which the current list of nodes is being * changed. */ public void setNodes(List nodes) { this.nodes = nodes; } /** * Gets the match emission probability for a given symbol at a column in the * alignment. * * @param alignColumn * The index of the alignment column, starting at index 0. Index 0 * usually corresponds to index 1 in the HMM. * @param symbol * The symbol for which the desired probability is being requested. * @return * */ public double getMatchEmissionProbability(int alignColumn, char symbol) { int symbolIndex = getSymbolIndex(symbol); double probability = 0d; if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn)) { HMMNode node = nodeLookup.get(alignColumn); probability = node.getMatchEmission(symbolIndex); } return probability; } /** * Gets the insert emission probability for a given symbol at a column in the * alignment. * * @param alignColumn * The index of the alignment column, starting at index 0. Index 0 * usually corresponds to index 1 in the HMM. * @param symbol * The symbol for which the desired probability is being requested. * @return * */ public double getInsertEmissionProbability(int alignColumn, char symbol) { int symbolIndex = getSymbolIndex(symbol); double probability = 0d; if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn)) { HMMNode node = nodeLookup.get(alignColumn); probability = node.getInsertEmission(symbolIndex); } return probability; } /** * Gets the state transition probability for a given symbol at a column in the * alignment. * * @param alignColumn * The index of the alignment column, starting at index 0. Index 0 * usually corresponds to index 1 in the HMM. * @param symbol * The symbol for which the desired probability is being requested. * @return * */ public Double getStateTransitionProbability(int alignColumn, int transition) { double probability = 0d; if (nodeLookup.containsKey(alignColumn)) { HMMNode node = nodeLookup.get(alignColumn); probability = node.getStateTransition(transition); } return probability; } /** * Returns the alignment column linked to the node at the given index. * * @param nodeIndex * The index of the node, starting from index 1. Index 0 is the begin * node, which does not correspond to a column in the alignment. * @return */ public Integer getNodeAlignmentColumn(int nodeIndex) { Integer value = nodes.get(nodeIndex).getAlignmentColumn(); return value; } /** * Returns the consensus residue at the specified node. * * @param nodeIndex * The index of the specified node. * @return */ public char getConsensusResidue(int nodeIndex) { char value = nodes.get(nodeIndex).getConsensusResidue(); return value; } /** * Returns the consensus at a given alignment column. If the character is * lower case, its emission probability is less than 0.5. * * @param columnIndex * The index of the column in the alignment for which the consensus * is desired. The list of columns starts at index 0. * @return */ public char getConsensusAtAlignColumn(int columnIndex) { char mostLikely = '-'; if (getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)) { HMMNode node = nodeLookup.get(columnIndex); if (node == null) { return '-'; } mostLikely = node.getConsensusResidue(); return mostLikely; } else { double highestProb = 0; for (char character : alphabet.toCharArray()) { double prob = getMatchEmissionProbability(columnIndex, character); if (prob > highestProb) { highestProb = prob; mostLikely = character; } } if (highestProb < 0.5) { mostLikely = Character.toLowerCase(mostLikely); } return mostLikely; } } /** * Returns the reference annotation at the specified node. * * @param nodeIndex * The index of the specified node. * @return */ public char getReferenceAnnotation(int nodeIndex) { char value = nodes.get(nodeIndex).getReferenceAnnotation(); return value; } /** * Returns the mask value at the specified node. * * @param nodeIndex * The index of the specified node. * @return */ public char getMaskedValue(int nodeIndex) { char value = nodes.get(nodeIndex).getMaskValue(); return value; } /** * Returns the consensus structure at the specified node. * * @param nodeIndex * The index of the specified node. * @return */ public char getConsensusStructure(int nodeIndex) { char value = nodes.get(nodeIndex).getConsensusStructure(); return value; } /** * Returns the number of symbols in the alphabet used in this HMM. * * @return */ public int getNumberOfSymbols() { return alphabet.length(); } /** * Sets a property read from an HMM file * * @param key * @param value */ public void setProperty(String key, String value) { fileProperties.put(key, value); } /** * Sets the alignment column of the specified node * * @param nodeIndex * * @param column * */ public void setAlignmentColumn(HMMNode node, int column) { node.setAlignmentColumn(column); nodeLookup.put(column, node); } /** * Updates the mapping of nodes of the HMM to non-gapped positions of the * sequence. Nodes 1, 2, 3... are mapped to the columns occupied by the first, * second, third... residues of the sequence. The 'begin' node (node 0) of the * HMM is not mapped. * * @param sequence */ public void updateMapping(char[] sequence) { int nodeNo = 1; int column = 0; synchronized (nodeLookup) { clearNodeLookup(); for (char residue : sequence) { if (!Comparison.isGap(residue)) { HMMNode node = nodes.get(nodeNo); if (node == null) { // error : too few nodes for sequence break; } setAlignmentColumn(node, column); nodeNo++; } column++; } } } /** * Clears all data in the node lookup map */ public void clearNodeLookup() { nodeLookup.clear(); } /** * Sets the reference annotation at a given node * * @param nodeIndex * @param value */ public void setReferenceAnnotation(int nodeIndex, char value) { nodes.get(nodeIndex).setReferenceAnnotation(value); } /** * Sets the consensus residue at a given node * * @param nodeIndex * @param value */ public void setConsensusResidue(int nodeIndex, char value) { nodes.get(nodeIndex).setConsensusResidue(value); } /** * Sets the consensus structure at a given node * * @param nodeIndex * @param value */ public void setConsensusStructure(int nodeIndex, char value) { nodes.get(nodeIndex).setConsensusStructure(value); } /** * Sets the mask value at a given node * * @param nodeIndex * @param value */ public void setMaskValue(int nodeIndex, char value) { nodes.get(nodeIndex).setMaskValue(value); } /** * Temporary implementation, should not be used. * * @return */ public String getViterbi() { String value; value = fileProperties.get(HMMFile.VITERBI); return value; } /** * Temporary implementation, should not be used. * * @return */ public String getMSV() { String value; value = fileProperties.get(HMMFile.MSV); return value; } /** * Temporary implementation, should not be used. * * @return */ public String getForward() { String value; value = fileProperties.get(HMMFile.FORWARD); return value; } /** * Answers the HMMNode mapped to the given alignment column (base 0), or null * if none is mapped * * @param alignmentColumn */ public HMMNode getNodeForColumn(int alignmentColumn) { return nodeLookup.get(alignmentColumn); } /** * Returns the consensus sequence based on the most probable symbol at each * position. The sequence is adjusted to match the length of the existing * sequence alignment. Gap characters are used as padding. * * @return */ public Sequence getConsensusSequence() { int start; int end; int modelLength; start = getNodeAlignmentColumn(1); modelLength = getLength(); end = getNodeAlignmentColumn(modelLength); char[] sequence = new char[end + 1]; for (int index = 0; index < end + 1; index++) { Character character; character = getConsensusAtAlignColumn(index); if (character == null || character == '-') { sequence[index] = '-'; } else { sequence[index] = Character.toUpperCase(character); } } Sequence seq = new Sequence(getName(), sequence, start, end); return seq; } /** * Initiates a HMM consensus sequence * * @return A new HMM consensus sequence */ public SequenceI initHMMSequence() { Sequence consensus = getConsensusSequence(); consensus.setIsHMMConsensusSequence(true); consensus.setHMM(this); return consensus; } /** * Answers the index position (0...) of the given symbol, or -1 if not a valid * symbol for this HMM * * @param symbol * @return */ public int getSymbolIndex(char symbol) { /* * symbolIndexLookup holds the index for 'A' to 'Z' */ char c = Character.toUpperCase(symbol); if ('A' <= c && c <= 'Z') { return symbolIndexLookup[c - 'A']; } return -1; } public void addNode(HMMNode node) { nodes.add(node); } }