1 package jalview.datamodel;
3 import jalview.io.HMMFile;
4 import jalview.schemes.ResidueProperties;
5 import jalview.util.Comparison;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.HashMap;
10 import java.util.List;
14 * Data structure which stores a hidden Markov model
19 public class HiddenMarkovModel
21 public final static String YES = "yes";
23 public final static String NO = "no";
25 public static final int MATCHTOMATCH = 0;
27 public static final int MATCHTOINSERT = 1;
29 public static final int MATCHTODELETE = 2;
31 public static final int INSERTTOMATCH = 3;
33 public static final int INSERTTOINSERT = 4;
35 public static final int DELETETOMATCH = 5;
37 public static final int DELETETODELETE = 6;
39 private static final double LOG2 = Math.log(2);
42 * properties read from HMM file header lines
44 Map<String, String> fileProperties = new HashMap<>();
49 * the symbols used in this model e.g. "ACGT"
54 * symbol lookup index into the alphabet for 'A' to 'Z'
56 int[] symbolIndexLookup = new int['Z' - 'A' + 1];
59 * Nodes in the model. The begin node is at index 0, and contains
60 * average emission probabilities for each symbol.
62 List<HMMNode> nodes = new ArrayList<>();
65 * lookup of the HMM node for each alignment column (from 0)
67 Map<Integer, HMMNode> nodeLookup = new HashMap<>();
72 public HiddenMarkovModel()
76 public HiddenMarkovModel(HiddenMarkovModel hmm)
79 this.fileProperties = new HashMap<>(hmm.fileProperties);
80 this.alphabet = hmm.alphabet;
81 this.nodes = new ArrayList<>(hmm.nodes);
82 this.nodeLookup = new HashMap<>(hmm.nodeLookup);
83 this.symbolIndexLookup = hmm.symbolIndexLookup;
84 this.fileHeader = new String(hmm.fileHeader);
88 * Returns the information content at a specified column, calculated as the
89 * sum (over possible symbols) of the log ratio
92 * log(emission probability / background probability) / log(2)
96 * column position (base 0)
99 public float getInformationContent(int column)
101 float informationContent = 0f;
103 for (char symbol : getSymbols().toCharArray())
105 float freq = ResidueProperties.backgroundFrequencies
106 .get(getAlphabetType()).get(symbol);
107 float prob = (float) getMatchEmissionProbability(column, symbol);
108 informationContent += prob * Math.log(prob / freq);
111 informationContent = informationContent / (float) LOG2;
113 return informationContent;
117 * Gets the file header of the .hmm file this model came from
121 public String getFileHeader()
127 * Sets the file header of this model.
131 public void setFileHeader(String header)
137 * Returns the symbols used in this hidden Markov model
141 public String getSymbols()
147 * Gets the node in the hidden Markov model at the specified position.
150 * The index of the node requested. Node 0 optionally contains the
151 * average match emission probabilities across the entire model, and
152 * always contains the insert emission probabilities and state
153 * transition probabilities for the begin node. Node 1 contains the
154 * first node in the HMM that can correspond to a column in the
158 public HMMNode getNode(int nodeIndex)
160 return nodes.get(nodeIndex);
164 * Returns the name of the sequence alignment on which the HMM is based.
168 public String getName()
170 return fileProperties.get(HMMFile.NAME);
174 * Answers the string value of the property (parsed from an HMM file) for the
175 * given key, or null if the property is not present
180 public String getProperty(String key)
182 return fileProperties.get(key);
186 * Answers true if the property with the given key is present with a value of
187 * "yes" (not case-sensitive), else false
192 public boolean getBooleanProperty(String key)
194 return YES.equalsIgnoreCase(fileProperties.get(key));
198 * Returns the length of the hidden Markov model, or 0 if not known
202 public int getLength()
204 if (fileProperties.get(HMMFile.LENGTH) == null)
208 return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
212 * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
213 * the options. Other alphabets may be added.
217 public String getAlphabetType()
219 return fileProperties.get(HMMFile.ALPHABET);
223 * Sets the model alphabet to the symbols in the given string (ignoring any
224 * whitespace), and returns the number of symbols
228 public int setAlphabet(String symbols)
230 String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
231 int count = trimmed.length();
233 symbolIndexLookup = new int['Z' - 'A' + 1];
234 Arrays.fill(symbolIndexLookup, -1);
238 * save the symbols in order, and a quick lookup of symbol position
240 for (short i = 0; i < count; i++)
242 char symbol = trimmed.charAt(i);
243 if (symbol >= 'A' && symbol <= 'Z'
244 && symbolIndexLookup[symbol - 'A'] == -1)
246 symbolIndexLookup[symbol - 'A'] = i;
252 "Unexpected or duplicated character in HMM ALPHabet: "
257 return count - ignored;
261 * Sets the list of nodes in this HMM to the given list.
264 * The list of nodes to which the current list of nodes is being
267 public void setNodes(List<HMMNode> nodes)
273 * Gets the match emission probability for a given symbol at a column in the
277 * The index of the alignment column, starting at index 0. Index 0
278 * usually corresponds to index 1 in the HMM.
280 * The symbol for which the desired probability is being requested.
284 public double getMatchEmissionProbability(int alignColumn, char symbol)
286 int symbolIndex = getSymbolIndex(symbol);
287 double probability = 0d;
288 if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn))
290 HMMNode node = nodeLookup.get(alignColumn);
291 probability = node.getMatchEmission(symbolIndex);
297 * Gets the insert emission probability for a given symbol at a column in the
301 * The index of the alignment column, starting at index 0. Index 0
302 * usually corresponds to index 1 in the HMM.
304 * The symbol for which the desired probability is being requested.
308 public double getInsertEmissionProbability(int alignColumn, char symbol)
310 int symbolIndex = getSymbolIndex(symbol);
311 double probability = 0d;
312 if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn))
314 HMMNode node = nodeLookup.get(alignColumn);
315 probability = node.getInsertEmission(symbolIndex);
321 * Gets the state transition probability for a given symbol at a column in the
325 * The index of the alignment column, starting at index 0. Index 0
326 * usually corresponds to index 1 in the HMM.
328 * The symbol for which the desired probability is being requested.
332 public Double getStateTransitionProbability(int alignColumn,
335 double probability = 0d;
336 if (nodeLookup.containsKey(alignColumn))
338 HMMNode node = nodeLookup.get(alignColumn);
339 probability = node.getStateTransition(transition);
345 * Returns the alignment column linked to the node at the given index.
348 * The index of the node, starting from index 1. Index 0 is the begin
349 * node, which does not correspond to a column in the alignment.
352 public Integer getNodeAlignmentColumn(int nodeIndex)
354 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
359 * Returns the consensus residue at the specified node.
362 * The index of the specified node.
365 public char getConsensusResidue(int nodeIndex)
367 char value = nodes.get(nodeIndex).getConsensusResidue();
372 * Returns the consensus at a given alignment column. If the character is
373 * lower case, its emission probability is less than 0.5.
376 * The index of the column in the alignment for which the consensus
377 * is desired. The list of columns starts at index 0.
380 public char getConsensusAtAlignColumn(int columnIndex)
382 char mostLikely = '-';
383 if (getBooleanProperty(HMMFile.CONSENSUS_RESIDUE))
385 HMMNode node = nodeLookup.get(columnIndex);
390 mostLikely = node.getConsensusResidue();
395 double highestProb = 0;
396 for (char character : alphabet.toCharArray())
398 double prob = getMatchEmissionProbability(columnIndex, character);
399 if (prob > highestProb)
402 mostLikely = character;
405 if (highestProb < 0.5)
407 mostLikely = Character.toLowerCase(mostLikely);
415 * Returns the reference annotation at the specified node.
418 * The index of the specified node.
421 public char getReferenceAnnotation(int nodeIndex)
423 char value = nodes.get(nodeIndex).getReferenceAnnotation();
428 * Returns the mask value at the specified node.
431 * The index of the specified node.
434 public char getMaskedValue(int nodeIndex)
436 char value = nodes.get(nodeIndex).getMaskValue();
441 * Returns the consensus structure at the specified node.
444 * The index of the specified node.
447 public char getConsensusStructure(int nodeIndex)
449 char value = nodes.get(nodeIndex).getConsensusStructure();
454 * Returns the number of symbols in the alphabet used in this HMM.
458 public int getNumberOfSymbols()
460 return alphabet.length();
464 * Sets a property read from an HMM file
469 public void setProperty(String key, String value)
471 fileProperties.put(key, value);
475 * Sets the alignment column of the specified node
482 public void setAlignmentColumn(HMMNode node, int column)
484 node.setAlignmentColumn(column);
485 nodeLookup.put(column, node);
489 * Updates the mapping of nodes of the HMM to non-gapped positions of the
490 * sequence. Nodes 1, 2, 3... are mapped to the columns occupied by the first,
491 * second, third... residues of the sequence. The 'begin' node (node 0) of the
496 public void updateMapping(char[] sequence)
500 synchronized (nodeLookup)
503 for (char residue : sequence)
505 if (!Comparison.isGap(residue))
507 HMMNode node = nodes.get(nodeNo);
510 // error : too few nodes for sequence
513 setAlignmentColumn(node, column);
522 * Clears all data in the node lookup map
524 public void clearNodeLookup()
530 * Sets the reference annotation at a given node
535 public void setReferenceAnnotation(int nodeIndex, char value)
537 nodes.get(nodeIndex).setReferenceAnnotation(value);
541 * Sets the consensus residue at a given node
546 public void setConsensusResidue(int nodeIndex, char value)
548 nodes.get(nodeIndex).setConsensusResidue(value);
552 * Sets the consensus structure at a given node
557 public void setConsensusStructure(int nodeIndex, char value)
559 nodes.get(nodeIndex).setConsensusStructure(value);
563 * Sets the mask value at a given node
568 public void setMaskValue(int nodeIndex, char value)
570 nodes.get(nodeIndex).setMaskValue(value);
574 * Temporary implementation, should not be used.
578 public String getViterbi()
581 value = fileProperties.get(HMMFile.VITERBI);
586 * Temporary implementation, should not be used.
590 public String getMSV()
593 value = fileProperties.get(HMMFile.MSV);
598 * Temporary implementation, should not be used.
602 public String getForward()
605 value = fileProperties.get(HMMFile.FORWARD);
610 * Answers the HMMNode mapped to the given alignment column (base 0), or null
613 * @param alignmentColumn
615 public HMMNode getNodeForColumn(int alignmentColumn)
617 return nodeLookup.get(alignmentColumn);
621 * Returns the consensus sequence based on the most probable symbol at each
622 * position. The sequence is adjusted to match the length of the existing
623 * sequence alignment. Gap characters are used as padding.
627 public Sequence getConsensusSequence()
632 start = getNodeAlignmentColumn(1);
633 modelLength = getLength();
634 end = getNodeAlignmentColumn(modelLength);
635 char[] sequence = new char[end + 1];
636 for (int index = 0; index < end + 1; index++)
640 character = getConsensusAtAlignColumn(index);
642 if (character == null || character == '-')
644 sequence[index] = '-';
648 sequence[index] = Character.toUpperCase(character);
653 Sequence seq = new Sequence(getName(), sequence, start,
660 * Initiates a HMM consensus sequence
662 * @return A new HMM consensus sequence
664 public SequenceI initHMMSequence()
666 Sequence consensus = getConsensusSequence();
667 consensus.setIsHMMConsensusSequence(true);
668 consensus.setHMM(this);
673 * Answers the index position (0...) of the given symbol, or -1 if not a valid
674 * symbol for this HMM
679 public int getSymbolIndex(char symbol)
682 * symbolIndexLookup holds the index for 'A' to 'Z'
684 char c = Character.toUpperCase(symbol);
685 if ('A' <= c && c <= 'Z')
687 return symbolIndexLookup[c - 'A'];
692 public void addNode(HMMNode node)