1 package jalview.datamodel;
3 import jalview.bin.Cache;
4 import jalview.io.HMMFile;
5 import jalview.schemes.ResidueProperties;
6 import jalview.util.Comparison;
7 import jalview.util.MapList;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.HashMap;
12 import java.util.List;
16 * Data structure which stores a hidden Markov model
21 public class HiddenMarkovModel
23 private static final char GAP_DASH = '-';
25 public final static String YES = "yes";
27 public final static String NO = "no";
29 public static final int MATCHTOMATCH = 0;
31 public static final int MATCHTOINSERT = 1;
33 public static final int MATCHTODELETE = 2;
35 public static final int INSERTTOMATCH = 3;
37 public static final int INSERTTOINSERT = 4;
39 public static final int DELETETOMATCH = 5;
41 public static final int DELETETODELETE = 6;
43 private static final double LOG2 = Math.log(2);
46 * properties read from HMM file header lines
48 private Map<String, String> fileProperties = new HashMap<>();
50 private String fileHeader;
53 * the symbols used in this model e.g. "ACGT"
55 private String alphabet;
58 * symbol lookup index into the alphabet for 'A' to 'Z'
60 private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
63 * Nodes in the model. The begin node is at index 0, and contains
64 * average emission probabilities for each symbol.
66 private List<HMMNode> nodes = new ArrayList<>();
69 * the aligned HMM consensus sequence extracted from the HMM profile
71 private SequenceI hmmSeq;
74 * mapping from HMM nodes to residues of the hmm consensus sequence
76 private Mapping mapToHmmConsensus;
81 public HiddenMarkovModel()
86 * Copy constructor given a new aligned sequence with which to associate the
92 public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq)
95 this.fileProperties = new HashMap<>(hmm.fileProperties);
96 this.alphabet = hmm.alphabet;
97 this.nodes = new ArrayList<>(hmm.nodes);
98 this.symbolIndexLookup = hmm.symbolIndexLookup;
99 this.fileHeader = new String(hmm.fileHeader);
101 if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo())
103 this.mapToHmmConsensus = hmm.mapToHmmConsensus;
107 Cache.log.error("Error: HMM copied with change of mapped sequence");
112 * Returns the information content at a specified column, calculated as the
113 * sum (over possible symbols) of the log ratio
116 * log(emission probability / background probability) / log(2)
120 * column position (base 0)
123 public float getInformationContent(int column)
125 float informationContent = 0f;
127 for (char symbol : getSymbols().toCharArray())
129 float freq = ResidueProperties.backgroundFrequencies
130 .get(getAlphabetType()).get(symbol);
131 float prob = (float) getMatchEmissionProbability(column, symbol);
132 informationContent += prob * Math.log(prob / freq);
135 informationContent = informationContent / (float) LOG2;
137 return informationContent;
141 * Gets the file header of the .hmm file this model came from
145 public String getFileHeader()
151 * Sets the file header of this model.
155 public void setFileHeader(String header)
161 * Returns the symbols used in this hidden Markov model
165 public String getSymbols()
171 * Gets the node in the hidden Markov model at the specified position.
174 * The index of the node requested. Node 0 optionally contains the
175 * average match emission probabilities across the entire model, and
176 * always contains the insert emission probabilities and state
177 * transition probabilities for the begin node. Node 1 contains the
178 * first node in the HMM that can correspond to a column in the
182 public HMMNode getNode(int nodeIndex)
184 return nodes.get(nodeIndex);
188 * Returns the name of the sequence alignment on which the HMM is based.
192 public String getName()
194 return fileProperties.get(HMMFile.NAME);
198 * Answers the string value of the property (parsed from an HMM file) for the
199 * given key, or null if the property is not present
204 public String getProperty(String key)
206 return fileProperties.get(key);
210 * Answers true if the property with the given key is present with a value of
211 * "yes" (not case-sensitive), else false
216 public boolean getBooleanProperty(String key)
218 return YES.equalsIgnoreCase(fileProperties.get(key));
222 * Returns the length of the hidden Markov model. The value returned is the
223 * LENG property if specified, else the number of nodes, excluding the begin
224 * node (which should be the same thing).
228 public int getLength()
230 if (fileProperties.get(HMMFile.LENGTH) == null)
232 return nodes.size() - 1; // not counting BEGIN node
234 return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
238 * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
239 * the options. Other alphabets may be added.
243 public String getAlphabetType()
245 return fileProperties.get(HMMFile.ALPHABET);
249 * Sets the model alphabet to the symbols in the given string (ignoring any
250 * whitespace), and returns the number of symbols
254 public int setAlphabet(String symbols)
256 String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
257 int count = trimmed.length();
259 symbolIndexLookup = new int['Z' - 'A' + 1];
260 Arrays.fill(symbolIndexLookup, -1);
264 * save the symbols in order, and a quick lookup of symbol position
266 for (short i = 0; i < count; i++)
268 char symbol = trimmed.charAt(i);
269 if (symbol >= 'A' && symbol <= 'Z'
270 && symbolIndexLookup[symbol - 'A'] == -1)
272 symbolIndexLookup[symbol - 'A'] = i;
278 "Unexpected or duplicated character in HMM ALPHabet: "
283 return count - ignored;
287 * Answers the node of the model corresponding to an aligned column position
288 * (0...), or null if there is no such node
293 HMMNode getNodeForColumn(int column)
296 * if the hmm consensus is gapped at the column,
297 * there is no corresponding node
299 if (Comparison.isGap(hmmSeq.getCharAt(column)))
305 * find the node (if any) that is mapped to the
306 * consensus sequence residue position at the column
308 int seqPos = hmmSeq.findPosition(column);
309 int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
312 return getNode(nodeNo[0]);
318 * Gets the match emission probability for a given symbol at a column in the
322 * The index of the alignment column, starting at index 0. Index 0
323 * usually corresponds to index 1 in the HMM.
325 * The symbol for which the desired probability is being requested.
329 public double getMatchEmissionProbability(int alignColumn, char symbol)
331 HMMNode node = getNodeForColumn(alignColumn);
332 int symbolIndex = getSymbolIndex(symbol);
333 if (node != null && symbolIndex != -1)
335 return node.getMatchEmission(symbolIndex);
341 * Gets the insert emission probability for a given symbol at a column in the
345 * The index of the alignment column, starting at index 0. Index 0
346 * usually corresponds to index 1 in the HMM.
348 * The symbol for which the desired probability is being requested.
352 public double getInsertEmissionProbability(int alignColumn, char symbol)
354 HMMNode node = getNodeForColumn(alignColumn);
355 int symbolIndex = getSymbolIndex(symbol);
356 if (node != null && symbolIndex != -1)
358 return node.getInsertEmission(symbolIndex);
364 * Gets the state transition probability for a given symbol at a column in the
368 * The index of the alignment column, starting at index 0. Index 0
369 * usually corresponds to index 1 in the HMM.
371 * The symbol for which the desired probability is being requested.
375 public double getStateTransitionProbability(int alignColumn,
378 HMMNode node = getNodeForColumn(alignColumn);
381 return node.getStateTransition(transition);
387 * Returns the sequence position linked to the node at the given index. This
388 * corresponds to an aligned column position (counting from 1).
391 * The index of the node, starting from index 1. Index 0 is the begin
392 * node, which does not correspond to a column in the alignment.
395 public int getNodeMapPosition(int nodeIndex)
397 return nodes.get(nodeIndex).getResidueNumber();
401 * Returns the consensus residue at the specified node.
404 * The index of the specified node.
407 public char getConsensusResidue(int nodeIndex)
409 char value = nodes.get(nodeIndex).getConsensusResidue();
414 * Returns the reference annotation at the specified node.
417 * The index of the specified node.
420 public char getReferenceAnnotation(int nodeIndex)
422 char value = nodes.get(nodeIndex).getReferenceAnnotation();
427 * Returns the mask value at the specified node.
430 * The index of the specified node.
433 public char getMaskedValue(int nodeIndex)
435 char value = nodes.get(nodeIndex).getMaskValue();
440 * Returns the consensus structure at the specified node.
443 * The index of the specified node.
446 public char getConsensusStructure(int nodeIndex)
448 char value = nodes.get(nodeIndex).getConsensusStructure();
453 * Sets a property read from an HMM file
458 public void setProperty(String key, String value)
460 fileProperties.put(key, value);
464 * Temporary implementation, should not be used.
468 public String getViterbi()
471 value = fileProperties.get(HMMFile.VITERBI);
476 * Temporary implementation, should not be used.
480 public String getMSV()
483 value = fileProperties.get(HMMFile.MSV);
488 * Temporary implementation, should not be used.
492 public String getForward()
495 value = fileProperties.get(HMMFile.FORWARD);
500 * Constructs the consensus sequence based on the most probable symbol at each
501 * position. Gap characters are inserted for discontinuities in the node map
502 * numbering (if provided), else an ungapped sequence is generated.
504 * A mapping between the HMM nodes and residue positions of the sequence is
505 * also built and saved.
509 void buildConsensusSequence()
511 List<int[]> toResidues = new ArrayList<>();
514 * if the HMM provided a map to sequence, use those start/end values,
515 * else just treat it as for a contiguous sequence numbered from 1
517 boolean hasMap = getBooleanProperty(HMMFile.MAP);
518 int start = hasMap ? getNode(1).getResidueNumber() : 1;
519 int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
520 : (start + getLength() - 1);
521 char[] sequence = new char[endResNo - start + 1];
523 int lastResNo = start - 1;
527 for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
529 HMMNode node = nodes.get(nodeNo);
530 final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
533 * insert gaps if map numbering is not continuous
535 while (resNo > lastResNo + 1)
537 sequence[seqOffset++] = '-';
541 char consensusResidue = node.getConsensusResidue();
542 if (GAP_DASH == consensusResidue)
545 * no residue annotation in HMM - scan for the symbol
546 * with the highest match emission probability
548 int symbolIndex = node.getMaxMatchEmissionIndex();
549 consensusResidue = alphabet.charAt(symbolIndex);
550 if (node.getMatchEmission(symbolIndex) < 0.5D)
552 // follow convention of lower case if match emission prob < 0.5
553 consensusResidue = Character.toLowerCase(consensusResidue);
556 sequence[seqOffset++] = consensusResidue;
560 Sequence seq = new Sequence(getName(), sequence, start,
561 lastResNo - gapCount);
562 seq.createDatasetSequence();
567 * construct and store Mapping of nodes to residues
568 * note as constructed this is just an identity mapping,
569 * but it allows for greater flexibility in future
571 List<int[]> fromNodes = new ArrayList<>();
572 fromNodes.add(new int[] { 1, getLength() });
573 toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
574 MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
575 mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
580 * Answers the aligned consensus sequence for the profile. Note this will
581 * return null if called before <code>setNodes</code> has been called.
585 public SequenceI getConsensusSequence()
591 * Answers the index position (0...) of the given symbol, or -1 if not a valid
592 * symbol for this HMM
597 private int getSymbolIndex(char symbol)
600 * symbolIndexLookup holds the index for 'A' to 'Z'
602 char c = Character.toUpperCase(symbol);
603 if ('A' <= c && c <= 'Z')
605 return symbolIndexLookup[c - 'A'];
611 * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
612 * and a mapping between node numbers and sequence positions
616 public void setNodes(List<HMMNode> nodeList)
619 if (nodes.size() > 1)
621 buildConsensusSequence();
626 * Sets the aligned consensus sequence this HMM is the model for
630 public void setHmmSeq(SequenceI hmmSeq)
632 this.hmmSeq = hmmSeq;