1 package jalview.datamodel;
3 import jalview.io.HMMFile;
4 import jalview.schemes.ResidueProperties;
5 import jalview.util.Comparison;
6 import jalview.util.MapList;
8 import java.util.ArrayList;
9 import java.util.Arrays;
10 import java.util.HashMap;
11 import java.util.List;
15 * Data structure which stores a hidden Markov model
20 public class HiddenMarkovModel
22 private static final char GAP_DASH = '-';
24 public final static String YES = "yes";
26 public final static String NO = "no";
28 public static final int MATCHTOMATCH = 0;
30 public static final int MATCHTOINSERT = 1;
32 public static final int MATCHTODELETE = 2;
34 public static final int INSERTTOMATCH = 3;
36 public static final int INSERTTOINSERT = 4;
38 public static final int DELETETOMATCH = 5;
40 public static final int DELETETODELETE = 6;
42 private static final double LOG2 = Math.log(2);
45 * properties read from HMM file header lines
47 private Map<String, String> fileProperties = new HashMap<>();
49 private String fileHeader;
52 * the symbols used in this model e.g. "ACGT"
54 private String alphabet;
57 * symbol lookup index into the alphabet for 'A' to 'Z'
59 private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
62 * Nodes in the model. The begin node is at index 0, and contains
63 * average emission probabilities for each symbol.
65 private List<HMMNode> nodes = new ArrayList<>();
68 * the aligned HMM consensus sequence extracted from the HMM profile
70 private SequenceI hmmSeq;
73 * mapping from HMM nodes to residues of the hmm consensus sequence
75 private Mapping mapToHmmConsensus;
77 // stores background frequencies of alignment from which this model came
78 private Map<Character, Float> backgroundFrequencies;
83 public HiddenMarkovModel()
88 * Copy constructor given a new aligned sequence with which to associate the
94 public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq)
97 this.fileProperties = new HashMap<>(hmm.fileProperties);
98 this.alphabet = hmm.alphabet;
99 this.nodes = new ArrayList<>(hmm.nodes);
100 this.symbolIndexLookup = hmm.symbolIndexLookup;
101 this.fileHeader = new String(hmm.fileHeader);
103 this.backgroundFrequencies = hmm.getBackgroundFrequencies();
104 if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo())
106 // same dataset sequence e.g. after realigning search results
107 this.mapToHmmConsensus = hmm.mapToHmmConsensus;
111 // different dataset sequence e.g. after loading HMM from project
112 this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(),
113 hmm.mapToHmmConsensus.getMap());
118 * Returns the information content at a specified column, calculated as the
119 * sum (over possible symbols) of the log ratio
122 * log(emission probability / background probability) / log(2)
126 * column position (base 0)
129 public float getInformationContent(int column)
131 float informationContent = 0f;
133 for (char symbol : getSymbols().toCharArray())
135 float freq = ResidueProperties.backgroundFrequencies
136 .get(getAlphabetType()).get(symbol);
137 float prob = (float) getMatchEmissionProbability(column, symbol);
138 informationContent += prob * Math.log(prob / freq);
141 informationContent = informationContent / (float) LOG2;
143 return informationContent;
147 * Gets the file header of the .hmm file this model came from
151 public String getFileHeader()
157 * Sets the file header of this model.
161 public void setFileHeader(String header)
167 * Returns the symbols used in this hidden Markov model
171 public String getSymbols()
177 * Gets the node in the hidden Markov model at the specified position.
180 * The index of the node requested. Node 0 optionally contains the
181 * average match emission probabilities across the entire model, and
182 * always contains the insert emission probabilities and state
183 * transition probabilities for the begin node. Node 1 contains the
184 * first node in the HMM that can correspond to a column in the
188 public HMMNode getNode(int nodeIndex)
190 return nodes.get(nodeIndex);
194 * Returns the name of the sequence alignment on which the HMM is based.
198 public String getName()
200 return fileProperties.get(HMMFile.NAME);
204 * Answers the string value of the property (parsed from an HMM file) for the
205 * given key, or null if the property is not present
210 public String getProperty(String key)
212 return fileProperties.get(key);
216 * Answers true if the property with the given key is present with a value of
217 * "yes" (not case-sensitive), else false
222 public boolean getBooleanProperty(String key)
224 return YES.equalsIgnoreCase(fileProperties.get(key));
228 * Returns the length of the hidden Markov model. The value returned is the
229 * LENG property if specified, else the number of nodes, excluding the begin
230 * node (which should be the same thing).
234 public int getLength()
236 if (fileProperties.get(HMMFile.LENGTH) == null)
238 return nodes.size() - 1; // not counting BEGIN node
240 return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
244 * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
245 * the options. Other alphabets may be added.
249 public String getAlphabetType()
251 return fileProperties.get(HMMFile.ALPHABET);
255 * Sets the model alphabet to the symbols in the given string (ignoring any
256 * whitespace), and returns the number of symbols
260 public int setAlphabet(String symbols)
262 String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
263 int count = trimmed.length();
265 symbolIndexLookup = new int['Z' - 'A' + 1];
266 Arrays.fill(symbolIndexLookup, -1);
270 * save the symbols in order, and a quick lookup of symbol position
272 for (short i = 0; i < count; i++)
274 char symbol = trimmed.charAt(i);
275 if (symbol >= 'A' && symbol <= 'Z'
276 && symbolIndexLookup[symbol - 'A'] == -1)
278 symbolIndexLookup[symbol - 'A'] = i;
284 "Unexpected or duplicated character in HMM ALPHabet: "
289 return count - ignored;
293 * Answers the node of the model corresponding to an aligned column position
294 * (0...), or null if there is no such node
299 HMMNode getNodeForColumn(int column)
302 * if the hmm consensus is gapped at the column,
303 * there is no corresponding node
305 if (Comparison.isGap(hmmSeq.getCharAt(column)))
311 * find the node (if any) that is mapped to the
312 * consensus sequence residue position at the column
314 int seqPos = hmmSeq.findPosition(column);
315 int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
318 return getNode(nodeNo[0]);
324 * Gets the match emission probability for a given symbol at a column in the
328 * The index of the alignment column, starting at index 0. Index 0
329 * usually corresponds to index 1 in the HMM.
331 * The symbol for which the desired probability is being requested.
335 public double getMatchEmissionProbability(int alignColumn, char symbol)
337 HMMNode node = getNodeForColumn(alignColumn);
338 int symbolIndex = getSymbolIndex(symbol);
339 if (node != null && symbolIndex != -1)
341 return node.getMatchEmission(symbolIndex);
347 * Gets the insert emission probability for a given symbol at a column in the
351 * The index of the alignment column, starting at index 0. Index 0
352 * usually corresponds to index 1 in the HMM.
354 * The symbol for which the desired probability is being requested.
358 public double getInsertEmissionProbability(int alignColumn, char symbol)
360 HMMNode node = getNodeForColumn(alignColumn);
361 int symbolIndex = getSymbolIndex(symbol);
362 if (node != null && symbolIndex != -1)
364 return node.getInsertEmission(symbolIndex);
370 * Gets the state transition probability for a given symbol at a column in the
374 * The index of the alignment column, starting at index 0. Index 0
375 * usually corresponds to index 1 in the HMM.
377 * The symbol for which the desired probability is being requested.
381 public double getStateTransitionProbability(int alignColumn,
384 HMMNode node = getNodeForColumn(alignColumn);
387 return node.getStateTransition(transition);
393 * Returns the sequence position linked to the node at the given index. This
394 * corresponds to an aligned column position (counting from 1).
397 * The index of the node, starting from index 1. Index 0 is the begin
398 * node, which does not correspond to a column in the alignment.
401 public int getNodeMapPosition(int nodeIndex)
403 return nodes.get(nodeIndex).getResidueNumber();
407 * Returns the consensus residue at the specified node.
410 * The index of the specified node.
413 public char getConsensusResidue(int nodeIndex)
415 char value = nodes.get(nodeIndex).getConsensusResidue();
420 * Returns the reference annotation at the specified node.
423 * The index of the specified node.
426 public char getReferenceAnnotation(int nodeIndex)
428 char value = nodes.get(nodeIndex).getReferenceAnnotation();
433 * Returns the mask value at the specified node.
436 * The index of the specified node.
439 public char getMaskedValue(int nodeIndex)
441 char value = nodes.get(nodeIndex).getMaskValue();
446 * Returns the consensus structure at the specified node.
449 * The index of the specified node.
452 public char getConsensusStructure(int nodeIndex)
454 char value = nodes.get(nodeIndex).getConsensusStructure();
459 * Sets a property read from an HMM file
464 public void setProperty(String key, String value)
466 fileProperties.put(key, value);
470 * Temporary implementation, should not be used.
474 public String getViterbi()
477 value = fileProperties.get(HMMFile.VITERBI);
482 * Temporary implementation, should not be used.
486 public String getMSV()
489 value = fileProperties.get(HMMFile.MSV);
494 * Temporary implementation, should not be used.
498 public String getForward()
501 value = fileProperties.get(HMMFile.FORWARD);
506 * Constructs the consensus sequence based on the most probable symbol at each
507 * position. Gap characters are inserted for discontinuities in the node map
508 * numbering (if provided), else an ungapped sequence is generated.
510 * A mapping between the HMM nodes and residue positions of the sequence is
511 * also built and saved.
515 void buildConsensusSequence()
517 List<int[]> toResidues = new ArrayList<>();
520 * if the HMM provided a map to sequence, use those start/end values,
521 * else just treat it as for a contiguous sequence numbered from 1
523 boolean hasMap = getBooleanProperty(HMMFile.MAP);
524 int start = hasMap ? getNode(1).getResidueNumber() : 1;
525 int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
526 : (start + getLength() - 1);
527 char[] sequence = new char[endResNo];
529 int lastResNo = start - 1;
534 for (int seqN = 0; seqN < start; seqN++)
536 sequence[seqN] = GAP_DASH;
540 for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
542 HMMNode node = nodes.get(nodeNo);
543 final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
546 * insert gaps if map numbering is not continuous
548 while (resNo > lastResNo + 1)
550 sequence[seqOffset++] = GAP_DASH;
554 char consensusResidue = node.getConsensusResidue();
555 if (GAP_DASH == consensusResidue)
558 * no residue annotation in HMM - scan for the symbol
559 * with the highest match emission probability
561 int symbolIndex = node.getMaxMatchEmissionIndex();
562 consensusResidue = alphabet.charAt(symbolIndex);
563 if (node.getMatchEmission(symbolIndex) < 0.5D)
565 // follow convention of lower case if match emission prob < 0.5
566 consensusResidue = Character.toLowerCase(consensusResidue);
569 sequence[seqOffset++] = consensusResidue;
573 Sequence seq = new Sequence(getName(), sequence, start,
574 lastResNo - gapCount);
575 seq.createDatasetSequence();
580 * construct and store Mapping of nodes to residues
581 * note as constructed this is just an identity mapping,
582 * but it allows for greater flexibility in future
584 List<int[]> fromNodes = new ArrayList<>();
585 fromNodes.add(new int[] { 1, getLength() });
586 toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
587 MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
588 mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
593 * Answers the aligned consensus sequence for the profile. Note this will
594 * return null if called before <code>setNodes</code> has been called.
598 public SequenceI getConsensusSequence()
604 * Answers the index position (0...) of the given symbol, or -1 if not a valid
605 * symbol for this HMM
610 private int getSymbolIndex(char symbol)
613 * symbolIndexLookup holds the index for 'A' to 'Z'
615 char c = Character.toUpperCase(symbol);
616 if ('A' <= c && c <= 'Z')
618 return symbolIndexLookup[c - 'A'];
624 * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
625 * and a mapping between node numbers and sequence positions
629 public void setNodes(List<HMMNode> nodeList)
632 if (nodes.size() > 1)
634 buildConsensusSequence();
639 * Sets the aligned consensus sequence this HMM is the model for
643 public void setHmmSeq(SequenceI hmmSeq)
645 this.hmmSeq = hmmSeq;
648 public void setBackgroundFrequencies(Map<Character, Float> bkgdFreqs)
650 backgroundFrequencies = bkgdFreqs;
653 public void setBackgroundFrequencies(ResidueCount bkgdFreqs)
655 backgroundFrequencies = new HashMap<>();
657 int total = bkgdFreqs.getTotalResidueCount();
659 for (char c : bkgdFreqs.getSymbolCounts().symbols)
661 backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total);
666 public Map<Character, Float> getBackgroundFrequencies()
668 return backgroundFrequencies;