1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
4 import jalview.util.Comparison;
6 import java.util.ArrayList;
7 import java.util.HashMap;
12 * Data structure which stores a hidden Markov model. Currently contains file
13 * properties as well, not sure whether these should be transferred to the
19 public class HiddenMarkovModel
21 private static final double LOG2 = Math.log(2);
23 // Stores file properties. Do not directly access this field as it contains
24 // only string value - use the getter methods. For example, to find the length
25 // of theHMM, use getModelLength()to return an int value
26 Map<String, String> fileProperties = new HashMap<>();
28 // contains all of the symbols used in this model. The index of each symbol
29 // represents its lookup value
30 List<Character> symbols = new ArrayList<>();
32 // contains information for each node in the model. The begin node is at index
33 // 0. Node 0 contains average emission probabilities for each symbol
34 List<HMMNode> nodes = new ArrayList<>();
36 // contains the HMM node for each alignment column, alignment columns start at
38 Map<Integer, HMMNode> nodeLookup = new HashMap<>();
40 // contains the symbol index for each symbol
41 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
43 final static String YES = "yes";
45 final static String NO = "no";
47 // keys for file properties hashmap
48 private static final String NAME = "NAME";
50 private static final String ACCESSION_NUMBER = "ACC";
52 private static final String DESCRIPTION = "DESC";
54 private static final String LENGTH = "LENG";
56 private static final String MAX_LENGTH = "MAXL";
58 private static final String ALPHABET = "ALPH";
60 private static final String DATE = "DATE";
62 private static final String COMMAND_LOG = "COM";
64 private static final String NUMBER_OF_SEQUENCES = "NSEQ";
66 private static final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
68 private static final String CHECK_SUM = "CKSUM";
70 private static final String GATHERING_THRESHOLDS = "GA";
72 private static final String TRUSTED_CUTOFFS = "TC";
74 private static final String NOISE_CUTOFFS = "NC";
76 private static final String STATISTICS = "STATS";
78 private static final String COMPO = "COMPO";
80 private static final String GATHERING_THRESHOLD = "GA";
82 private static final String TRUSTED_CUTOFF = "TC";
84 private final String NOISE_CUTOFF = "NC";
86 private static final String VITERBI = "VITERBI";
88 private static final String MSV = "MSV";
90 private static final String FORWARD = "FORWARD";
92 private static final String MAP = "MAP";
94 private static final String REFERENCE_ANNOTATION = "RF";
96 private static final String CONSENSUS_RESIDUE = "CONS";
98 private static final String CONSENSUS_STRUCTURE = "CS";
100 private static final String MASKED_VALUE = "MM";
102 public static final int MATCHTOMATCH = 0;
104 public static final int MATCHTOINSERT = 1;
106 public static final int MATCHTODELETE = 2;
108 public static final int INSERTTOMATCH = 3;
110 public static final int INSERTTOINSERT = 4;
112 public static final int DELETETOMATCH = 5;
114 public static final int DELETETODELETE = 6;
121 public HiddenMarkovModel()
125 public HiddenMarkovModel(HiddenMarkovModel hmm)
128 this.fileProperties = new HashMap<>(hmm.fileProperties);
129 this.symbols = new ArrayList<>(hmm.symbols);
130 this.nodes = new ArrayList<>(hmm.nodes);
131 this.nodeLookup = new HashMap<>(hmm.nodeLookup);
132 this.symbolIndexLookup = new HashMap<>(
133 hmm.symbolIndexLookup);
134 this.fileHeader = new String(hmm.fileHeader);
138 * Returns the information content at a specified column, calculated as the
139 * sum (over possible symbols) of the log ratio
142 * log(emission probability / background probability) / log(2)
146 * column position (base 0)
149 public float getInformationContent(int column)
151 float informationContent = 0f;
153 for (char symbol : getSymbols())
155 float freq = ResidueProperties.backgroundFrequencies
156 .get(getAlphabetType()).get(symbol);
157 float prob = (float) getMatchEmissionProbability(column, symbol);
158 informationContent += prob * Math.log(prob / freq);
161 informationContent = informationContent / (float) LOG2;
163 return informationContent;
167 * Gets the file header of the .hmm file this model came from
171 public String getFileHeader()
177 * Sets the file header of this model.
181 public void setFileHeader(String header)
187 * Returns the map containing the matches between nodes and alignment column
193 public Map<Integer, HMMNode> getNodeLookup()
199 * Returns the list of symbols used in this hidden Markov model.
203 public List<Character> getSymbols()
209 * Returns the file properties.
213 public Map<String, String> getFileProperties()
215 return fileProperties;
219 * Gets the node in the hidden Markov model at the specified position.
222 * The index of the node requested. Node 0 optionally contains the
223 * average match emission probabilities across the entire model, and
224 * always contains the insert emission probabilities and state
225 * transition probabilities for the begin node. Node 1 contains the
226 * first node in the HMM that can correspond to a column in the
230 public HMMNode getNode(int nodeIndex)
232 return getNodes().get(nodeIndex);
236 * Sets the list of symbols used in the hidden Markov model to the list
240 * The list of symbols to which the current list is to be changed.
243 public void setSymbols(List<Character> symbolsL)
245 this.symbols = symbolsL;
249 * Returns the name of the sequence alignment on which the HMM is based.
253 public String getName()
255 return fileProperties.get(NAME);
259 * Returns the accession number.
262 public String getAccessionNumber()
264 return fileProperties.get(ACCESSION_NUMBER);
268 * Returns a description of the sequence alignment on which the hidden Markov
273 public String getDescription()
275 return fileProperties.get(DESCRIPTION);
279 * Returns the length of the hidden Markov model.
283 public Integer getLength()
285 if (fileProperties.get(LENGTH) == null)
289 return Integer.parseInt(fileProperties.get(LENGTH));
293 * Returns the max instance length within the hidden Markov model.
297 public Integer getMaxInstanceLength()
299 if (fileProperties.get(MAX_LENGTH) == null)
303 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
307 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
308 * options. Other alphabets may be added.
312 public String getAlphabetType()
314 return fileProperties.get(ALPHABET);
318 * Returns the date as a String.
322 public String getDate()
324 return fileProperties.get(DATE);
328 * Returns the command line log.
332 public String getCommandLineLog()
334 return fileProperties.get(COMMAND_LOG);
338 * Returns the number of sequences on which the HMM was trained.
342 public Integer getNumberOfSequences()
344 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
348 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
352 * Returns the effective number of sequences on which the HMM was based.
356 public Double getEffectiveNumberOfSequences()
358 if (fileProperties.get(LENGTH) == null)
362 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
366 * Returns the checksum.
370 public Long getCheckSum()
372 if (fileProperties.get(LENGTH) == null)
376 return Long.parseLong(fileProperties.get(CHECK_SUM));
380 * Returns the list of nodes in this HMM.
384 public List<HMMNode> getNodes()
390 * Sets the list of nodes in this HMM to the given list.
393 * The list of nodes to which the current list of nodes is being
396 public void setNodes(List<HMMNode> nodes)
402 * Gets the match emission probability for a given symbol at a column in the
406 * The index of the alignment column, starting at index 0. Index 0
407 * usually corresponds to index 1 in the HMM.
409 * The symbol for which the desired probability is being requested.
413 public double getMatchEmissionProbability(int alignColumn, char symbol)
415 if (!symbolIndexLookup.containsKey(symbol))
419 int symbolIndex = symbolIndexLookup.get(symbol);
420 double probability = 0d;
421 if (nodeLookup.containsKey(alignColumn))
423 HMMNode node = nodeLookup.get(alignColumn);
424 probability = node.getMatchEmissions().get(symbolIndex);
430 * Gets the insert emission probability for a given symbol at a column in the
434 * The index of the alignment column, starting at index 0. Index 0
435 * usually corresponds to index 1 in the HMM.
437 * The symbol for which the desired probability is being requested.
441 public double getInsertEmissionProbability(int alignColumn, char symbol)
443 if (!symbolIndexLookup.containsKey(symbol))
447 int symbolIndex = symbolIndexLookup.get(symbol);
448 double probability = 0d;
449 if (nodeLookup.containsKey(alignColumn))
451 HMMNode node = nodeLookup.get(alignColumn);
452 probability = node.getInsertEmissions().get(symbolIndex);
458 * Gets the state transition probability for a given symbol at a column in the
462 * The index of the alignment column, starting at index 0. Index 0
463 * usually corresponds to index 1 in the HMM.
465 * The symbol for which the desired probability is being requested.
469 public Double getStateTransitionProbability(int alignColumn,
472 double probability = 0d;
473 if (nodeLookup.containsKey(alignColumn))
475 HMMNode node = nodeLookup.get(alignColumn);
476 probability = node.getStateTransitions().get(transition);
482 * Returns the alignment column linked to the node at the given index.
485 * The index of the node, starting from index 1. Index 0 is the begin
486 * node, which does not correspond to a column in the alignment.
489 public Integer getNodeAlignmentColumn(int nodeIndex)
491 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
496 * Returns the consensus residue at the specified node.
499 * The index of the specified node.
502 public char getConsensusResidue(int nodeIndex)
504 char value = nodes.get(nodeIndex).getConsensusResidue();
509 * Returns the consensus at a given alignment column. If the character is
510 * lower case, its emission probability is less than 0.5.
513 * The index of the column in the alignment for which the consensus
514 * is desired. The list of columns starts at index 0.
517 public char getConsensusAtAlignColumn(int columnIndex)
519 char mostLikely = '-';
520 if (consensusResidueIsActive())
522 HMMNode node = nodeLookup.get(columnIndex);
527 mostLikely = node.getConsensusResidue();
532 double highestProb = 0;
533 for (char character : symbols)
535 double prob = getMatchEmissionProbability(columnIndex, character);
536 if (prob > highestProb)
539 mostLikely = character;
542 if (highestProb < 0.5)
544 mostLikely = Character.toLowerCase(mostLikely);
552 * Returns the reference annotation at the specified node.
555 * The index of the specified node.
558 public char getReferenceAnnotation(int nodeIndex)
560 char value = nodes.get(nodeIndex).getReferenceAnnotation();
565 * Returns the mask value at the specified node.
568 * The index of the specified node.
571 public char getMaskedValue(int nodeIndex)
573 char value = nodes.get(nodeIndex).getMaskValue();
578 * Returns the consensus structure at the specified node.
581 * The index of the specified node.
584 public char getConsensusStructure(int nodeIndex)
586 char value = nodes.get(nodeIndex).getConsensusStructure();
591 * Returns the average match emission probability for a given symbol
594 * The index of the symbol.
598 public double getAverageMatchEmission(int symbolIndex)
600 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
605 * Returns the number of symbols in the alphabet used in this HMM.
609 public int getNumberOfSymbols()
611 return symbols.size();
615 * Adds a file property.
620 public void addFileProperty(String key, String value)
622 fileProperties.put(key, value);
626 * Returns a boolean indicating whether the reference annotation is active.
630 public boolean referenceAnnotationIsActive()
633 status = fileProperties.get(REFERENCE_ANNOTATION);
651 * Returns a boolean indicating whether the mask value annotation is active.
655 public boolean maskValueIsActive()
658 status = fileProperties.get(MASKED_VALUE);
676 * Returns a boolean indicating whether the consensus residue annotation is
681 public boolean consensusResidueIsActive()
684 status = fileProperties.get(CONSENSUS_RESIDUE);
702 * Returns a boolean indicating whether the consensus structure annotation is
707 public boolean consensusStructureIsActive()
710 status = fileProperties.get(CONSENSUS_STRUCTURE);
728 * Returns a boolean indicating whether the MAP annotation is active.
732 public boolean mapIsActive()
735 status = fileProperties.get(MAP);
753 * Sets the alignment column of the specified node
760 public void setAlignmentColumn(HMMNode node, int column)
762 node.setAlignmentColumn(column);
763 nodeLookup.put(column, node);
766 public void updateMapping(char[] sequence)
770 synchronized (nodeLookup)
773 for (char residue : sequence)
775 if (!Comparison.isGap(residue))
777 HMMNode node = nodes.get(nodeNo);
780 // error : too few nodes for sequence
783 setAlignmentColumn(node, column);
792 * Clears all data in the node lookup map
794 public void clearNodeLookup()
800 * Sets the reference annotation at a given node.
805 public void setReferenceAnnotation(int nodeIndex, char value)
807 nodes.get(nodeIndex).setReferenceAnnotation(value);
811 * Sets the consensus residue at a given node.
816 public void setConsensusResidue(int nodeIndex, char value)
818 nodes.get(nodeIndex).setConsensusResidue(value);
822 * Sets the consensus structure at a given node.
827 public void setConsensusStructure(int nodeIndex, char value)
829 nodes.get(nodeIndex).setConsensusStructure(value);
833 * Sets the mask value at a given node.
838 public void setMaskValue(int nodeIndex, char value)
840 nodes.get(nodeIndex).setMaskValue(value);
844 * Temporary implementation, should not be used.
848 public String getGatheringThreshold()
851 value = fileProperties.get("GA");
856 * Temporary implementation, should not be used.
860 public String getNoiseCutoff()
863 value = fileProperties.get("NC");
868 * Temporary implementation, should not be used.
872 public String getTrustedCutoff()
875 value = fileProperties.get("TC");
880 * Temporary implementation, should not be used.
884 public String getViterbi()
887 value = fileProperties.get(VITERBI);
892 * Temporary implementation, should not be used.
896 public String getMSV()
899 value = fileProperties.get(MSV);
904 * Temporary implementation, should not be used.
908 public String getForward()
911 value = fileProperties.get(FORWARD);
916 * Sets the activation status of the MAP annotation.
920 public void setMAPStatus(boolean status)
922 fileProperties.put(MAP, status ? YES : NO);
926 * Sets the activation status of the reference annotation.
930 public void setReferenceAnnotationStatus(boolean status)
932 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
936 * Sets the activation status of the mask value annotation.
940 public void setMaskedValueStatus(boolean status)
942 fileProperties.put(MASKED_VALUE, status ? YES : NO);
946 * Sets the activation status of the consensus residue annotation.
950 public void setConsensusResidueStatus(boolean status)
952 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
956 * Sets the activation status of the consensus structure annotation.
960 public void setConsensusStructureStatus(boolean status)
962 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
966 * Answers the HMMNode mapped to the given alignment column (base 0), or null
969 * @param alignmentColumn
971 public HMMNode getNodeForColumn(int alignmentColumn)
973 return nodeLookup.get(alignmentColumn);
977 * Finds the String values of a boolean. "yes" for true and "no" for false.
982 public static String findStringFromBoolean(boolean value)
997 * Returns the consensus sequence based on the most probable symbol at each
998 * position. The sequence is adjusted to match the length of the existing
999 * sequence alignment. Gap characters are used as padding.
1002 * The length of the longest sequence in the existing alignment.
1005 public Sequence getConsensusSequence()
1010 start = getNodeAlignmentColumn(1);
1011 modelLength = getLength();
1012 end = getNodeAlignmentColumn(modelLength);
1013 char[] sequence = new char[end + 1];
1014 for (int index = 0; index < end + 1; index++)
1016 Character character;
1018 character = getConsensusAtAlignColumn(index);
1020 if (character == null || character == '-')
1022 sequence[index] = '-';
1026 sequence[index] = Character.toUpperCase(character);
1031 Sequence seq = new Sequence(getName(), sequence, start,
1038 * Initiates a HMM consensus sequence
1040 * @return A new HMM consensus sequence
1042 public SequenceI initHMMSequence()
1044 Sequence consensus = getConsensusSequence();
1045 consensus.setIsHMMConsensusSequence(true);
1046 consensus.setHMM(this);
1050 public int getSymbolIndex(char c)
1052 return symbolIndexLookup.get(c);
1055 public void setSymbolIndex(Character c, Integer i)
1057 symbolIndexLookup.put(c, i);