1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
5 import java.util.ArrayList;
6 import java.util.HashMap;
11 * Data structure which stores a hidden Markov model. Currently contains file
12 * properties as well, not sure whether these should be transferred to the
18 public class HiddenMarkovModel
20 private static final double LOG2 = Math.log(2);
22 // Stores file properties. Do not directly access this field as it contains
23 // only string value - use the getter methods. For example, to find the length
24 // of theHMM, use getModelLength()to return an int value
25 Map<String, String> fileProperties = new HashMap<>();
27 // contains all of the symbols used in this model. The index of each symbol
28 // represents its lookup value
29 List<Character> symbols = new ArrayList<>();
31 // contains information for each node in the model. The begin node is at index
32 // 0. Node 0 contains average emission probabilities for each symbol
33 List<HMMNode> nodes = new ArrayList<>();
35 // contains the HMM node for each alignment column, alignment columns start at
37 Map<Integer, Integer> nodeLookup = new HashMap<>();
39 // contains the symbol index for each symbol
40 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
42 final static String YES = "yes";
44 final static String NO = "no";
46 // keys for file properties hashmap
47 private final String NAME = "NAME";
49 private final String ACCESSION_NUMBER = "ACC";
51 private final String DESCRIPTION = "DESC";
53 private final String LENGTH = "LENG";
55 private final String MAX_LENGTH = "MAXL";
57 private final String ALPHABET = "ALPH";
59 private final String DATE = "DATE";
61 private final String COMMAND_LOG = "COM";
63 private final String NUMBER_OF_SEQUENCES = "NSEQ";
65 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
67 private final String CHECK_SUM = "CKSUM";
69 private final String GATHERING_THRESHOLDS = "GA";
71 private final String TRUSTED_CUTOFFS = "TC";
73 private final String NOISE_CUTOFFS = "NC";
75 private final String STATISTICS = "STATS";
77 private final String COMPO = "COMPO";
79 private final String GATHERING_THRESHOLD = "GA";
81 private final String TRUSTED_CUTOFF = "TC";
83 private final String NOISE_CUTOFF = "NC";
85 private final String VITERBI = "VITERBI";
87 private final String MSV = "MSV";
89 private final String FORWARD = "FORWARD";
91 private final String MAP = "MAP";
93 private final String REFERENCE_ANNOTATION = "RF";
95 private final String CONSENSUS_RESIDUE = "CONS";
97 private final String CONSENSUS_STRUCTURE = "CS";
99 private final String MASKED_VALUE = "MM";
101 public static final int MATCHTOMATCH = 0;
103 public static final int MATCHTOINSERT = 1;
105 public static final int MATCHTODELETE = 2;
107 public static final int INSERTTOMATCH = 3;
109 public static final int INSERTTOINSERT = 4;
111 public static final int DELETETOMATCH = 5;
113 public static final int DELETETODELETE = 6;
120 public HiddenMarkovModel()
124 public HiddenMarkovModel(HiddenMarkovModel hmm)
127 this.fileProperties = new HashMap<>(hmm.fileProperties);
128 this.symbols = new ArrayList<>(hmm.symbols);
129 this.nodes = new ArrayList<>(hmm.nodes);
130 this.nodeLookup = new HashMap<>(hmm.nodeLookup);
131 this.symbolIndexLookup = new HashMap<>(
132 hmm.symbolIndexLookup);
133 this.fileHeader = new String(hmm.fileHeader);
137 * Returns the information content at a specified column, calculated as the
138 * sum (over possible symbols) of the log ratio
141 * log(emission probability / background probability) / log(2)
145 * column position (base 0)
148 public float getInformationContent(int column)
150 float informationContent = 0f;
152 for (char symbol : getSymbols())
154 float freq = ResidueProperties.backgroundFrequencies
155 .get(getAlphabetType()).get(symbol);
156 float prob = (float) getMatchEmissionProbability(column, symbol);
157 informationContent += prob * Math.log(prob / freq);
160 informationContent = informationContent / (float) LOG2;
162 return informationContent;
166 * Gets the file header of the .hmm file this model came from
170 public String getFileHeader()
176 * Sets the file header of this model.
180 public void setFileHeader(String header)
186 * Returns the map containing the matches between nodes and alignment column
192 public Map<Integer, Integer> getNodeLookup()
198 * Returns the list of symbols used in this hidden Markov model.
202 public List<Character> getSymbols()
208 * Returns the file properties.
212 public Map<String, String> getFileProperties()
214 return fileProperties;
218 * Gets the node in the hidden Markov model at the specified position.
221 * The index of the node requested. Node 0 optionally contains the
222 * average match emission probabilities across the entire model, and
223 * always contains the insert emission probabilities and state
224 * transition probabilities for the begin node. Node 1 contains the
225 * first node in the HMM that can correspond to a column in the
229 public HMMNode getNode(int nodeIndex)
231 return getNodes().get(nodeIndex);
235 * Sets the list of symbols used in the hidden Markov model to the list
239 * The list of symbols to which the current list is to be changed.
242 public void setSymbols(List<Character> symbolsL)
244 this.symbols = symbolsL;
248 * Returns the name of the sequence alignment on which the HMM is based.
252 public String getName()
254 return fileProperties.get(NAME);
258 * Returns the accession number.
261 public String getAccessionNumber()
263 return fileProperties.get(ACCESSION_NUMBER);
267 * Returns a description of the sequence alignment on which the hidden Markov
272 public String getDescription()
274 return fileProperties.get(DESCRIPTION);
278 * Returns the length of the hidden Markov model.
282 public Integer getLength()
284 if (fileProperties.get(LENGTH) == null)
288 return Integer.parseInt(fileProperties.get(LENGTH));
292 * Returns the max instance length within the hidden Markov model.
296 public Integer getMaxInstanceLength()
298 if (fileProperties.get(MAX_LENGTH) == null)
302 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
306 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
307 * options. Other alphabets may be added.
311 public String getAlphabetType()
313 return fileProperties.get(ALPHABET);
317 * Returns the date as a String.
321 public String getDate()
323 return fileProperties.get(DATE);
327 * Returns the command line log.
331 public String getCommandLineLog()
333 return fileProperties.get(COMMAND_LOG);
337 * Returns the number of sequences on which the HMM was trained.
341 public Integer getNumberOfSequences()
343 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
347 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
351 * Returns the effective number of sequences on which the HMM was based.
355 public Double getEffectiveNumberOfSequences()
357 if (fileProperties.get(LENGTH) == null)
361 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
365 * Returns the checksum.
369 public Long getCheckSum()
371 if (fileProperties.get(LENGTH) == null)
375 return Long.parseLong(fileProperties.get(CHECK_SUM));
379 * Returns the list of nodes in this HMM.
383 public List<HMMNode> getNodes()
389 * Sets the list of nodes in this HMM to the given list.
392 * The list of nodes to which the current list of nodes is being
395 public void setNodes(List<HMMNode> nodes)
401 * Gets the match emission probability for a given symbol at a column in the
405 * The index of the alignment column, starting at index 0. Index 0
406 * usually corresponds to index 1 in the HMM.
408 * The symbol for which the desired probability is being requested.
412 public double getMatchEmissionProbability(int alignColumn, char symbol)
417 if (!symbolIndexLookup.containsKey(symbol))
421 symbolIndex = symbolIndexLookup.get(symbol);
422 if (nodeLookup.containsKey(alignColumn))
424 nodeIndex = nodeLookup.get(alignColumn);
425 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
435 * Gets the insert emission probability for a given symbol at a column in the
439 * The index of the alignment column, starting at index 0. Index 0
440 * usually corresponds to index 1 in the HMM.
442 * The symbol for which the desired probability is being requested.
446 public double getInsertEmissionProbability(int alignColumn, char symbol)
451 if (!symbolIndexLookup.containsKey(symbol))
455 symbolIndex = symbolIndexLookup.get(symbol);
456 if (nodeLookup.containsKey(alignColumn))
458 nodeIndex = nodeLookup.get(alignColumn);
459 probability = getNode(nodeIndex).getInsertEmissions()
471 * Gets the state transition probability for a given symbol at a column in the
475 * The index of the alignment column, starting at index 0. Index 0
476 * usually corresponds to index 1 in the HMM.
478 * The symbol for which the desired probability is being requested.
482 public Double getStateTransitionProbability(int alignColumn,
487 if (nodeLookup.containsKey(alignColumn))
489 nodeIndex = nodeLookup.get(alignColumn);
490 probability = getNode(nodeIndex).getStateTransitions()
502 * Returns the alignment column linked to the node at the given index.
505 * The index of the node, starting from index 1. Index 0 is the begin
506 * node, which does not correspond to a column in the alignment.
509 public Integer getNodeAlignmentColumn(int nodeIndex)
511 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
516 * Returns the consensus residue at the specified node.
519 * The index of the specified node.
522 public char getConsensusResidue(int nodeIndex)
524 char value = nodes.get(nodeIndex).getConsensusResidue();
529 * Returns the consensus at a given alignment column. If the character is
530 * lower case, its emission probability is less than 0.5.
533 * The index of the column in the alignment for which the consensus
534 * is desired. The list of columns starts at index 0.
537 public char getConsensusAtAlignColumn(int columnIndex)
539 char mostLikely = '-';
540 if (consensusResidueIsActive())
543 Integer index = findNodeIndex(columnIndex);
548 mostLikely = getNodes().get(index).getConsensusResidue();
553 double highestProb = 0;
554 for (char character : symbols)
556 Double prob = getMatchEmissionProbability(columnIndex, character);
557 if (prob > highestProb)
560 mostLikely = character;
563 if (highestProb < 0.5)
565 mostLikely = Character.toLowerCase(mostLikely);
573 * Returns the reference annotation at the specified node.
576 * The index of the specified node.
579 public char getReferenceAnnotation(int nodeIndex)
581 char value = nodes.get(nodeIndex).getReferenceAnnotation();
586 * Returns the mask value at the specified node.
589 * The index of the specified node.
592 public char getMaskedValue(int nodeIndex)
594 char value = nodes.get(nodeIndex).getMaskValue();
599 * Returns the consensus structure at the specified node.
602 * The index of the specified node.
605 public char getConsensusStructure(int nodeIndex)
607 char value = nodes.get(nodeIndex).getConsensusStructure();
612 * Returns the average match emission probability for a given symbol
615 * The index of the symbol.
619 public double getAverageMatchEmission(int symbolIndex)
621 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
626 * Returns the number of symbols in the alphabet used in this HMM.
630 public int getNumberOfSymbols()
632 return symbols.size();
636 * Adds a file property.
641 public void addFileProperty(String key, String value)
643 fileProperties.put(key, value);
647 * Returns a boolean indicating whether the reference annotation is active.
651 public boolean referenceAnnotationIsActive()
654 status = fileProperties.get(REFERENCE_ANNOTATION);
672 * Returns a boolean indicating whether the mask value annotation is active.
676 public boolean maskValueIsActive()
679 status = fileProperties.get(MASKED_VALUE);
697 * Returns a boolean indicating whether the consensus residue annotation is
702 public boolean consensusResidueIsActive()
705 status = fileProperties.get(CONSENSUS_RESIDUE);
723 * Returns a boolean indicating whether the consensus structure annotation is
728 public boolean consensusStructureIsActive()
731 status = fileProperties.get(CONSENSUS_STRUCTURE);
749 * Returns a boolean indicating whether the MAP annotation is active.
753 public boolean mapIsActive()
756 status = fileProperties.get(MAP);
774 * Sets the alignment column of the specified node.
781 public void setAlignmentColumn(int nodeIndex, int column)
783 nodes.get(nodeIndex).setAlignmentColumn(column);
784 nodeLookup.put(column, nodeIndex);
788 * Clears all data in the node lookup map
790 public void emptyNodeLookup()
792 nodeLookup = new HashMap<>();
797 * Sets the reference annotation at a given node.
802 public void setReferenceAnnotation(int nodeIndex, char value)
804 nodes.get(nodeIndex).setReferenceAnnotation(value);
808 * Sets the consensus residue at a given node.
813 public void setConsensusResidue(int nodeIndex, char value)
815 nodes.get(nodeIndex).setConsensusResidue(value);
819 * Sets the consensus structure at a given node.
824 public void setConsensusStructure(int nodeIndex, char value)
826 nodes.get(nodeIndex).setConsensusStructure(value);
830 * Sets the mask value at a given node.
835 public void setMaskValue(int nodeIndex, char value)
837 nodes.get(nodeIndex).setMaskValue(value);
841 * Temporary implementation, should not be used.
845 public String getGatheringThreshold()
848 value = fileProperties.get("GA");
853 * Temporary implementation, should not be used.
857 public String getNoiseCutoff()
860 value = fileProperties.get("NC");
865 * Temporary implementation, should not be used.
869 public String getTrustedCutoff()
872 value = fileProperties.get("TC");
877 * Temporary implementation, should not be used.
881 public String getViterbi()
884 value = fileProperties.get(VITERBI);
889 * Temporary implementation, should not be used.
893 public String getMSV()
896 value = fileProperties.get(MSV);
901 * Temporary implementation, should not be used.
905 public String getForward()
908 value = fileProperties.get(FORWARD);
913 * Sets the activation status of the MAP annotation.
917 public void setMAPStatus(boolean status)
919 fileProperties.put(MAP, status ? YES : NO);
923 * Sets the activation status of the reference annotation.
927 public void setReferenceAnnotationStatus(boolean status)
929 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
933 * Sets the activation status of the mask value annotation.
937 public void setMaskedValueStatus(boolean status)
939 fileProperties.put(MASKED_VALUE, status ? YES : NO);
943 * Sets the activation status of the consensus residue annotation.
947 public void setConsensusResidueStatus(boolean status)
949 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
953 * Sets the activation status of the consensus structure annotation.
957 public void setConsensusStructureStatus(boolean status)
959 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
963 * Finds the index of the node in a hidden Markov model based on the column in
966 * @param alignmentColumn
967 * The index of the column in the alignment, with the indexes
971 public Integer findNodeIndex(int alignmentColumn)
974 index = nodeLookup.get(alignmentColumn);
979 * Finds the String values of a boolean. "yes" for true and "no" for false.
984 public static String findStringFromBoolean(boolean value)
999 * Returns the consensus sequence based on the most probable symbol at each
1000 * position. The sequence is adjusted to match the length of the existing
1001 * sequence alignment. Gap characters are used as padding.
1004 * The length of the longest sequence in the existing alignment.
1007 public Sequence getConsensusSequence()
1012 start = getNodeAlignmentColumn(1);
1013 modelLength = getLength();
1014 end = getNodeAlignmentColumn(modelLength);
1015 char[] sequence = new char[end + 1];
1016 for (int index = 0; index < end + 1; index++)
1018 Character character;
1020 character = getConsensusAtAlignColumn(index);
1022 if (character == null || character == '-')
1024 sequence[index] = '-';
1028 sequence[index] = Character.toUpperCase(character);
1033 Sequence seq = new Sequence(getName(), sequence, start,
1040 * Initiates a HMM consensus sequence
1042 * @return A new HMM consensus sequence
1044 public SequenceI initHMMSequence()
1046 Sequence consensus = getConsensusSequence();
1047 consensus.setIsHMMConsensusSequence(true);
1048 consensus.setHMM(this);
1052 public int getSymbolIndex(char c)
1054 return symbolIndexLookup.get(c);
1057 public void setSymbolIndex(Character c, Integer i)
1059 symbolIndexLookup.put(c, i);