1 package jalview.datamodel;
3 import java.util.ArrayList;
4 import java.util.HashMap;
9 * Data structure which stores a hidden Markov model. Currently contains file
10 * properties as well, not sure whether these should be transferred to the
16 public class HiddenMarkovModel
20 // Stores file properties. Do not access this field directly, as it contains
21 // only string values - use the getter methods. For example, to find the length
22 // of the HMM, use getLength() to return an Integer value
23 Map<String, String> fileProperties = new HashMap<>();
25 // contains all of the symbols used in this model. The index of each symbol
26 // represents its lookup value
27 List<Character> symbols = new ArrayList<>();
29 // contains information for each node in the model. The begin node is at index
30 // 0. Node 0 contains average emission probabilities for each symbol
31 List<HMMNode> nodes = new ArrayList<>();
33 // contains the HMM node for each alignment column, alignment columns start at
35 Map<Integer, Integer> nodeLookup = new HashMap<>();
37 // contains the symbol index for each symbol
38 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
40 final static String YES = "yes";
42 final static String NO = "no";
44 // keys for file properties hashmap
45 private final String NAME = "NAME";
47 private final String ACCESSION_NUMBER = "ACC";
49 private final String DESCRIPTION = "DESC";
51 private final String LENGTH = "LENG";
53 private final String MAX_LENGTH = "MAXL";
55 private final String ALPHABET = "ALPH";
57 private final String DATE = "DATE";
59 private final String COMMAND_LOG = "COM";
61 private final String NUMBER_OF_SEQUENCES = "NSEQ";
63 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
65 private final String CHECK_SUM = "CKSUM";
67 private final String GATHERING_THRESHOLDS = "GA";
69 private final String TRUSTED_CUTOFFS = "TC";
71 private final String NOISE_CUTOFFS = "NC";
73 private final String STATISTICS = "STATS";
75 private final String COMPO = "COMPO";
77 private final String GATHERING_THRESHOLD = "GA";
79 private final String TRUSTED_CUTOFF = "TC";
81 private final String NOISE_CUTOFF = "NC";
83 private final String VITERBI = "VITERBI";
85 private final String MSV = "MSV";
87 private final String FORWARD = "FORWARD";
89 private final String MAP = "MAP";
91 private final String REFERENCE_ANNOTATION = "RF";
93 private final String CONSENSUS_RESIDUE = "CONS";
95 private final String CONSENSUS_STRUCTURE = "CS";
97 private final String MASKED_VALUE = "MM";
99 public static final int MATCHTOMATCH = 0;
101 public static final int MATCHTOINSERT = 1;
103 public static final int MATCHTODELETE = 2;
105 public static final int INSERTTOMATCH = 3;
107 public static final int INSERTTOINSERT = 4;
109 public static final int DELETETOMATCH = 5;
111 public static final int DELETETODELETE = 6;
115 public HiddenMarkovModel()
/**
 * Copy constructor. Creates a new model whose property map, symbol list,
 * node list and lookup maps are fresh collections populated from the given
 * model, so adding to or removing from the collections of one model does not
 * affect the other.
 * 
 * Note that the copies are shallow: the HMMNode objects held in the node
 * list are shared between the original and the copy.
 * 
 * @param hmm
 *          The model to copy. Must not be null.
 */
public HiddenMarkovModel(HiddenMarkovModel hmm)
{
  this.fileProperties = new HashMap<>(hmm.fileProperties);
  this.symbols = new ArrayList<>(hmm.symbols);
  this.nodes = new ArrayList<>(hmm.nodes);
  this.nodeLookup = new HashMap<>(hmm.nodeLookup);
  this.symbolIndexLookup = new HashMap<>(hmm.symbolIndexLookup);
  // Strings are immutable, so the reference can be shared. Wrapping it in
  // new String(...) added nothing and threw a NullPointerException when the
  // source model had no file header set.
  this.fileHeader = hmm.fileHeader;
}
133 * Gets the file header of the .hmm file this model came from.
137 public String getFileHeader()
143 * Sets the file header of this model.
147 public void setFileHeader(String header)
153 * Returns the map containing the matches between nodes and alignment column
159 public Map<Integer, Integer> getNodeLookup()
165 * Returns the list of symbols used in this hidden Markov model.
169 public List<Character> getSymbols()
175 * Returns the file properties.
179 public Map<String, String> getFileProperties()
181 return fileProperties;
185 * Gets the node in the hidden Markov model at the specified position.
188 * The index of the node requested. Node 0 optionally contains the
189 * average match emission probabilities across the entire model, and
190 * always contains the insert emission probabilities and state
191 * transition probabilities for the begin node. Node 1 contains the
192 * first node in the HMM that can correspond to a column in the
196 public HMMNode getNode(int nodeIndex)
198 return getNodes().get(nodeIndex);
202 * Sets the list of symbols used in the hidden Markov model to the list
206 * The list of symbols to which the current list is to be changed.
209 public void setSymbols(List<Character> symbolsL)
211 this.symbols = symbolsL;
215 * Returns the name of the sequence alignment on which the HMM is based.
219 public String getName()
221 return fileProperties.get(NAME);
225 * Returns the accession number.
228 public String getAccessionNumber()
230 return fileProperties.get(ACCESSION_NUMBER);
234 * Returns a description of the sequence alignment on which the hidden Markov
239 public String getDescription()
241 return fileProperties.get(DESCRIPTION);
245 * Returns the length of the hidden Markov model.
249 public Integer getLength()
251 if (fileProperties.get(LENGTH) == null)
255 return Integer.parseInt(fileProperties.get(LENGTH));
259 * Returns the max instance length within the hidden Markov model.
263 public Integer getMaxInstanceLength()
265 if (fileProperties.get(MAX_LENGTH) == null)
269 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
273 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
274 * options. Other alphabets may be added.
278 public String getAlphabetType()
280 return fileProperties.get(ALPHABET);
284 * Returns the date as a String.
288 public String getDate()
290 return fileProperties.get(DATE);
294 * Returns the command line log.
298 public String getCommandLineLog()
300 return fileProperties.get(COMMAND_LOG);
304 * Returns the number of sequences on which the HMM was trained.
308 public Integer getNumberOfSequences()
310 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
314 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
318 * Returns the effective number of sequences on which the HMM was based.
322 public Double getEffectiveNumberOfSequences()
324 if (fileProperties.get(LENGTH) == null)
328 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
332 * Returns the checksum.
336 public Long getCheckSum()
338 if (fileProperties.get(LENGTH) == null)
342 return Long.parseLong(fileProperties.get(CHECK_SUM));
346 * Returns the list of nodes in this HMM.
350 public List<HMMNode> getNodes()
356 * Sets the list of nodes in this HMM to the given list.
359 * The list of nodes to which the current list of nodes is being
362 public void setNodes(List<HMMNode> nodes)
368 * Gets the match emission probability for a given symbol at a column in the
372 * The index of the alignment column, starting at index 0. Index 0
373 * usually corresponds to index 1 in the HMM.
375 * The symbol for which the desired probability is being requested.
379 public Double getMatchEmissionProbability(int alignColumn, char symbol)
384 if (!symbolIndexLookup.containsKey(symbol))
388 symbolIndex = symbolIndexLookup.get(symbol);
389 if (nodeLookup.containsKey(alignColumn))
391 nodeIndex = nodeLookup.get(alignColumn);
392 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
403 * Gets the insert emission probability for a given symbol at a column in the
407 * The index of the alignment column, starting at index 0. Index 0
408 * usually corresponds to index 1 in the HMM.
410 * The symbol for which the desired probability is being requested.
414 public Double getInsertEmissionProbability(int alignColumn, char symbol)
419 if (!symbolIndexLookup.containsKey(symbol))
423 symbolIndex = symbolIndexLookup.get(symbol);
424 if (nodeLookup.containsKey(alignColumn))
426 nodeIndex = nodeLookup.get(alignColumn);
427 probability = getNode(nodeIndex).getInsertEmissions()
439 * Gets the state transition probability for a given symbol at a column in the
443 * The index of the alignment column, starting at index 0. Index 0
444 * usually corresponds to index 1 in the HMM.
446 * The symbol for which the desired probability is being requested.
450 public Double getStateTransitionProbability(int alignColumn,
455 if (nodeLookup.containsKey(alignColumn))
457 nodeIndex = nodeLookup.get(alignColumn);
458 probability = getNode(nodeIndex).getStateTransitions()
470 * Returns the alignment column linked to the node at the given index.
473 * The index of the node, starting from index 1. Index 0 is the begin
474 * node, which does not correspond to a column in the alignment.
477 public Integer getNodeAlignmentColumn(int nodeIndex)
479 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
484 * Returns the consensus residue at the specified node.
487 * The index of the specified node.
490 public char getConsensusResidue(int nodeIndex)
492 char value = nodes.get(nodeIndex).getConsensusResidue();
497 * Returns the consensus at a given alignment column. If the character is
498 * lower case, its emission probability is less than 0.5.
501 * The index of the column in the alignment for which the consensus
502 * is desired. The list of columns starts at index 0.
505 public char getConsensusAtAlignColumn(int columnIndex)
507 char mostLikely = '-';
508 if (consensusResidueIsActive())
511 Integer index = findNodeIndex(columnIndex);
516 mostLikely = getNodes().get(index).getConsensusResidue();
521 double highestProb = 0;
522 for (char character : symbols)
524 Double prob = getMatchEmissionProbability(columnIndex, character);
525 if (prob > highestProb)
528 mostLikely = character;
531 if (highestProb < 0.5)
533 mostLikely = Character.toLowerCase(mostLikely);
541 * Returns the reference annotation at the specified node.
544 * The index of the specified node.
547 public char getReferenceAnnotation(int nodeIndex)
549 char value = nodes.get(nodeIndex).getReferenceAnnotation();
554 * Returns the mask value at the specified node.
557 * The index of the specified node.
560 public char getMaskedValue(int nodeIndex)
562 char value = nodes.get(nodeIndex).getMaskValue();
567 * Returns the consensus structure at the specified node.
570 * The index of the specified node.
573 public char getConsensusStructure(int nodeIndex)
575 char value = nodes.get(nodeIndex).getConsensusStructure();
580 * Returns the average match emission probability for a given symbol
583 * The index of the symbol.
587 public double getAverageMatchEmission(int symbolIndex)
589 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
594 * Returns the number of symbols in the alphabet used in this HMM.
598 public int getNumberOfSymbols()
600 return symbols.size();
604 * Adds a file property.
609 public void addFileProperty(String key, String value)
611 fileProperties.put(key, value);
615 * Returns a boolean indicating whether the reference annotation is active.
619 public boolean referenceAnnotationIsActive()
622 status = fileProperties.get(REFERENCE_ANNOTATION);
640 * Returns a boolean indicating whether the mask value annotation is active.
644 public boolean maskValueIsActive()
647 status = fileProperties.get(MASKED_VALUE);
665 * Returns a boolean indicating whether the consensus residue annotation is
670 public boolean consensusResidueIsActive()
673 status = fileProperties.get(CONSENSUS_RESIDUE);
691 * Returns a boolean indicating whether the consensus structure annotation is
696 public boolean consensusStructureIsActive()
699 status = fileProperties.get(CONSENSUS_STRUCTURE);
717 * Returns a boolean indicating whether the MAP annotation is active.
721 public boolean mapIsActive()
724 status = fileProperties.get(MAP);
742 * Sets the alignment column of the specified node.
749 public void setAlignmentColumn(int nodeIndex, int column)
751 int currentCol = getNodeAlignmentColumn(nodeIndex);
752 nodeLookup.remove(currentCol);
753 nodes.get(nodeIndex).setAlignmentColumn(column);
754 nodeLookup.put(column, nodeIndex);
758 * Clears all data in the node lookup map
760 public void emptyNodeLookup()
762 nodeLookup = new HashMap<>();
767 * Sets the reference annotation at a given node.
772 public void setReferenceAnnotation(int nodeIndex, char value)
774 nodes.get(nodeIndex).setReferenceAnnotation(value);
778 * Sets the consensus residue at a given node.
783 public void setConsensusResidue(int nodeIndex, char value)
785 nodes.get(nodeIndex).setConsensusResidue(value);
789 * Sets the consensus structure at a given node.
794 public void setConsensusStructure(int nodeIndex, char value)
796 nodes.get(nodeIndex).setConsensusStructure(value);
800 * Sets the mask value at a given node.
805 public void setMaskValue(int nodeIndex, char value)
807 nodes.get(nodeIndex).setMaskValue(value);
811 * Temporary implementation, should not be used.
815 public String getGatheringThreshold()
818 value = fileProperties.get("GA");
823 * Temporary implementation, should not be used.
827 public String getNoiseCutoff()
830 value = fileProperties.get("NC");
835 * Temporary implementation, should not be used.
839 public String getTrustedCutoff()
842 value = fileProperties.get("TC");
847 * Temporary implementation, should not be used.
851 public String getViterbi()
854 value = fileProperties.get(VITERBI);
859 * Temporary implementation, should not be used.
863 public String getMSV()
866 value = fileProperties.get(MSV);
871 * Temporary implementation, should not be used.
875 public String getForward()
878 value = fileProperties.get(FORWARD);
883 * Sets the activation status of the MAP annotation.
887 public void setMAPStatus(boolean status)
889 fileProperties.put(MAP, status ? YES : NO);
893 * Sets the activation status of the reference annotation.
897 public void setReferenceAnnotationStatus(boolean status)
899 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
903 * Sets the activation status of the mask value annotation.
907 public void setMaskedValueStatus(boolean status)
909 fileProperties.put(MASKED_VALUE, status ? YES : NO);
913 * Sets the activation status of the consensus residue annotation.
917 public void setConsensusResidueStatus(boolean status)
919 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
923 * Sets the activation status of the consensus structure annotation.
927 public void setConsensusStructureStatus(boolean status)
929 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
933 * Finds the index of the node in a hidden Markov model based on the column in
936 * @param alignmentColumn
937 * The index of the column in the alignment, with the indexes
941 public Integer findNodeIndex(int alignmentColumn)
944 index = nodeLookup.get(alignmentColumn);
949 * Finds the String values of a boolean. "yes" for true and "no" for false.
954 public static String findStringFromBoolean(boolean value)
969 * Returns the consensus sequence based on the most probable symbol at each
970 * position. The sequence is adjusted to match the length of the existing
971 * sequence alignment. Gap characters are used as padding.
974 * The length of the longest sequence in the existing alignment.
977 public Sequence getConsensusSequence()
982 start = getNodeAlignmentColumn(1);
983 modelLength = getLength();
984 end = getNodeAlignmentColumn(modelLength);
985 char[] sequence = new char[end + 1];
986 for (int index = 0; index < end + 1; index++)
990 character = getConsensusAtAlignColumn(index);
992 if (character == null || character == '-')
994 sequence[index] = '-';
998 sequence[index] = Character.toUpperCase(character);
1003 Sequence seq = new Sequence(getName(), sequence, start,
1010 * Initiates a HMM consensus sequence
1012 * @return A new HMM consensus sequence
1014 public SequenceI initHMMSequence()
1016 Sequence consensus = getConsensusSequence();
1017 consensus.setIsHMMConsensusSequence(true);
1018 consensus.setHMM(this);
1022 public int getSymbolIndex(char c)
1024 return symbolIndexLookup.get(c);
1027 public void setSymbolIndex(Character c, Integer i)
1029 symbolIndexLookup.put(c, i);