1 package jalview.datamodel;
3 import java.util.ArrayList;
4 import java.util.HashMap;
7 import java.util.Scanner;
10 * Data structure which stores a hidden Markov model. Currently contains file
11 * properties as well, not sure whether these should be transferred to the
17 public class HiddenMarkovModel
// Stores file properties. Do not access this field directly, as it contains
// only String values - use the getter methods instead. For example, to find
// the length of the HMM, use getLength(), which returns an Integer value.
24 Map<String, String> fileProperties = new HashMap<>();
26 // contains all of the symbols used in this model. The index of each symbol
27 // represents its lookup value
28 List<Character> symbols = new ArrayList<>();
30 // contains information for each node in the model. The begin node is at index
31 // 0. Node 0 contains average emission probabilities for each symbol
32 List<HMMNode> nodes = new ArrayList<>();
34 // contains the HMM node for each alignment column, alignment columns start at
36 Map<Integer, Integer> nodeLookup = new HashMap<>();
38 // contains the symbol index for each symbol
39 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
41 final static String YES = "yes";
43 final static String NO = "no";
47 // keys for file properties hashmap
48 private final String NAME = "NAME";
50 private final String ACCESSION_NUMBER = "ACC";
52 private final String DESCRIPTION = "DESC";
54 private final String LENGTH = "LENG";
56 private final String MAX_LENGTH = "MAXL";
58 private final String ALPHABET = "ALPH";
60 private final String DATE = "DATE";
62 private final String COMMAND_LOG = "COM";
64 private final String NUMBER_OF_SEQUENCES = "NSEQ";
66 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
68 private final String CHECK_SUM = "CKSUM";
70 private final String GATHERING_THRESHOLDS = "GA";
72 private final String TRUSTED_CUTOFFS = "TC";
74 private final String NOISE_CUTOFFS = "NC";
76 private final String STATISTICS = "STATS";
78 private final String COMPO = "COMPO";
80 private final String GATHERING_THRESHOLD = "GA";
82 private final String TRUSTED_CUTOFF = "TC";
84 private final String NOISE_CUTOFF = "NC";
86 private final String VITERBI = "VITERBI";
88 private final String MSV = "MSV";
90 private final String FORWARD = "FORWARD";
92 private final String MAP = "MAP";
94 private final String REFERENCE_ANNOTATION = "RF";
96 private final String CONSENSUS_RESIDUE = "CONS";
98 private final String CONSENSUS_STRUCTURE = "CS";
100 private final String MASKED_VALUE = "MM";
102 public static final int MATCHTOMATCH = 0;
104 public static final int MATCHTOINSERT = 1;
106 public static final int MATCHTODELETE = 2;
108 public static final int INSERTTOMATCH = 3;
110 public static final int INSERTTOINSERT = 4;
112 public static final int DELETETOMATCH = 5;
114 public static final int DELETETODELETE = 6;
118 public HiddenMarkovModel()
/**
 * Copy constructor. Builds a new model whose property map, symbol list, node
 * list and lookup maps are fresh collections populated from the given model.
 * The collections themselves are copied, but the elements they hold (in
 * particular the HMMNode objects) are shared with the source model.
 * 
 * @param hmm
 *          The model to copy.
 */
public HiddenMarkovModel(HiddenMarkovModel hmm)
{
  super();
  this.fileProperties = new HashMap<>(hmm.fileProperties);
  this.symbols = new ArrayList<>(hmm.symbols);
  this.nodes = new ArrayList<>(hmm.nodes);
  this.nodeLookup = new HashMap<>(hmm.nodeLookup);
  this.symbolIndexLookup = new HashMap<>(hmm.symbolIndexLookup);
  this.numberOfSymbols = hmm.numberOfSymbols;
  // Strings are immutable, so sharing the reference is safe; this also
  // avoids the NullPointerException that new String(null) would raise when
  // the source model has no header set.
  this.fileHeader = hmm.fileHeader;
}
137 * Gets the file header of the .hmm file this model came from.
141 public String getFileHeader()
147 * Sets the file header of this model.
151 public void setFileHeader(String header)
157 * Returns the map containing the matches between nodes and alignment column
163 public Map<Integer, Integer> getNodeLookup()
169 * Returns the list of symbols used in this hidden Markov model.
173 public List<Character> getSymbols()
179 * Returns the file properties.
183 public Map<String, String> getFileProperties()
185 return fileProperties;
189 * Gets the node in the hidden Markov model at the specified position.
192 * The index of the node requested. Node 0 optionally contains the
193 * average match emission probabilities across the entire model, and
194 * always contains the insert emission probabilities and state
195 * transition probabilities for the begin node. Node 1 contains the
196 * first node in the HMM that can correspond to a column in the
200 public HMMNode getNode(int nodeIndex)
202 return getNodes().get(nodeIndex);
206 * Sets the list of symbols used in the hidden Markov model to the list
210 * The list of symbols to which the current list is to be changed.
213 public void setSymbols(List<Character> symbolsL)
215 this.symbols = symbolsL;
219 * Returns the name of the sequence alignment on which the HMM is based.
223 public String getName()
225 return fileProperties.get(NAME);
229 * Returns the accession number.
232 public String getAccessionNumber()
234 return fileProperties.get(ACCESSION_NUMBER);
238 * Returns a description of the sequence alignment on which the hidden Markov
243 public String getDescription()
245 return fileProperties.get(DESCRIPTION);
249 * Returns the length of the hidden Markov model.
253 public Integer getLength()
255 if (fileProperties.get(LENGTH) == null)
259 return Integer.parseInt(fileProperties.get(LENGTH));
263 * Returns the max instance length within the hidden Markov model.
267 public Integer getMaxInstanceLength()
269 if (fileProperties.get(MAX_LENGTH) == null)
273 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
277 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
278 * options. Other alphabets may be added.
282 public String getAlphabetType()
284 return fileProperties.get(ALPHABET);
288 * Returns the date as a String.
292 public String getDate()
294 return fileProperties.get(DATE);
298 * Returns the command line log.
302 public String getCommandLineLog()
304 return fileProperties.get(COMMAND_LOG);
308 * Returns the number of sequences on which the HMM was trained.
312 public Integer getNumberOfSequences()
314 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
318 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
322 * Returns the effective number of sequences on which the HMM was based.
326 public Double getEffectiveNumberOfSequences()
328 if (fileProperties.get(LENGTH) == null)
332 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
336 * Returns the checksum.
340 public Long getCheckSum()
342 if (fileProperties.get(LENGTH) == null)
346 return Long.parseLong(fileProperties.get(CHECK_SUM));
350 * Returns the list of nodes in this HMM.
354 public List<HMMNode> getNodes()
360 * Sets the list of nodes in this HMM to the given list.
363 * The list of nodes to which the current list of nodes is being
366 public void setNodes(List<HMMNode> nodes)
372 * Gets the match emission probability for a given symbol at a column in the
376 * The index of the alignment column, starting at index 0. Index 0
377 * usually corresponds to index 1 in the HMM.
379 * The symbol for which the desired probability is being requested.
383 public Double getMatchEmissionProbability(int alignColumn, char symbol)
388 if (!symbolIndexLookup.containsKey(symbol))
392 symbolIndex = symbolIndexLookup.get(symbol);
393 if (nodeLookup.containsKey(alignColumn))
395 nodeIndex = nodeLookup.get(alignColumn);
396 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
407 * Gets the insert emission probability for a given symbol at a column in the
411 * The index of the alignment column, starting at index 0. Index 0
412 * usually corresponds to index 1 in the HMM.
414 * The symbol for which the desired probability is being requested.
418 public Double getInsertEmissionProbability(int alignColumn, char symbol)
423 if (!symbolIndexLookup.containsKey(symbol))
427 symbolIndex = symbolIndexLookup.get(symbol);
428 if (nodeLookup.containsKey(alignColumn))
430 nodeIndex = nodeLookup.get(alignColumn);
431 probability = getNode(nodeIndex).getInsertEmissions()
443 * Gets the state transition probability for a given symbol at a column in the
447 * The index of the alignment column, starting at index 0. Index 0
448 * usually corresponds to index 1 in the HMM.
450 * The symbol for which the desired probability is being requested.
454 public Double getStateTransitionProbability(int alignColumn,
460 if (nodeLookup.containsKey(alignColumn))
462 nodeIndex = nodeLookup.get(alignColumn);
463 probability = getNode(nodeIndex).getStateTransitions()
475 * Returns the alignment column linked to the node at the given index.
478 * The index of the node, starting from index 1. Index 0 is the begin
479 * node, which does not correspond to a column in the alignment.
482 public Integer getNodeAlignmentColumn(int nodeIndex)
484 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
489 * Returns the consensus residue at the specified node.
492 * The index of the specified node.
495 public char getConsensusResidue(int nodeIndex)
497 char value = nodes.get(nodeIndex).getConsensusResidue();
502 * Returns the consensus at a given alignment column.
505 * The index of the column in the alignment for which the consensus
506 * is desired. The list of columns starts at index 0.
509 public char getConsensusAtAlignColumn(int columnIndex)
511 char mostLikely = '-';
512 if (consensusResidueIsActive())
515 Integer index = findNodeIndex(columnIndex);
520 mostLikely = getNodes().get(index).getConsensusResidue();
525 double highestProb = 0;
526 for (char character : symbols)
528 Double prob = getMatchEmissionProbability(columnIndex, character);
529 if (prob > highestProb)
532 mostLikely = character;
541 * Returns the reference annotation at the specified node.
544 * The index of the specified node.
547 public char getReferenceAnnotation(int nodeIndex)
549 char value = nodes.get(nodeIndex).getReferenceAnnotation();
554 * Returns the mask value at the specified node.
557 * The index of the specified node.
560 public char getMaskedValue(int nodeIndex)
562 char value = nodes.get(nodeIndex).getMaskValue();
567 * Returns the consensus structure at the specified node.
570 * The index of the specified node.
573 public char getConsensusStructure(int nodeIndex)
575 char value = nodes.get(nodeIndex).getConsensusStructure();
580 * Returns the average match emission probability for a given symbol
583 * The index of the symbol.
587 public double getAverageMatchEmission(int symbolIndex)
589 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
594 * Returns the number of symbols in the alphabet used in this HMM.
598 public int getNumberOfSymbols()
600 return numberOfSymbols;
604 * Fills symbol array and whilst doing so, updates the value of the number of
608 * The scanner scanning the symbol line in the file.
610 public void fillSymbols(Scanner parser)
613 while (parser.hasNext())
615 String strSymbol = parser.next();
616 char[] symbol = strSymbol.toCharArray();
617 symbols.add(symbol[0]);
618 symbolIndexLookup.put(symbol[0], i);
621 numberOfSymbols = symbols.size();
625 * Adds a file property.
630 public void addFileProperty(String key, String value)
632 fileProperties.put(key, value);
636 * Returns a boolean indicating whether the reference annotation is active.
640 public boolean referenceAnnotationIsActive()
643 status = fileProperties.get(REFERENCE_ANNOTATION);
661 * Returns a boolean indicating whether the mask value annotation is active.
665 public boolean maskValueIsActive()
668 status = fileProperties.get(MASKED_VALUE);
686 * Returns a boolean indicating whether the consensus residue annotation is
691 public boolean consensusResidueIsActive()
694 status = fileProperties.get(CONSENSUS_RESIDUE);
712 * Returns a boolean indicating whether the consensus structure annotation is
717 public boolean consensusStructureIsActive()
720 status = fileProperties.get(CONSENSUS_STRUCTURE);
738 * Returns a boolean indicating whether the MAP annotation is active.
742 public boolean mapIsActive()
745 status = fileProperties.get(MAP);
763 * Sets the alignment column of the specified node.
770 public void setAlignmentColumn(int nodeIndex, int column)
772 nodes.get(nodeIndex).setAlignmentColumn(column);
776 * Sets the reference annotation at a given node.
781 public void setReferenceAnnotation(int nodeIndex, char value)
783 nodes.get(nodeIndex).setReferenceAnnotation(value);
787 * Sets the consensus residue at a given node.
792 public void setConsensusResidue(int nodeIndex, char value)
794 nodes.get(nodeIndex).setConsensusResidue(value);
798 * Sets the consensus structure at a given node.
803 public void setConsensusStructure(int nodeIndex, char value)
805 nodes.get(nodeIndex).setConsensusStructure(value);
809 * Sets the mask value at a given node.
814 public void setMaskValue(int nodeIndex, char value)
816 nodes.get(nodeIndex).setMaskValue(value);
820 * Temporary implementation, should not be used.
824 public String getGatheringThreshold()
827 value = fileProperties.get("GA");
832 * Temporary implementation, should not be used.
836 public String getNoiseCutoff()
839 value = fileProperties.get("NC");
844 * Temporary implementation, should not be used.
848 public String getTrustedCutoff()
851 value = fileProperties.get("TC");
856 * Temporary implementation, should not be used.
860 public String getViterbi()
863 value = fileProperties.get(VITERBI);
868 * Temporary implementation, should not be used.
872 public String getMSV()
875 value = fileProperties.get(MSV);
880 * Temporary implementation, should not be used.
884 public String getForward()
887 value = fileProperties.get(FORWARD);
892 * Sets the activation status of the MAP annotation.
896 public void setMAPStatus(boolean status)
898 fileProperties.put(MAP, status ? YES : NO);
902 * Sets the activation status of the reference annotation.
906 public void setReferenceAnnotationStatus(boolean status)
908 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
912 * Sets the activation status of the mask value annotation.
916 public void setMaskedValueStatus(boolean status)
918 fileProperties.put(MASKED_VALUE, status ? YES : NO);
922 * Sets the activation status of the consensus residue annotation.
926 public void setConsensusResidueStatus(boolean status)
928 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
932 * Sets the activation status of the consensus structure annotation.
936 public void setConsensusStructureStatus(boolean status)
938 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
942 * Finds the index of the node in a hidden Markov model based on the column in
945 * @param alignmentColumn
946 * The index of the column in the alignment, with the indexes
950 public Integer findNodeIndex(int alignmentColumn)
953 index = nodeLookup.get(alignmentColumn);
958 * Finds the String values of a boolean. "yes" for true and "no" for false.
963 public static String findStringFromBoolean(boolean value)
978 * Returns the consensus sequence based on the most probable symbol at each
979 * position. The sequence is adjusted to match the length of the existing
980 * sequence alignment. Gap characters are used as padding.
983 * The length of the longest sequence in the existing alignment.
986 public Sequence getConsensusSequence()
991 start = getNodeAlignmentColumn(1);
992 modelLength = getLength();
993 end = getNodeAlignmentColumn(modelLength);
994 char[] sequence = new char[end + 1];
995 for (int index = 0; index < end + 1; index++)
999 character = getConsensusAtAlignColumn(index);
1001 if (character == null || character == '-')
1003 sequence[index] = '-';
1007 sequence[index] = Character.toUpperCase(character);
1012 Sequence seq = new Sequence(getName() + "_HMM", sequence, start,
1019 * Initiates a HMM consensus sequence
1021 * @return A new HMM consensus sequence
1023 public SequenceI initHMMSequence()
1025 Sequence consensus = getConsensusSequence();
1026 consensus.setIsHMMConsensusSequence(true);
1027 consensus.setHMM(this);