1 package jalview.datamodel;
3 import jalview.gui.AlignFrame;
5 import java.util.ArrayList;
6 import java.util.HashMap;
9 import java.util.Scanner;
12 * Data structure which stores a hidden Markov model. Currently contains file
13 * properties as well, not sure whether these should be transferred to the
19 public class HiddenMarkovModel
23 // Stores file properties. Do not directly access this field as it contains
24 // only string value - use the getter methods. For example, to find the length
25 // of theHMM, use getModelLength()to return an int value
26 Map<String, String> fileProperties = new HashMap<>();
28 // contains all of the symbols used in this model. The index of each symbol
29 // represents its lookup value
30 List<Character> symbols = new ArrayList<>();
32 // contains information for each node in the model. The begin node is at index
33 // 0. Node 0 contains average emission probabilities for each symbol
34 List<HMMNode> nodes = new ArrayList<>();
36 // contains the HMM node for each alignment column, alignment columns start at
38 Map<Integer, Integer> nodeLookup = new HashMap<>();
40 // contains the symbol index for each symbol
41 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
43 final static String YES = "yes";
45 final static String NO = "no";
49 // keys for file properties hashmap
50 private final String NAME = "NAME";
52 private final String ACCESSION_NUMBER = "ACC";
54 private final String DESCRIPTION = "DESC";
56 private final String LENGTH = "LENG";
58 private final String MAX_LENGTH = "MAXL";
60 private final String ALPHABET = "ALPH";
62 private final String DATE = "DATE";
64 private final String COMMAND_LOG = "COM";
66 private final String NUMBER_OF_SEQUENCES = "NSEQ";
68 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
70 private final String CHECK_SUM = "CKSUM";
72 private final String GATHERING_THRESHOLDS = "GA";
74 private final String TRUSTED_CUTOFFS = "TC";
76 private final String NOISE_CUTOFFS = "NC";
78 private final String STATISTICS = "STATS";
80 private final String COMPO = "COMPO";
82 private final String GATHERING_THRESHOLD = "GA";
84 private final String TRUSTED_CUTOFF = "TC";
86 private final String NOISE_CUTOFF = "NC";
88 private final String VITERBI = "VITERBI";
90 private final String MSV = "MSV";
92 private final String FORWARD = "FORWARD";
94 private final String MAP = "MAP";
96 private final String REFERENCE_ANNOTATION = "RF";
98 private final String CONSENSUS_RESIDUE = "CONS";
100 private final String CONSENSUS_STRUCTURE = "CS";
102 private final String MASKED_VALUE = "MM";
104 public static final int MATCHTOMATCH = 0;
106 public static final int MATCHTOINSERT = 1;
108 public static final int MATCHTODELETE = 2;
110 public static final int INSERTTOMATCH = 3;
112 public static final int INSERTTOINSERT = 4;
114 public static final int DELETETOMATCH = 5;
116 public static final int DELETETODELETE = 6;
120 public HiddenMarkovModel()
/**
 * Copy constructor. Creates a model whose property map, symbol list, node
 * list and lookup maps are new collections holding the same entries as those
 * of the given model.
 * <p>
 * Only the collections are copied: the HMMNode objects held in the node list
 * are shared with the source model.
 *
 * @param hmm
 *          The model to copy.
 */
public HiddenMarkovModel(HiddenMarkovModel hmm)
{
  this.fileProperties = new HashMap<>(hmm.fileProperties);
  this.symbols = new ArrayList<>(hmm.symbols);
  this.nodes = new ArrayList<>(hmm.nodes);
  this.nodeLookup = new HashMap<>(hmm.nodeLookup);
  this.symbolIndexLookup = new HashMap<>(hmm.symbolIndexLookup);
  this.numberOfSymbols = hmm.numberOfSymbols;
  // Strings are immutable, so the reference can be shared safely. Wrapping it
  // in new String(...) would throw a NullPointerException whenever the source
  // model has no file header.
  this.fileHeader = hmm.fileHeader;
}
139 * Gets the file header of the .hmm file this model came from.
143 public String getFileHeader()
149 * Sets the file header of this model.
153 public void setFileHeader(String header)
159 * Returns the map containing the matches between nodes and alignment column
165 public Map<Integer, Integer> getNodeLookup()
171 * Returns the list of symbols used in this hidden Markov model.
175 public List<Character> getSymbols()
181 * Returns the file properties.
185 public Map<String, String> getFileProperties()
187 return fileProperties;
191 * Gets the node in the hidden Markov model at the specified position.
194 * The index of the node requested. Node 0 optionally contains the
195 * average match emission probabilities across the entire model, and
196 * always contains the insert emission probabilities and state
197 * transition probabilities for the begin node. Node 1 contains the
198 * first node in the HMM that can correspond to a column in the
202 public HMMNode getNode(int nodeIndex)
204 return getNodes().get(nodeIndex);
208 * Sets the list of symbols used in the hidden Markov model to the list
212 * The list of symbols to which the current list is to be changed.
215 public void setSymbols(List<Character> symbolsL)
217 this.symbols = symbolsL;
221 * Returns the name of the sequence alignment on which the HMM is based.
225 public String getName()
227 return fileProperties.get(NAME);
231 * Returns the accession number.
234 public String getAccessionNumber()
236 return fileProperties.get(ACCESSION_NUMBER);
240 * Returns a description of the sequence alignment on which the hidden Markov
245 public String getDescription()
247 return fileProperties.get(DESCRIPTION);
251 * Returns the length of the hidden Markov model.
// Reads the LENG file property and parses it as an integer.
// NOTE(review): the body of the null check is not visible in this excerpt;
// presumably the method returns null when the property is absent - confirm.
255 public Integer getLength()
257 if (fileProperties.get(LENGTH) == null)
// Integer.parseInt throws NumberFormatException if the stored text is not a
// valid integer.
261 return Integer.parseInt(fileProperties.get(LENGTH));
265 * Returns the max instance length within the hidden Markov model.
// Reads the MAXL file property and parses it as an integer.
// NOTE(review): the body of the null check is not visible in this excerpt;
// presumably the method returns null when the property is absent - confirm.
269 public Integer getMaxInstanceLength()
271 if (fileProperties.get(MAX_LENGTH) == null)
// Integer.parseInt throws NumberFormatException if the stored text is not a
// valid integer.
275 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
279 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
280 * options. Other alphabets may be added.
284 public String getAlphabetType()
286 return fileProperties.get(ALPHABET);
290 * Returns the date as a String.
294 public String getDate()
296 return fileProperties.get(DATE);
300 * Returns the command line log.
304 public String getCommandLineLog()
306 return fileProperties.get(COMMAND_LOG);
310 * Returns the number of sequences on which the HMM was trained.
// Reads the NSEQ file property and parses it as an integer.
// NOTE(review): the body of the null check is not visible in this excerpt;
// presumably the method returns null when the property is absent - confirm.
314 public Integer getNumberOfSequences()
316 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
// Integer.parseInt throws NumberFormatException if the stored text is not a
// valid integer.
320 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
324 * Returns the effective number of sequences on which the HMM was based.
328 public Double getEffectiveNumberOfSequences()
330 if (fileProperties.get(LENGTH) == null)
334 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
338 * Returns the checksum.
342 public Long getCheckSum()
344 if (fileProperties.get(LENGTH) == null)
348 return Long.parseLong(fileProperties.get(CHECK_SUM));
352 * Returns the list of nodes in this HMM.
356 public List<HMMNode> getNodes()
362 * Sets the list of nodes in this HMM to the given list.
365 * The list of nodes to which the current list of nodes is being
368 public void setNodes(List<HMMNode> nodes)
374 * Gets the match emission probability for a given symbol at a column in the
378 * The index of the alignment column, starting at index 0. Index 0
379 * usually corresponds to index 1 in the HMM.
381 * The symbol for which the desired probability is being requested.
// Looks up the match emission probability of the given symbol at the HMM
// node mapped to the given alignment column.
385 public Double getMatchEmissionProbability(int alignColumn, char symbol)
// Symbols without an entry in symbolIndexLookup take a separate branch.
// NOTE(review): that branch's body is not visible in this excerpt - confirm
// what is returned for an unknown symbol.
390 if (!symbolIndexLookup.containsKey(symbol))
394 symbolIndex = symbolIndexLookup.get(symbol);
// The emission is read only when the alignment column is mapped to a node.
395 if (nodeLookup.containsKey(alignColumn))
397 nodeIndex = nodeLookup.get(alignColumn);
// The symbol index selects the entry in the node's match emission list.
398 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
409 * Gets the insert emission probability for a given symbol at a column in the
413 * The index of the alignment column, starting at index 0. Index 0
414 * usually corresponds to index 1 in the HMM.
416 * The symbol for which the desired probability is being requested.
// Looks up the insert emission probability of the given symbol at the HMM
// node mapped to the given alignment column.
420 public Double getInsertEmissionProbability(int alignColumn, char symbol)
// Symbols without an entry in symbolIndexLookup take a separate branch.
// NOTE(review): that branch's body is not visible in this excerpt - confirm
// what is returned for an unknown symbol.
425 if (!symbolIndexLookup.containsKey(symbol))
429 symbolIndex = symbolIndexLookup.get(symbol);
// The emission is read only when the alignment column is mapped to a node.
430 if (nodeLookup.containsKey(alignColumn))
432 nodeIndex = nodeLookup.get(alignColumn);
// NOTE(review): the rest of this statement is not visible here; presumably
// it indexes the insert emission list by symbolIndex - confirm.
433 probability = getNode(nodeIndex).getInsertEmissions()
445 * Gets the state transition probability for a given symbol at a column in the
449 * The index of the alignment column, starting at index 0. Index 0
450 * usually corresponds to index 1 in the HMM.
452 * The symbol for which the desired probability is being requested.
// Looks up a state transition probability at the HMM node mapped to the
// given alignment column.
// NOTE(review): the remaining parameter(s) are not visible in this excerpt;
// presumably one of the transition-type constants (MATCHTOMATCH ...
// DELETETODELETE) is passed to select the transition - confirm.
456 public Double getStateTransitionProbability(int alignColumn,
// The transition is read only when the alignment column is mapped to a node.
462 if (nodeLookup.containsKey(alignColumn))
464 nodeIndex = nodeLookup.get(alignColumn);
465 probability = getNode(nodeIndex).getStateTransitions()
477 * Returns the alignment column linked to the node at the given index.
480 * The index of the node, starting from index 1. Index 0 is the begin
481 * node, which does not correspond to a column in the alignment.
484 public Integer getNodeAlignmentColumn(int nodeIndex)
486 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
491 * Returns the consensus residue at the specified node.
494 * The index of the specified node.
497 public char getConsensusResidue(int nodeIndex)
499 char value = nodes.get(nodeIndex).getConsensusResidue();
504 * Returns the consensus at a given alignment column.
507 * The index of the column in the alignment for which the consensus
508 * is desired. The list of columns starts at index 0.
// Determines the consensus character for an alignment column, either from
// the consensus residue stored on the mapped node or from the symbol with the
// highest match emission probability.
511 public char getConsensusAtAlignColumn(int columnIndex)
// '-' is the initial value of the result.
513 char mostLikely = '-';
// When the consensus residue annotation is active, the residue stored on the
// node mapped to this column is used.
514 if (consensusResidueIsActive())
// findNodeIndex returns null for a column that is not mapped to a node.
// NOTE(review): the handling of a null index is not visible in this excerpt
// - confirm it is checked before the unboxing in get(index) below.
517 Integer index = findNodeIndex(columnIndex);
522 mostLikely = getNodes().get(index).getConsensusResidue();
// Scan over every symbol, tracking the highest match emission probability
// (presumably the alternative branch; its keyword is not visible here).
527 double highestProb = 0;
528 for (char character : symbols)
530 Double prob = getMatchEmissionProbability(columnIndex, character);
// The comparison unboxes prob, so it relies on the lookup never returning
// null.
531 if (prob > highestProb)
534 mostLikely = character;
543 * Returns the reference annotation at the specified node.
546 * The index of the specified node.
549 public char getReferenceAnnotation(int nodeIndex)
551 char value = nodes.get(nodeIndex).getReferenceAnnotation();
556 * Returns the mask value at the specified node.
559 * The index of the specified node.
562 public char getMaskedValue(int nodeIndex)
564 char value = nodes.get(nodeIndex).getMaskValue();
569 * Returns the consensus structure at the specified node.
572 * The index of the specified node.
575 public char getConsensusStructure(int nodeIndex)
577 char value = nodes.get(nodeIndex).getConsensusStructure();
582 * Returns the average match emission probability for a given symbol
585 * The index of the symbol.
589 public double getAverageMatchEmission(int symbolIndex)
591 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
596 * Returns the number of symbols in the alphabet used in this HMM.
600 public int getNumberOfSymbols()
602 return numberOfSymbols;
606 * Fills symbol array and whilst doing so, updates the value of the number of
610 * The scanner scanning the symbol line in the file.
612 public void fillSymbols(Scanner parser)
615 while (parser.hasNext())
617 String strSymbol = parser.next();
618 char[] symbol = strSymbol.toCharArray();
619 symbols.add(symbol[0]);
620 symbolIndexLookup.put(symbol[0], i);
623 numberOfSymbols = symbols.size();
627 * Adds a file property.
632 public void addFileProperty(String key, String value)
634 fileProperties.put(key, value);
638 * Returns a boolean indicating whether the reference annotation is active.
642 public boolean referenceAnnotationIsActive()
645 status = fileProperties.get(REFERENCE_ANNOTATION);
663 * Returns a boolean indicating whether the mask value annotation is active.
667 public boolean maskValueIsActive()
670 status = fileProperties.get(MASKED_VALUE);
688 * Returns a boolean indicating whether the consensus residue annotation is
693 public boolean consensusResidueIsActive()
696 status = fileProperties.get(CONSENSUS_RESIDUE);
714 * Returns a boolean indicating whether the consensus structure annotation is
719 public boolean consensusStructureIsActive()
722 status = fileProperties.get(CONSENSUS_STRUCTURE);
740 * Returns a boolean indicating whether the MAP annotation is active.
744 public boolean mapIsActive()
747 status = fileProperties.get(MAP);
765 * Sets the alignment column of the specified node.
772 public void setAlignmentColumn(int nodeIndex, int column)
774 nodes.get(nodeIndex).setAlignmentColumn(column);
778 * Sets the reference annotation at a given node.
783 public void setReferenceAnnotation(int nodeIndex, char value)
785 nodes.get(nodeIndex).setReferenceAnnotation(value);
789 * Sets the consensus residue at a given node.
794 public void setConsensusResidue(int nodeIndex, char value)
796 nodes.get(nodeIndex).setConsensusResidue(value);
800 * Sets the consensus structure at a given node.
805 public void setConsensusStructure(int nodeIndex, char value)
807 nodes.get(nodeIndex).setConsensusStructure(value);
811 * Sets the mask value at a given node.
816 public void setMaskValue(int nodeIndex, char value)
818 nodes.get(nodeIndex).setMaskValue(value);
822 * Temporary implementation, should not be used.
826 public String getGatheringThreshold()
829 value = fileProperties.get("GA");
834 * Temporary implementation, should not be used.
838 public String getNoiseCutoff()
841 value = fileProperties.get("NC");
846 * Temporary implementation, should not be used.
850 public String getTrustedCutoff()
853 value = fileProperties.get("TC");
858 * Temporary implementation, should not be used.
862 public String getViterbi()
865 value = fileProperties.get(VITERBI);
870 * Temporary implementation, should not be used.
874 public String getMSV()
877 value = fileProperties.get(MSV);
882 * Temporary implementation, should not be used.
886 public String getForward()
889 value = fileProperties.get(FORWARD);
894 * Sets the activation status of the MAP annotation.
898 public void setMAPStatus(boolean status)
900 fileProperties.put(MAP, status ? YES : NO);
904 * Sets the activation status of the reference annotation.
908 public void setReferenceAnnotationStatus(boolean status)
910 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
914 * Sets the activation status of the mask value annotation.
918 public void setMaskedValueStatus(boolean status)
920 fileProperties.put(MASKED_VALUE, status ? YES : NO);
924 * Sets the activation status of the consensus residue annotation.
928 public void setConsensusResidueStatus(boolean status)
930 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
934 * Sets the activation status of the consensus structure annotation.
938 public void setConsensusStructureStatus(boolean status)
940 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
944 * Finds the index of the node in a hidden Markov model based on the column in
947 * @param alignmentColumn
948 * The index of the column in the alignment, with the indexes
952 public Integer findNodeIndex(int alignmentColumn)
955 index = nodeLookup.get(alignmentColumn);
960 * Finds the String values of a boolean. "yes" for true and "no" for false.
965 public static String findStringFromBoolean(boolean value)
980 * Returns the consensus sequence based on the most probable symbol at each
981 * position. The sequence is adjusted to match the length of the existing
982 * sequence alignment. Gap characters are used as padding.
985 * The length of the longest sequence in the existing alignment.
// Builds a Sequence named "<model name>_HMM" whose residues are the consensus
// characters of alignment columns 0 to the column of the last model node.
988 public Sequence getConsensusSequence()
// The sequence start is the alignment column of node 1.
993 start = getNodeAlignmentColumn(1);
// NOTE(review): getLength() returns an Integer guarded by a null check in
// its own body; if it can return null, the lookup below would fail - confirm.
994 modelLength = getLength();
// The node at index modelLength is taken to be the last node of the model.
995 end = getNodeAlignmentColumn(modelLength);
// One slot per alignment column from 0 to end inclusive.
996 char[] sequence = new char[end + 1];
997 for (int index = 0; index < end + 1; index++)
1001 character = getConsensusAtAlignColumn(index);
// Columns without a consensus are written as gaps.
// NOTE(review): getConsensusAtAlignColumn returns a primitive char, so the
// null test looks redundant - confirm.
1003 if (character == null || character == '-')
1005 sequence[index] = '-';
// Consensus residues are stored in upper case.
1009 sequence[index] = Character.toUpperCase(character);
// NOTE(review): the remaining constructor argument(s) are not visible in
// this excerpt.
1014 Sequence seq = new Sequence(getName() + "_HMM", sequence, start,
1021 * Maps the nodes of the hidden Markov model to the reference annotation and
1022 * then deletes this annotation.
// Looks through the annotations of the alignment shown in the given frame
// for one whose label contains "Reference", maps the model's nodes to it via
// mapToReferenceAnnotation(reference, seq), and then deletes that annotation
// from the alignment.
1024 public void mapToReferenceAnnotation(AlignFrame af, SequenceI seq)
1026 AlignmentAnnotation annotArray[] = af.getViewport().getAlignment()
1027 .getAlignmentAnnotation();
1029 AlignmentAnnotation reference = null;
// Scan the annotation rows for a reference annotation.
// NOTE(review): assumes annotArray and each annot.label are non-null -
// confirm.
1030 for (AlignmentAnnotation annot : annotArray)
1032 if (annot.label.contains("Reference"))
// NOTE(review): the handling of a missing reference annotation is not
// visible in this excerpt; presumably the method returns early - confirm.
1038 if (reference == null)
1043 mapToReferenceAnnotation(reference, seq);
// The reference annotation is removed once the mapping has been applied.
1044 af.getViewport().getAlignment().deleteAnnotation(reference);
// Re-maps the nodes of the HMM attached to the given sequence onto the
// columns that the reference annotation marks with "x" or "X", moving the
// sequence character of each node to its new column.
1047 public void mapToReferenceAnnotation(AlignmentAnnotation reference,
// The model that is updated is the one attached to seq.
1050 HiddenMarkovModel hmm = seq.getHMM();
1051 Annotation[] annots = reference.annotations;
1054 for (int col = 0; col < annots.length; col++)
// NOTE(review): assumes annots[col] is never null - confirm, since a null
// entry would throw here.
1056 String character = annots[col].displayCharacter;
// A display character of "x" or "X" marks a column that takes a node.
1057 if ("x".equals(character) || "X".equals(character))
// Only map while the model still has nodes left to assign.
1060 if (nodeIndex < hmm.getNodes().size())
1062 HMMNode node = hmm.getNode(nodeIndex);
// NOTE(review): the column is read from this model while the node comes from
// seq's HMM - confirm the two are meant to be the same model.
1063 int alignPos = getNodeAlignmentColumn(nodeIndex);
// NOTE(review): the character is read before the length check below -
// confirm getCharAt tolerates positions beyond the end of the sequence.
1064 char seqCharacter = seq.getCharAt(alignPos);
// Extend the sequence when either position lies beyond its current end.
1065 if (alignPos >= seq.getLength() || col >= seq.getLength())
// NOTE(review): the padding length is computed from alignPos only, although
// the condition above also covers col - confirm col cannot exceed the padded
// length. The padding character argument is not visible in this excerpt.
1067 seq.insertCharAt(seq.getLength(),
1068 (alignPos + 1) - seq.getLength(),
// Move the character from the node's old column to the new one.
1071 seq.getSequence()[alignPos] = '-';
1072 seq.getSequence()[col] = seqCharacter;
// Record the new column on the node and in the column-to-node lookup.
1073 node.setAlignmentColumn(col);
1074 hmm.nodeLookup.put(col, nodeIndex);
// Message for the case where the annotation marks more columns than there
// are nodes (how it is emitted is not visible in this excerpt).
1079 "The reference annotation contains more consensus columns than the hidden Markov model");
// Columns that are not marked lose any existing node mapping.
1085 hmm.nodeLookup.remove(col);
// Re-maps this model's nodes onto the columns that the reference annotation
// marks with "x" or "X", updating each node's alignment column and the
// column-to-node lookup.
1093 public void mapToReferenceAnnotation(AlignmentAnnotation reference)
1095 Annotation[] annots = reference.annotations;
1098 for (int col = 0; col < annots.length; col++)
// NOTE(review): assumes annots[col] is never null - confirm, since a null
// entry would throw here.
1100 String character = annots[col].displayCharacter;
// A display character of "x" or "X" marks a column that takes a node.
1101 if ("x".equals(character) || "X".equals(character))
// Only map while the model still has nodes left to assign.
1104 if (nodeIndex < nodes.size())
1106 HMMNode node = nodes.get(nodeIndex);
// NOTE(review): the node stores col + 1 while nodeLookup is keyed by col,
// and the (reference, seq) overload stores col on the node - confirm which
// base is intended.
1107 node.setAlignmentColumn(col + 1);
1108 nodeLookup.put(col, nodeIndex);
// Message for the case where the annotation marks more columns than there
// are nodes (how it is emitted is not visible in this excerpt).
1113 "The reference annotation contains more consensus columns than the hidden Markov model");
// Columns that are not marked lose any existing node mapping.
1119 nodeLookup.remove(col);
1127 public SequenceI initHMMSequence()
1129 Sequence consensus = getConsensusSequence();
1130 consensus.setIsHMMConsensusSequence(true);
1131 consensus.setHMM(this);