1 package jalview.datamodel;
3 import jalview.gui.AlignFrame;
4 import jalview.schemes.ResidueProperties;
6 import java.util.ArrayList;
7 import java.util.HashMap;
10 import java.util.Scanner;
13 * Data structure which stores a hidden Markov model. Currently contains file
14 * properties as well, not sure whether these should be transferred to the
20 public class HiddenMarkovModel
24 // Stores file properties. Do not directly access this field as it contains
25 // only string value - use the getter methods. For example, to find the length
26 // of theHMM, use getModelLength()to return an int value
27 Map<String, String> fileProperties = new HashMap<>();
29 // contains all of the symbols used in this model. The index of each symbol
30 // represents its lookup value
31 List<Character> symbols = new ArrayList<>();
33 // contains information for each node in the model. The begin node is at index
34 // 0. Node 0 contains average emission probabilities for each symbol
35 List<HMMNode> nodes = new ArrayList<>();
37 // contains the HMM node for each alignment column
38 Map<Integer, Integer> nodeLookup = new HashMap<>();
40 // contains the symbol index for each symbol
41 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
43 final static String YES = "yes";
45 final static String NO = "no";
49 // keys for file properties hashmap
50 private final String NAME = "NAME";
52 private final String ACCESSION_NUMBER = "ACC";
54 private final String DESCRIPTION = "DESC";
56 private final String LENGTH = "LENG";
58 private final String MAX_LENGTH = "MAXL";
60 private final String ALPHABET = "ALPH";
62 private final String DATE = "DATE";
64 private final String COMMAND_LOG = "COM";
66 private final String NUMBER_OF_SEQUENCES = "NSEQ";
68 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
70 private final String CHECK_SUM = "CKSUM";
72 private final String GATHERING_THRESHOLDS = "GA";
74 private final String TRUSTED_CUTOFFS = "TC";
76 private final String NOISE_CUTOFFS = "NC";
78 private final String STATISTICS = "STATS";
80 private final String COMPO = "COMPO";
82 private final String GATHERING_THRESHOLD = "GA";
84 private final String TRUSTED_CUTOFF = "TC";
86 private final String NOISE_CUTOFF = "NC";
88 private final String VITERBI = "VITERBI";
90 private final String MSV = "MSV";
92 private final String FORWARD = "FORWARD";
94 private final String MAP = "MAP";
96 private final String REFERENCE_ANNOTATION = "RF";
98 private final String CONSENSUS_RESIDUE = "CONS";
100 private final String CONSENSUS_STRUCTURE = "CS";
102 private final String MASKED_VALUE = "MM";
104 public static final int MATCHTOMATCH = 0;
106 public static final int MATCHTOINSERT = 1;
108 public static final int MATCHTODELETE = 2;
110 public static final int INSERTTOMATCH = 3;
112 public static final int INSERTTOINSERT = 4;
114 public static final int DELETETOMATCH = 5;
116 public static final int DELETETODELETE = 6;
120 public HiddenMarkovModel()
125 public HiddenMarkovModel(HiddenMarkovModel hmm)
128 this.fileProperties = hmm.fileProperties;
129 this.symbols = hmm.symbols;
130 this.nodes = hmm.nodes;
131 this.nodeLookup = hmm.nodeLookup;
132 this.symbolIndexLookup = hmm.symbolIndexLookup;
133 this.numberOfSymbols = hmm.numberOfSymbols;
134 this.fileHeader = hmm.fileHeader;
138 * Gets the file header of the .hmm file this model came from.
142 public String getFileHeader()
148 * Sets the file header of this model.
152 public void setFileHeader(String header)
158 * Returns the map containing the matches between nodes and alignment column
164 public Map<Integer, Integer> getNodeLookup()
170 * Returns the list of symbols used in this hidden Markov model.
174 public List<Character> getSymbols()
180 * Returns the file properties.
184 public Map<String, String> getFileProperties()
186 return fileProperties;
190 * Gets the node in the hidden Markov model at the specified position.
193 * The index of the node requested. Node 0 optionally contains the
194 * average match emission probabilities across the entire model, and
195 * always contains the insert emission probabilities and state
196 * transition probabilities for the begin node. Node 1 contains the
197 * first node in the HMM that can correspond to a column in the
201 public HMMNode getNode(int nodeIndex)
203 return getNodes().get(nodeIndex);
207 * Sets the list of symbols used in the hidden Markov model to the list
211 * The list of symbols to which the current list is to be changed.
214 public void setSymbols(List<Character> symbolsL)
216 this.symbols = symbolsL;
220 * Returns the name of the sequence alignment on which the HMM is based.
224 public String getName()
226 return fileProperties.get(NAME);
230 * Returns the accession number.
233 public String getAccessionNumber()
235 return fileProperties.get(ACCESSION_NUMBER);
239 * Returns a description of the sequence alignment on which the hidden Markov
244 public String getDescription()
246 return fileProperties.get(DESCRIPTION);
250 * Returns the length of the hidden Markov model.
254 public Integer getLength()
256 if (fileProperties.get(LENGTH) == null)
260 return Integer.parseInt(fileProperties.get(LENGTH));
264 * Returns the max instance length within the hidden Markov model.
268 public Integer getMaxInstanceLength()
270 if (fileProperties.get(MAX_LENGTH) == null)
274 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
278 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
279 * options. Other alphabets may be added.
283 public String getAlphabetType()
285 return fileProperties.get(ALPHABET);
289 * Returns the date as a String.
293 public String getDate()
295 return fileProperties.get(DATE);
299 * Returns the command line log.
303 public String getCommandLineLog()
305 return fileProperties.get(COMMAND_LOG);
309 * Returns the number of sequences on which the HMM was trained.
313 public Integer getNumberOfSequences()
315 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
319 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
323 * Returns the effective number of sequences on which the HMM was based.
327 public Double getEffectiveNumberOfSequences()
329 if (fileProperties.get(LENGTH) == null)
333 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
337 * Returns the checksum.
341 public Long getCheckSum()
343 if (fileProperties.get(LENGTH) == null)
347 return Long.parseLong(fileProperties.get(CHECK_SUM));
351 * Returns the list of nodes in this HMM.
355 public List<HMMNode> getNodes()
361 * Sets the list of nodes in this HMM to the given list.
364 * The list of nodes to which the current list of nodes is being
367 public void setNodes(List<HMMNode> nodes)
373 * Gets the match emission probability for a given symbol at a column in the
377 * The index of the alignment column, starting at index 0. Index 0
378 * usually corresponds to index 1 in the HMM.
380 * The symbol for which the desired probability is being requested.
384 public Double getMatchEmissionProbability(int alignColumn, char symbol)
389 if (!symbolIndexLookup.containsKey(symbol))
393 symbolIndex = symbolIndexLookup.get(symbol);
394 if (nodeLookup.containsKey(alignColumn + 1))
396 nodeIndex = nodeLookup.get(alignColumn + 1);
397 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
408 * Gets the insert emission probability for a given symbol at a column in the
412 * The index of the alignment column, starting at index 0. Index 0
413 * usually corresponds to index 1 in the HMM.
415 * The symbol for which the desired probability is being requested.
419 public Double getInsertEmissionProbability(int alignColumn, char symbol)
424 if (!symbolIndexLookup.containsKey(symbol))
428 symbolIndex = symbolIndexLookup.get(symbol);
429 if (nodeLookup.containsKey(alignColumn + 1))
431 nodeIndex = nodeLookup.get(alignColumn + 1);
432 probability = getNode(nodeIndex).getInsertEmissions()
444 * Gets the state transition probability for a given symbol at a column in the
448 * The index of the alignment column, starting at index 0. Index 0
449 * usually corresponds to index 1 in the HMM.
451 * The symbol for which the desired probability is being requested.
455 public Double getStateTransitionProbability(int alignColumn,
461 if (nodeLookup.containsKey(alignColumn + 1))
463 nodeIndex = nodeLookup.get(alignColumn + 1);
464 probability = getNode(nodeIndex).getStateTransitions()
476 * Returns the alignment column linked to the node at the given index.
479 * The index of the node, starting from index 1. Index 0 is the begin
480 * node, which does not correspond to a column in the alignment.
483 public Integer getNodeAlignmentColumn(int nodeIndex)
485 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
490 * Returns the consensus residue at the specified node.
493 * The index of the specified node.
496 public char getConsensusResidue(int nodeIndex)
498 char value = nodes.get(nodeIndex).getConsensusResidue();
503 * Returns the consensus at a given alignment column.
506 * The index of the column in the alignment for which the consensus
507 * is desired. The list of columns starts at index 0.
510 public char getConsensusAtAlignColumn(int columnIndex)
512 char mostLikely = '-';
513 if (consensusResidueIsActive())
516 Integer index = findNodeIndex(columnIndex);
521 mostLikely = getNodes().get(index).getConsensusResidue();
526 double highestProb = 0;
527 for (char character : symbols)
529 Double prob = getMatchEmissionProbability(columnIndex, character);
530 if (prob > highestProb)
533 mostLikely = character;
542 * Returns the reference annotation at the specified node.
545 * The index of the specified node.
548 public char getReferenceAnnotation(int nodeIndex)
550 char value = nodes.get(nodeIndex).getReferenceAnnotation();
555 * Returns the mask value at the specified node.
558 * The index of the specified node.
561 public char getMaskedValue(int nodeIndex)
563 char value = nodes.get(nodeIndex).getMaskValue();
568 * Returns the consensus structure at the specified node.
571 * The index of the specified node.
574 public char getConsensusStructure(int nodeIndex)
576 char value = nodes.get(nodeIndex).getConsensusStructure();
581 * Returns the average match emission probability for a given symbol
584 * The index of the symbol.
588 public double getAverageMatchEmission(int symbolIndex)
590 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
595 * Returns the number of symbols in the alphabet used in this HMM.
599 public int getNumberOfSymbols()
601 return numberOfSymbols;
605 * Fills symbol array and whilst doing so, updates the value of the number of
609 * The scanner scanning the symbol line in the file.
611 public void fillSymbols(Scanner parser)
614 while (parser.hasNext())
616 String strSymbol = parser.next();
617 char[] symbol = strSymbol.toCharArray();
618 symbols.add(symbol[0]);
619 symbolIndexLookup.put(symbol[0], i);
622 numberOfSymbols = symbols.size();
626 * Adds a file property.
631 public void addFileProperty(String key, String value)
633 fileProperties.put(key, value);
637 * Returns a boolean indicating whether the reference annotation is active.
641 public boolean referenceAnnotationIsActive()
644 status = fileProperties.get(REFERENCE_ANNOTATION);
662 * Returns a boolean indicating whether the mask value annotation is active.
666 public boolean maskValueIsActive()
669 status = fileProperties.get(MASKED_VALUE);
687 * Returns a boolean indicating whether the consensus residue annotation is
692 public boolean consensusResidueIsActive()
695 status = fileProperties.get(CONSENSUS_RESIDUE);
713 * Returns a boolean indicating whether the consensus structure annotation is
718 public boolean consensusStructureIsActive()
721 status = fileProperties.get(CONSENSUS_STRUCTURE);
739 * Returns a boolean indicating whether the MAP annotation is active.
743 public boolean mapIsActive()
746 status = fileProperties.get(MAP);
764 * Sets the alignment column of the specified node.
771 public void setAlignmentColumn(int nodeIndex, int column)
773 nodes.get(nodeIndex).setAlignmentColumn(column);
777 * Sets the reference annotation at a given node.
782 public void setReferenceAnnotation(int nodeIndex, char value)
784 nodes.get(nodeIndex).setReferenceAnnotation(value);
788 * Sets the consensus residue at a given node.
793 public void setConsensusResidue(int nodeIndex, char value)
795 nodes.get(nodeIndex).setConsensusResidue(value);
799 * Sets the consensus structure at a given node.
804 public void setConsensusStructure(int nodeIndex, char value)
806 nodes.get(nodeIndex).setConsensusStructure(value);
810 * Sets the mask value at a given node.
815 public void setMaskValue(int nodeIndex, char value)
817 nodes.get(nodeIndex).setMaskValue(value);
821 * Temporary implementation, should not be used.
825 public String getGatheringThreshold()
828 value = fileProperties.get("GA");
833 * Temporary implementation, should not be used.
837 public String getNoiseCutoff()
840 value = fileProperties.get("NC");
845 * Temporary implementation, should not be used.
849 public String getTrustedCutoff()
852 value = fileProperties.get("TC");
857 * Temporary implementation, should not be used.
861 public String getViterbi()
864 value = fileProperties.get(VITERBI);
869 * Temporary implementation, should not be used.
873 public String getMSV()
876 value = fileProperties.get(MSV);
881 * Temporary implementation, should not be used.
885 public String getForward()
888 value = fileProperties.get(FORWARD);
893 * Sets the activation status of the MAP annotation.
897 public void setMAPStatus(boolean status)
899 fileProperties.put(MAP, status ? YES : NO);
903 * Sets the activation status of the reference annotation.
907 public void setReferenceAnnotationStatus(boolean status)
909 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
913 * Sets the activation status of the mask value annotation.
917 public void setMaskedValueStatus(boolean status)
919 fileProperties.put(MASKED_VALUE, status ? YES : NO);
923 * Sets the activation status of the consensus residue annotation.
927 public void setConsensusResidueStatus(boolean status)
929 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
933 * Sets the activation status of the consensus structure annotation.
937 public void setConsensusStructureStatus(boolean status)
939 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
943 * Finds the index of the node in a hidden Markov model based on the column in
946 * @param alignmentColumn
947 * The index of the column in the alignment, with the indexes
951 public Integer findNodeIndex(int alignmentColumn)
954 index = nodeLookup.get(alignmentColumn + 1);
959 * Finds the String values of a boolean. "yes" for true and "no" for false.
964 public static String findStringFromBoolean(boolean value)
977 * Creates the HMM Logo alignment annotation, and populates it with
978 * information content data.
980 * @return The alignment annotation.
982 public AlignmentAnnotation createAnnotation(int length)
984 Annotation[] annotations = new Annotation[length];
986 for (int alignPos = 0; alignPos < length; alignPos++)
988 Float content = getInformationContent(alignPos);
996 cons = getConsensusAtAlignColumn(alignPos);
998 cons = Character.toUpperCase(cons);
1000 String description = String.format("%.3f", content);
1001 description += " bits";
1002 annotations[alignPos] = new Annotation(cons.toString(), description,
1007 AlignmentAnnotation annotation = new AlignmentAnnotation(
1009 "The information content of each column, measured in bits",
1011 0f, max, AlignmentAnnotation.BAR_GRAPH);
1012 annotation.setHMM(this);
1017 * Returns the information content at a specified column.
1020 * Index of the column, starting from 0.
1023 public float getInformationContent(int column)
1025 float informationContent = 0f;
1027 for (char symbol : symbols)
1030 if ("amino".equals(getAlphabetType()))
1032 freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
1034 if ("DNA".equals(getAlphabetType()))
1036 freq = ResidueProperties.dnaBackgroundFrequencies.get(symbol);
1038 if ("RNA".equals(getAlphabetType()))
1040 freq = ResidueProperties.rnaBackgroundFrequencies
1043 Double hmmProb = getMatchEmissionProbability(column, symbol);
1044 float prob = hmmProb.floatValue();
1045 informationContent += prob * (Math.log(prob / freq) / Math.log(2));
1049 return informationContent;
1053 * Returns the consensus sequence based on the most probable symbol at each
1054 * position. The sequence is adjusted to match the length of the existing
1055 * sequence alignment. Gap characters are used as padding.
1058 * The length of the longest sequence in the existing alignment.
1061 public Sequence getConsensusSequence(int length)
1066 start = getNodeAlignmentColumn(1);
1067 modelLength = getLength();
1068 end = getNodeAlignmentColumn(modelLength);
1069 char[] sequence = new char[length];
1070 for (int index = 0; index < length; index++)
1072 Character character;
1074 character = getConsensusAtAlignColumn(index);
1076 if (character == null || character == '-')
1078 sequence[index] = '-';
1082 sequence[index] = Character.toUpperCase(character);
1087 Sequence seq = new Sequence("HMM CONSENSUS", sequence, start, end);
1093 * Maps the nodes of the hidden Markov model to the reference annotation and
1094 * then deletes this annotation.
1096 public void mapToReferenceAnnotation(AlignFrame af)
1098 AlignmentAnnotation annotArray[] = af.getViewport().getAlignment()
1099 .getAlignmentAnnotation();
1101 AlignmentAnnotation reference = null;
1102 for (AlignmentAnnotation annot : annotArray)
1104 if (annot.label.contains("Reference"))
1110 if (reference == null)
1115 Annotation[] annots = reference.annotations;
1118 for (int col = 0; col < annots.length; col++)
1120 String character = annots[col].displayCharacter;
1121 if ("x".equals(character) || "X".equals(character))
1124 if (nodeIndex < nodes.size())
1126 nodes.get(nodeIndex).setAlignmentColumn(col + 1);
1127 nodeLookup.put(col + 1, nodeIndex);
1132 "The reference annotation contains more consensus columns than the hidden Markov model");
1138 nodeLookup.remove(col + 1);
1143 af.getViewport().getAlignment().deleteAnnotation(reference);
1146 public void initPlaceholder(AlignFrame af)
1148 AlignmentI alignment = af.getViewport().getAlignment();
1149 int length = alignment.getWidth();
1150 Sequence consensus = getConsensusSequence(length);
1151 consensus.setHMM(this);
1152 SequenceI[] consensusArr = new Sequence[] { consensus };
1153 AlignmentI newAlignment = new Alignment(consensusArr);
1154 newAlignment.append(alignment);
1155 af.getViewport().setAlignment(newAlignment);