1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
5 import java.util.ArrayList;
6 import java.util.HashMap;
9 import java.util.Scanner;
12 * Data structure which stores a hidden Markov model. Currently contains file
13 * properties as well, not sure whether these should be transferred to the
19 public class HiddenMarkovModel
23 // Stores file properties. Do not directly access this field as it contains
24 // only string value - use the getter methods. For example, to find the length
25 // of theHMM, use getModelLength()to return an int value
26 Map<String, String> fileProperties = new HashMap<>();
28 // contains all of the symbols used in this model. The index of each symbol
29 // represents its lookup value
30 List<Character> symbols = new ArrayList<>();
32 // contains information for each node in the model. The begin node is at index
33 // 0. Node 0 contains average emission probabilities for each symbol
34 List<HMMNode> nodes = new ArrayList<>();
36 // contains the HMM node for each alignment column
37 Map<Integer, Integer> nodeLookup = new HashMap<>();
39 // contains the symbol index for each symbol
40 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
42 final static String YES = "yes";
44 final static String NO = "no";
48 // keys for file properties hashmap
49 private final String NAME = "NAME";
51 private final String ACCESSION_NUMBER = "ACC";
53 private final String DESCRIPTION = "DESC";
55 private final String LENGTH = "LENG";
57 private final String MAX_LENGTH = "MAXL";
59 private final String ALPHABET = "ALPH";
61 private final String DATE = "DATE";
63 private final String COMMAND_LOG = "COM";
65 private final String NUMBER_OF_SEQUENCES = "NSEQ";
67 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
69 private final String CHECK_SUM = "CKSUM";
71 private final String GATHERING_THRESHOLDS = "GA";
73 private final String TRUSTED_CUTOFFS = "TC";
75 private final String NOISE_CUTOFFS = "NC";
77 private final String STATISTICS = "STATS";
79 private final String COMPO = "COMPO";
81 private final String GATHERING_THRESHOLD = "GA";
83 private final String TRUSTED_CUTOFF = "TC";
85 private final String NOISE_CUTOFF = "NC";
87 private final String VITERBI = "VITERBI";
89 private final String MSV = "MSV";
91 private final String FORWARD = "FORWARD";
93 private final String MAP = "MAP";
95 private final String REFERENCE_ANNOTATION = "RF";
97 private final String CONSENSUS_RESIDUE = "CONS";
99 private final String CONSENSUS_STRUCTURE = "CS";
101 private final String MASKED_VALUE = "MM";
103 public static final int MATCHTOMATCH = 0;
105 public static final int MATCHTOINSERT = 1;
107 public static final int MATCHTODELETE = 2;
109 public static final int INSERTTOMATCH = 3;
111 public static final int INSERTTOINSERT = 4;
113 public static final int DELETETOMATCH = 5;
115 public static final int DELETETODELETE = 6;
118 * Returns the map containing the matches between nodes and alignment column
124 public Map<Integer, Integer> getNodeLookup()
130 * Returns the list of symbols used in this hidden Markov model.
134 public List<Character> getSymbols()
140 * Returns the file properties.
144 public Map<String, String> getFileProperties()
146 return fileProperties;
150 * Gets the node in the hidden Markov model at the specified position.
153 * The index of the node requested. Node 0 optionally contains the
154 * average match emission probabilities across the entire model, and
155 * always contains the insert emission probabilities and state
156 * transition probabilities for the begin node. Node 1 contains the
157 * first node in the HMM that can correspond to a column in the
161 public HMMNode getNode(int nodeIndex)
163 return getNodes().get(nodeIndex);
167 * Sets the list of symbols used in the hidden Markov model to the list
171 * The list of symbols to which the current list is to be changed.
174 public void setSymbols(List<Character> symbolsL)
176 this.symbols = symbolsL;
180 * Returns the name of the sequence alignment on which the HMM is based.
184 public String getName()
186 return fileProperties.get(NAME);
190 * Returns the accession number.
193 public String getAccessionNumber()
195 return fileProperties.get(ACCESSION_NUMBER);
199 * Returns a description of the sequence alignment on which the hidden Markov
204 public String getDescription()
206 return fileProperties.get(DESCRIPTION);
210 * Returns the length of the hidden Markov model.
214 public Integer getLength()
216 if (fileProperties.get(LENGTH) == null)
220 return Integer.parseInt(fileProperties.get(LENGTH));
224 * Returns the max instance length within the hidden Markov model.
228 public Integer getMaxInstanceLength()
230 if (fileProperties.get(MAX_LENGTH) == null)
234 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
238 * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
239 * options. Other alphabets may be added.
243 public String getAlphabetType()
245 return fileProperties.get(ALPHABET);
249 * Returns the date as a String.
253 public String getDate()
255 return fileProperties.get(DATE);
259 * Returns the command line log.
263 public String getCommandLineLog()
265 return fileProperties.get(COMMAND_LOG);
269 * Returns the number of sequences on which the HMM was trained.
273 public Integer getNumberOfSequences()
275 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
279 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
283 * Returns the effective number of sequences on which the HMM was based.
287 public Double getEffectiveNumberOfSequences()
289 if (fileProperties.get(LENGTH) == null)
293 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
297 * Returns the checksum.
301 public Long getCheckSum()
303 if (fileProperties.get(LENGTH) == null)
307 return Long.parseLong(fileProperties.get(CHECK_SUM));
311 * Returns the list of nodes in this HMM.
315 public List<HMMNode> getNodes()
321 * Sets the list of nodes in this HMM to the given list.
324 * The list of nodes to which the current list of nodes is being
327 public void setNodes(List<HMMNode> nodes)
333 * Gets the match emission probability for a given symbol at a column in the
337 * The index of the alignment column, starting at index 0. Index 0
338 * usually corresponds to index 1 in the HMM.
340 * The symbol for which the desired probability is being requested.
344 public Double getMatchEmissionProbability(int alignColumn, char symbol)
349 if (!symbolIndexLookup.containsKey(symbol))
353 symbolIndex = symbolIndexLookup.get(symbol);
354 if (nodeLookup.containsKey(alignColumn + 1))
356 nodeIndex = nodeLookup.get(alignColumn + 1);
357 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
368 * Gets the insert emission probability for a given symbol at a column in the
372 * The index of the alignment column, starting at index 0. Index 0
373 * usually corresponds to index 1 in the HMM.
375 * The symbol for which the desired probability is being requested.
379 public Double getInsertEmissionProbability(int alignColumn, char symbol)
384 if (!symbolIndexLookup.containsKey(symbol))
388 symbolIndex = symbolIndexLookup.get(symbol);
389 if (nodeLookup.containsKey(alignColumn + 1))
391 nodeIndex = nodeLookup.get(alignColumn + 1);
392 probability = getNode(nodeIndex).getInsertEmissions()
404 * Gets the state transition probability for a given symbol at a column in the
408 * The index of the alignment column, starting at index 0. Index 0
409 * usually corresponds to index 1 in the HMM.
411 * The symbol for which the desired probability is being requested.
415 public Double getStateTransitionProbability(int alignColumn,
421 if (nodeLookup.containsKey(alignColumn + 1))
423 nodeIndex = nodeLookup.get(alignColumn + 1);
424 probability = getNode(nodeIndex).getStateTransitions()
436 * Returns the alignment column linked to the node at the given index.
439 * The index of the node, starting from index 1. Index 0 is the begin
440 * node, which does not correspond to a column in the alignment.
443 public Integer getNodeAlignmentColumn(int nodeIndex)
445 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
450 * Returns the consensus residue at the specified node.
453 * The index of the specified node.
456 public char getConsensusResidue(int nodeIndex)
458 char value = nodes.get(nodeIndex).getConsensusResidue();
463 * Returns the consensus at a given alignment column.
466 * The index of the column in the alignment for which the consensus
467 * is desired. The list of columns starts at index 0.
470 public char getConsensusAtAlignColumn(int columnIndex)
473 Integer index = findNodeIndex(columnIndex);
478 value = getNodes().get(index).getConsensusResidue();
483 * Returns the reference annotation at the specified node.
486 * The index of the specified node.
489 public char getReferenceAnnotation(int nodeIndex)
491 char value = nodes.get(nodeIndex).getReferenceAnnotation();
496 * Returns the mask value at the specified node.
499 * The index of the specified node.
502 public char getMaskedValue(int nodeIndex)
504 char value = nodes.get(nodeIndex).getMaskValue();
509 * Returns the consensus structure at the specified node.
512 * The index of the specified node.
515 public char getConsensusStructure(int nodeIndex)
517 char value = nodes.get(nodeIndex).getConsensusStructure();
522 * Returns the average match emission probability for a given symbol
525 * The index of the symbol.
529 public double getAverageMatchEmission(int symbolIndex)
531 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
536 * Returns the number of symbols in the alphabet used in this HMM.
540 public int getNumberOfSymbols()
542 return numberOfSymbols;
546 * Fills symbol array and whilst doing so, updates the value of the number of
550 * The scanner scanning the symbol line in the file.
552 public void fillSymbols(Scanner parser)
555 while (parser.hasNext())
557 String strSymbol = parser.next();
558 char[] symbol = strSymbol.toCharArray();
559 symbols.add(symbol[0]);
560 symbolIndexLookup.put(symbol[0], i);
563 numberOfSymbols = symbols.size();
567 * Adds a file property.
572 public void addFileProperty(String key, String value)
574 fileProperties.put(key, value);
578 * Returns a boolean indicating whether the reference annotation is active.
582 public boolean referenceAnnotationIsActive()
585 status = fileProperties.get(REFERENCE_ANNOTATION);
603 * Returns a boolean indicating whether the mask value annotation is active.
607 public boolean maskValueIsActive()
610 status = fileProperties.get(MASKED_VALUE);
628 * Returns a boolean indicating whether the consensus residue annotation is
633 public boolean consensusResidueIsActive()
636 status = fileProperties.get(CONSENSUS_RESIDUE);
654 * Returns a boolean indicating whether the consensus structure annotation is
659 public boolean consensusStructureIsActive()
662 status = fileProperties.get(CONSENSUS_STRUCTURE);
680 * Returns a boolean indicating whether the MAP annotation is active.
684 public boolean mapIsActive()
687 status = fileProperties.get(MAP);
705 * Sets the alignment column of the specified node.
712 public void setAlignmentColumn(int nodeIndex, int column)
714 nodes.get(nodeIndex).setAlignmentColumn(column);
718 * Sets the reference annotation at a given node.
723 public void setReferenceAnnotation(int nodeIndex, char value)
725 nodes.get(nodeIndex).setReferenceAnnotation(value);
729 * Sets the consensus residue at a given node.
734 public void setConsensusResidue(int nodeIndex, char value)
736 nodes.get(nodeIndex).setConsensusResidue(value);
740 * Sets the consensus structure at a given node.
745 public void setConsensusStructure(int nodeIndex, char value)
747 nodes.get(nodeIndex).setConsensusStructure(value);
751 * Sets the mask value at a given node.
756 public void setMaskValue(int nodeIndex, char value)
758 nodes.get(nodeIndex).setMaskValue(value);
762 * Temporary implementation, should not be used.
766 public String getGatheringThreshold()
769 value = fileProperties.get("GA");
774 * Temporary implementation, should not be used.
778 public String getNoiseCutoff()
781 value = fileProperties.get("NC");
786 * Temporary implementation, should not be used.
790 public String getTrustedCutoff()
793 value = fileProperties.get("TC");
798 * Temporary implementation, should not be used.
802 public String getViterbi()
805 value = fileProperties.get(VITERBI);
810 * Temporary implementation, should not be used.
814 public String getMSV()
817 value = fileProperties.get(MSV);
822 * Temporary implementation, should not be used.
826 public String getForward()
829 value = fileProperties.get(FORWARD);
834 * Sets the activation status of the MAP annotation.
838 public void setMAPStatus(boolean status)
840 fileProperties.put(MAP, status ? YES : NO);
844 * Sets the activation status of the reference annotation.
848 public void setReferenceAnnotationStatus(boolean status)
850 fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
854 * Sets the activation status of the mask value annotation.
858 public void setMaskedValueStatus(boolean status)
860 fileProperties.put(MASKED_VALUE, status ? YES : NO);
864 * Sets the activation status of the consensus residue annotation.
868 public void setConsensusResidueStatus(boolean status)
870 fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
874 * Sets the activation status of the consensus structure annotation.
878 public void setConsensusStructureStatus(boolean status)
880 fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
884 * Finds the index of the node in a hidden Markov model based on the column in
887 * @param alignmentColumn
888 * The index of the column in the alignment, with the indexes
892 public Integer findNodeIndex(int alignmentColumn)
895 index = nodeLookup.get(alignmentColumn + 1);
900 * Finds the String values of a boolean. "yes" for true and "no" for false.
905 public static String findStringFromBoolean(boolean value)
918 * Creates the HMM Logo alignment annotation, and populates it with
919 * information content data.
921 * @return The alignment annotation.
923 public AlignmentAnnotation createAnnotation(int length)
925 Annotation[] annotations = new Annotation[length];
927 for (int alignPos = 0; alignPos < length; alignPos++)
929 Float content = getInformationContent(alignPos);
936 cons = getConsensusAtAlignColumn(alignPos);
937 cons = Character.toUpperCase(cons);
939 String description = String.format("%.3f", content);
940 description += " bits";
941 annotations[alignPos] = new Annotation(cons.toString(), description,
946 AlignmentAnnotation annotation = new AlignmentAnnotation(
947 "Information Content",
948 "The information content of each column, measured in bits",
950 0f, max, AlignmentAnnotation.BAR_GRAPH);
955 * Returns the information content at a specified column.
958 * Index of the column, starting from 0.
961 public float getInformationContent(int column)
963 float informationContent = 0f;
965 for (char symbol : symbols)
968 if ("amino".equals(getAlphabetType()))
970 freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
972 if ("DNA".equals(getAlphabetType()))
974 freq = ResidueProperties.dnaBackgroundFrequencies.get(symbol);
976 if ("RNA".equals(getAlphabetType()))
978 freq = ResidueProperties.rnaBackgroundFrequencies
981 Double hmmProb = getMatchEmissionProbability(column, symbol);
982 float prob = hmmProb.floatValue();
983 informationContent += prob * (Math.log(prob / freq) / Math.log(2));
987 return informationContent;
991 * Returns the consensus sequence based on the most probable symbol at each
992 * position. The sequence is adjusted to match the length of the existing
993 * sequence alignment. Gap characters are used as padding.
996 * The length of the longest sequence in the existing alignment.
999 public Sequence getConsensusSequence(int length)
1004 start = getNodeAlignmentColumn(1);
1005 modelLength = getLength();
1006 end = getNodeAlignmentColumn(modelLength);
1007 char[] sequence = new char[length];
1008 for (int index = 0; index < length; index++)
1010 Character character;
1011 if (consensusResidueIsActive())
1013 character = getConsensusAtAlignColumn(index);
1017 character = findConsensusCharacter(index);
1019 if (character == null || character == '-')
1021 sequence[index] = '-';
1025 sequence[index] = Character.toUpperCase(character);
1030 Sequence seq = new Sequence("HMM CONSENSUS", sequence, start, end);
1035 * Finds the most probable character at a column in an alignment based on the
1039 * The index of the node.
1042 Character findConsensusCharacter(int column)
1044 Character mostLikely = null;
1045 double highestProb = 0;
1046 for (char character : symbols)
1048 Double prob = getMatchEmissionProbability(column, character);
1049 if (prob > highestProb)
1052 mostLikely = character;