1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
5 import java.util.ArrayList;
6 import java.util.HashMap;
9 import java.util.Scanner;
12 * Data structure which stores a hidden Markov model. It currently also holds the file properties; it is not yet decided whether these should be moved to the HMMFile class.
17 public class HiddenMarkovModel
19 // Stores file properties. Do not access this field directly, as it contains
20 // only string values - use the getter methods. For example, to find the length
21 // of the HMM, use getModelLength() to return an int value
22 Map<String, String> fileProperties = new HashMap<>();
24 //contains all of the symbols used in this model. The index of each symbol represents its lookup value
25 List<Character> symbols = new ArrayList<>();
27 // contains information for each node in the model. The begin node is at index
28 // 0. Node 0 contains average emission probabilities for each symbol
29 List<HMMNode> nodes = new ArrayList<>();
31 // contains the HMM node for each alignment column
32 Map<Integer, Integer> nodeLookup = new HashMap<>();
34 //contains the symbol index for each symbol
35 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
37 Map<Character, Double> backgroundFrequencies = new HashMap();
42 final static String YES = "yes";
44 final static String NO = "no";
48 //keys for file properties hashmap
49 private final String NAME = "NAME";
51 private final String ACCESSION_NUMBER = "ACC";
53 private final String DESCRIPTION = "DESC";
55 private final String LENGTH = "LENG";
57 private final String MAX_LENGTH = "MAXL";
59 private final String ALPHABET = "ALPH";
61 private final String DATE = "DATE";
63 private final String COMMAND_LOG = "COM";
65 private final String NUMBER_OF_SEQUENCES = "NSEQ";
67 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
69 private final String CHECK_SUM = "CKSUM";
71 private final String GATHERING_THRESHOLDS = "GA";
73 private final String TRUSTED_CUTOFFS = "TC";
75 private final String NOISE_CUTOFFS = "NC";
77 private final String STATISTICS = "STATS";
79 private final String COMPO = "COMPO";
81 private final String GATHERING_THRESHOLD = "GA";
83 private final String TRUSTED_CUTOFF = "TC";
85 private final String NOISE_CUTOFF = "NC";
87 private final String VITERBI = "VITERBI";
89 private final String MSV = "MSV";
91 private final String FORWARD = "FORWARD";
93 private final String MAP = "MAP";
95 private final String REFERENCE_ANNOTATION = "RF";
97 private final String CONSENSUS_RESIDUE = "CONS";
99 private final String CONSENSUS_STRUCTURE = "CS";
101 private final String MASKED_VALUE = "MM";
103 final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i",
104 "m->d", "i->m", "i->i", "d->m", "d->d" };
106 public String getTransitionType(int index)
108 return TRANSITION_TYPES[index];
111 public Map<Integer, Integer> getNodeLookup()
116 public void setNodeLookup(Map<Integer, Integer> nodeLookup)
118 this.nodeLookup = nodeLookup;
121 public String[] getTransitionTypes()
123 return TRANSITION_TYPES;
126 public List<Character> getSymbols()
131 public Map<String, String> getFileProperties()
133 return fileProperties;
136 public HMMNode getNode(int nodeIndex)
138 return getNodes().get(nodeIndex);
141 public void setSymbols(List<Character> symbolsL)
143 this.symbols = symbolsL;
146 public String getName()
148 return fileProperties.get(NAME);
150 public String getAccessionNumber()
152 return fileProperties.get(ACCESSION_NUMBER);
155 public void setAccessionNumber(String value)
157 fileProperties.put(ACCESSION_NUMBER, value);
160 public String getDescription()
162 return fileProperties.get(DESCRIPTION);
165 public void setDescription(String value)
167 fileProperties.put(DESCRIPTION, value);
170 public Integer getLength()
172 if (fileProperties.get(LENGTH) == null)
176 return Integer.parseInt(fileProperties.get(LENGTH));
179 public void setLength(int value)
181 fileProperties.put(LENGTH, String.valueOf(value));
184 public Integer getMaxInstanceLength()
186 if (fileProperties.get(MAX_LENGTH) == null)
190 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
193 public void setMaxInstanceLength(int value)
195 fileProperties.put(MAX_LENGTH, String.valueOf(value));
198 // gets type of symbol alphabet - "amino", "DNA", "RNA"
199 public String getAlphabetType()
201 return fileProperties.get(ALPHABET);
204 public void setAlphabetType(String value)
206 fileProperties.put(ALPHABET, value);
209 // not sure whether to implement this with Date object
210 public String getDate()
212 return fileProperties.get(DATE);
215 public void setDate(String value)
217 fileProperties.put(DATE, value);
220 // not sure whether to implement this
221 public String getCommandLineLog()
223 return fileProperties.get(COMMAND_LOG);
226 public void setCommandLineLog(String value)
228 fileProperties.put(COMMAND_LOG, value);
231 // gets the number of sequences that the HMM was trained on
232 public Integer getNumberOfSequences()
234 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
238 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
241 public void setNumberOfSequences(int value)
243 fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value));
246 // gets the effective number determined during sequence weighting
247 public Double getEffectiveNumberOfSequences()
249 if (fileProperties.get(LENGTH) == null)
253 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
256 public void setEffectiveNumberOfSequences(double value)
258 fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value));
261 public Long getCheckSum()
263 if (fileProperties.get(LENGTH) == null)
267 return Long.parseLong(fileProperties.get(CHECK_SUM));
270 public void setCheckSum(long value)
272 fileProperties.put(CHECK_SUM, String.valueOf(value));
275 public List<HMMNode> getNodes()
280 public void setNodes(List<HMMNode> nodes)
286 * get match emission probability for a given symbol at a column in the
294 public Double getMatchEmissionProbability(int alignColumn, char symbol)
299 if (!symbolIndexLookup.containsKey(symbol))
303 symbolIndex = symbolIndexLookup.get(symbol);
304 if (nodeLookup.containsKey(alignColumn + 1))
306 nodeIndex = nodeLookup.get(alignColumn + 1);
307 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
318 * get insert emission probability for a given symbol at a column in the
325 public Double getInsertEmissionProbability(int alignColumn, char symbol)
330 if (!symbolIndexLookup.containsKey(symbol))
334 symbolIndex = symbolIndexLookup.get(symbol);
335 if (nodeLookup.containsKey(alignColumn + 1))
337 nodeIndex = nodeLookup.get(alignColumn + 1);
338 probability = getNode(nodeIndex).getInsertEmissions()
350 * get state transition probability for a given transition type at a column in
357 public Double getStateTransitionProbability(int alignColumn,
363 transitionIndex = getTransitionType(transition);
364 if (nodeLookup.containsKey(alignColumn + 1))
366 nodeIndex = nodeLookup.get(alignColumn + 1);
367 probability = getNode(nodeIndex).getStateTransitions()
368 .get(transitionIndex);
378 public Integer getNodeAlignmentColumn(int nodeIndex)
380 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
384 public char getConsensusResidue(int nodeIndex)
386 char value = nodes.get(nodeIndex).getConsensusResidue();
390 public char getConsensusAtAlignColumn(int columnIndex)
393 Integer index = findNodeIndex(columnIndex + 1);
398 value = getNodes().get(index).getConsensusResidue();
402 public char getReferenceAnnotation(int nodeIndex)
404 char value = nodes.get(nodeIndex).getReferenceAnnotation();
408 public char getMaskedValue(int nodeIndex)
410 char value = nodes.get(nodeIndex).getMaskValue();
414 public char getConsensusStructure(int nodeIndex)
416 char value = nodes.get(nodeIndex).getConsensusStructure();
421 * returns the average match emission for a given symbol
425 * average negative log propbability of a match emission of the given symbol
427 public double getAverageMatchEmission(int symbolIndex)
429 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
433 public int getNumberOfSymbols()
435 return numberOfSymbols;
438 public void setNumberOfSymbols(int numberOfSymbols)
440 this.numberOfSymbols = numberOfSymbols;
446 * fills symbol array and also finds numberOfSymbols
449 * scanner scanning symbol line in file
451 public void fillSymbols(Scanner parser)
454 while (parser.hasNext())
456 String strSymbol = parser.next();
457 char[] symbol = strSymbol.toCharArray();
458 symbols.add(symbol[0]);
459 symbolIndexLookup.put(symbol[0], i);
462 numberOfSymbols = symbols.size();
471 public void addFileProperty(String key, String value)
473 fileProperties.put(key, value);
476 public boolean referenceAnnotationIsActive()
479 status = fileProperties.get(REFERENCE_ANNOTATION);
496 public boolean maskValueIsActive()
499 status = fileProperties.get(MASKED_VALUE);
516 public boolean consensusResidueIsActive()
519 status = fileProperties.get(CONSENSUS_RESIDUE);
536 public boolean consensusStructureIsActive()
539 status = fileProperties.get(CONSENSUS_STRUCTURE);
556 public boolean mapIsActive()
559 status = fileProperties.get(MAP);
576 public void setAlignmentColumn(int nodeIndex, int column)
578 nodes.get(nodeIndex).setAlignmentColumn(column);
581 public void setReferenceAnnotation(int nodeIndex, char value)
583 nodes.get(nodeIndex).setReferenceAnnotation(value);
586 public void setConsensusResidue(int nodeIndex, char value)
588 nodes.get(nodeIndex).setConsensusResidue(value);
591 public void setConsensusStructure(int nodeIndex, char value)
593 nodes.get(nodeIndex).setConsensusStructure(value);
596 public void setMaskValue(int nodeIndex, char value)
598 nodes.get(nodeIndex).setMaskValue(value);
601 public String getGatheringThreshold()
604 value = fileProperties.get("GA");
608 public String getNoiseCutoff()
611 value = fileProperties.get("NC");
615 public String getTrustedCutoff()
618 value = fileProperties.get("TC");
622 public String getViterbi()
625 value = fileProperties.get(VITERBI);
629 public String getMSV()
632 value = fileProperties.get(MSV);
636 public String getForward()
639 value = fileProperties.get(FORWARD);
643 public void setMAPStatus(boolean status)
647 fileProperties.put(MAP, YES);
651 fileProperties.put(MAP, NO);
655 public void setReferenceAnnotationStatus(boolean status)
659 fileProperties.put(REFERENCE_ANNOTATION, YES);
663 fileProperties.put(REFERENCE_ANNOTATION, NO);
667 public void setMaskedValueStatus(boolean status)
671 fileProperties.put(MASKED_VALUE, YES);
675 fileProperties.put(MASKED_VALUE, NO);
679 public void setConsensusResidueStatus(boolean status)
683 fileProperties.put(CONSENSUS_RESIDUE, YES);
687 fileProperties.put(CONSENSUS_RESIDUE, NO);
691 public void setConsensusStructureStatus(boolean status)
695 fileProperties.put(CONSENSUS_STRUCTURE, YES);
699 fileProperties.put(CONSENSUS_STRUCTURE, NO);
706 * type of transition occurring
707 * @return index value representing position along stateTransition array.
709 public Integer getTransitionType(String transition)
742 * find the index of the node in a hidden Markov model based on the column in
745 * @param alignmentColumn
748 public Integer findNodeIndex(int alignmentColumn)
751 index = nodeLookup.get(alignmentColumn);
755 public static String findStringFromBoolean(boolean value)
768 * creates the HMM annotation
772 public AlignmentAnnotation createAnnotation(int length)
774 Annotation[] annotations = new Annotation[length];
776 for (int alignPos = 0; alignPos < length; alignPos++)
778 Float content = getInformationContent(alignPos);
785 cons = getConsensusAtAlignColumn(alignPos);
786 cons = Character.toUpperCase(cons);
788 String description = String.format("%.3f", content);
789 description += " bits";
790 annotations[alignPos] = new Annotation(cons.toString(), description,
795 AlignmentAnnotation annotation = new AlignmentAnnotation(
796 "Information Content",
797 "The information content of each column, measured in bits",
799 0f, max, AlignmentAnnotation.BAR_GRAPH);
803 public float getInformationContent(int column)
805 float informationContent = 0f;
807 for (char symbol : symbols)
810 if (symbols.size() == 20)
812 freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
814 if (symbols.size() == 4)
816 freq = ResidueProperties.nucleotideBackgroundFrequencies
819 Double hmmProb = getMatchEmissionProbability(column, symbol);
820 float prob = hmmProb.floatValue();
821 informationContent += prob * (Math.log(prob / freq) / Math.log(2));
825 return informationContent;