1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
5 import java.util.ArrayList;
6 import java.util.HashMap;
9 import java.util.Scanner;
 * Data structure which stores a hidden Markov model. It currently also holds the file properties; whether these should be moved to the HMMFile class is still undecided.
17 public class HiddenMarkovModel
// Stores file properties. Do not access this field directly, as it holds
// only String values - use the getter methods. For example, to find the length
// of the HMM, use getLength() to obtain an Integer value.
22 Map<String, String> fileProperties = new HashMap<>();
24 //contains all of the symbols used in this model. The index of each symbol represents its lookup value
25 List<Character> symbols = new ArrayList<>();
27 // contains information for each node in the model. The begin node is at index
28 // 0. Node 0 contains average emission probabilities for each symbol
29 List<HMMNode> nodes = new ArrayList<>();
31 // contains the HMM node for each alignment column
32 Map<Integer, Integer> nodeLookup = new HashMap<>();
34 //contains the symbol index for each symbol
35 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
37 Map<Character, Double> backgroundFrequencies = new HashMap();
40 final static String YES = "yes";
42 final static String NO = "no";
46 //keys for file properties hashmap
47 private final String NAME = "NAME";
49 private final String ACCESSION_NUMBER = "ACC";
51 private final String DESCRIPTION = "DESC";
53 private final String LENGTH = "LENG";
55 private final String MAX_LENGTH = "MAXL";
57 private final String ALPHABET = "ALPH";
59 private final String DATE = "DATE";
61 private final String COMMAND_LOG = "COM";
63 private final String NUMBER_OF_SEQUENCES = "NSEQ";
65 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
67 private final String CHECK_SUM = "CKSUM";
69 private final String GATHERING_THRESHOLDS = "GA";
71 private final String TRUSTED_CUTOFFS = "TC";
73 private final String NOISE_CUTOFFS = "NC";
75 private final String STATISTICS = "STATS";
77 private final String COMPO = "COMPO";
79 private final String GATHERING_THRESHOLD = "GA";
81 private final String TRUSTED_CUTOFF = "TC";
83 private final String NOISE_CUTOFF = "NC";
85 private final String VITERBI = "VITERBI";
87 private final String MSV = "MSV";
89 private final String FORWARD = "FORWARD";
91 private final String MAP = "MAP";
93 private final String REFERENCE_ANNOTATION = "RF";
95 private final String CONSENSUS_RESIDUE = "CONS";
97 private final String CONSENSUS_STRUCTURE = "CS";
99 private final String MASKED_VALUE = "MM";
101 final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i",
102 "m->d", "i->m", "i->i", "d->m", "d->d" };
104 public String getTransitionType(int index)
106 return TRANSITION_TYPES[index];
109 public Map<Integer, Integer> getNodeLookup()
114 public void setNodeLookup(Map<Integer, Integer> nodeLookup)
116 this.nodeLookup = nodeLookup;
119 public String[] getTransitionTypes()
121 return TRANSITION_TYPES;
124 public List<Character> getSymbols()
129 public Map<String, String> getFileProperties()
131 return fileProperties;
134 public HMMNode getNode(int nodeIndex)
136 return getNodes().get(nodeIndex);
139 public void setSymbols(List<Character> symbolsL)
141 this.symbols = symbolsL;
144 public String getName()
146 return fileProperties.get(NAME);
148 public String getAccessionNumber()
150 return fileProperties.get(ACCESSION_NUMBER);
153 public void setAccessionNumber(String value)
155 fileProperties.put(ACCESSION_NUMBER, value);
158 public String getDescription()
160 return fileProperties.get(DESCRIPTION);
163 public void setDescription(String value)
165 fileProperties.put(DESCRIPTION, value);
168 public Integer getLength()
170 if (fileProperties.get(LENGTH) == null)
174 return Integer.parseInt(fileProperties.get(LENGTH));
177 public void setLength(int value)
179 fileProperties.put(LENGTH, String.valueOf(value));
182 public Integer getMaxInstanceLength()
184 if (fileProperties.get(MAX_LENGTH) == null)
188 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
191 public void setMaxInstanceLength(int value)
193 fileProperties.put(MAX_LENGTH, String.valueOf(value));
196 // gets type of symbol alphabet - "amino", "DNA", "RNA"
197 public String getAlphabetType()
199 return fileProperties.get(ALPHABET);
202 public void setAlphabetType(String value)
204 fileProperties.put(ALPHABET, value);
207 // not sure whether to implement this with Date object
208 public String getDate()
210 return fileProperties.get(DATE);
213 public void setDate(String value)
215 fileProperties.put(DATE, value);
218 // not sure whether to implement this
219 public String getCommandLineLog()
221 return fileProperties.get(COMMAND_LOG);
224 public void setCommandLineLog(String value)
226 fileProperties.put(COMMAND_LOG, value);
229 // gets the number of sequences that the HMM was trained on
230 public Integer getNumberOfSequences()
232 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
236 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
239 public void setNumberOfSequences(int value)
241 fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value));
244 // gets the effective number determined during sequence weighting
245 public Double getEffectiveNumberOfSequences()
247 if (fileProperties.get(LENGTH) == null)
251 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
254 public void setEffectiveNumberOfSequences(double value)
256 fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value));
259 public Long getCheckSum()
261 if (fileProperties.get(LENGTH) == null)
265 return Long.parseLong(fileProperties.get(CHECK_SUM));
268 public void setCheckSum(long value)
270 fileProperties.put(CHECK_SUM, String.valueOf(value));
273 public List<HMMNode> getNodes()
278 public void setNodes(List<HMMNode> nodes)
284 * get match emission probability for a given symbol at a column in the
292 public Double getMatchEmissionProbability(int alignColumn, char symbol)
297 if (!symbolIndexLookup.containsKey(symbol))
301 symbolIndex = symbolIndexLookup.get(symbol);
302 if (nodeLookup.containsKey(alignColumn + 1))
304 nodeIndex = nodeLookup.get(alignColumn + 1);
305 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
316 * get insert emission probability for a given symbol at a column in the
323 public Double getInsertEmissionProbability(int alignColumn, char symbol)
328 if (!symbolIndexLookup.containsKey(symbol))
332 symbolIndex = symbolIndexLookup.get(symbol);
333 if (nodeLookup.containsKey(alignColumn + 1))
335 nodeIndex = nodeLookup.get(alignColumn + 1);
336 probability = getNode(nodeIndex).getInsertEmissions()
348 * get state transition probability for a given transition type at a column in
355 public Double getStateTransitionProbability(int alignColumn,
361 transitionIndex = getTransitionType(transition);
362 if (nodeLookup.containsKey(alignColumn + 1))
364 nodeIndex = nodeLookup.get(alignColumn + 1);
365 probability = getNode(nodeIndex).getStateTransitions()
366 .get(transitionIndex);
376 public Integer getNodeAlignmentColumn(int nodeIndex)
378 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
382 public char getConsensusResidue(int nodeIndex)
384 char value = nodes.get(nodeIndex).getConsensusResidue();
388 public char getConsensus(int columnIndex)
391 Integer index = findNodeIndex(columnIndex + 1);
396 value = getNodes().get(index).getConsensusResidue();
400 public char getReferenceAnnotation(int nodeIndex)
402 char value = nodes.get(nodeIndex).getReferenceAnnotation();
406 public char getMaskedValue(int nodeIndex)
408 char value = nodes.get(nodeIndex).getMaskValue();
412 public char getConsensusStructure(int nodeIndex)
414 char value = nodes.get(nodeIndex).getConsensusStructure();
419 * returns the average match emission for a given symbol
 * average negative log probability of a match emission of the given symbol
425 public double getAverageMatchEmission(int symbolIndex)
427 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
431 public int getNumberOfSymbols()
433 return numberOfSymbols;
436 public void setNumberOfSymbols(int numberOfSymbols)
438 this.numberOfSymbols = numberOfSymbols;
444 * fills symbol array and also finds numberOfSymbols
447 * scanner scanning symbol line in file
449 public void fillSymbols(Scanner parser)
452 while (parser.hasNext())
454 String strSymbol = parser.next();
455 char[] symbol = strSymbol.toCharArray();
456 symbols.add(symbol[0]);
457 symbolIndexLookup.put(symbol[0], i);
460 numberOfSymbols = symbols.size();
469 public void addFileProperty(String key, String value)
471 fileProperties.put(key, value);
474 public boolean referenceAnnotationIsActive()
477 status = fileProperties.get(REFERENCE_ANNOTATION);
494 public boolean maskValueIsActive()
497 status = fileProperties.get(MASKED_VALUE);
514 public boolean consensusResidueIsActive()
517 status = fileProperties.get(CONSENSUS_RESIDUE);
534 public boolean consensusStructureIsActive()
537 status = fileProperties.get(CONSENSUS_STRUCTURE);
554 public boolean mapIsActive()
557 status = fileProperties.get(MAP);
574 public void setAlignmentColumn(int nodeIndex, int column)
576 nodes.get(nodeIndex).setAlignmentColumn(column);
579 public void setReferenceAnnotation(int nodeIndex, char value)
581 nodes.get(nodeIndex).setReferenceAnnotation(value);
584 public void setConsensusResidue(int nodeIndex, char value)
586 nodes.get(nodeIndex).setConsensusResidue(value);
589 public void setConsensusStructure(int nodeIndex, char value)
591 nodes.get(nodeIndex).setConsensusStructure(value);
594 public void setMaskValue(int nodeIndex, char value)
596 nodes.get(nodeIndex).setMaskValue(value);
599 public String getGatheringThreshold()
602 value = fileProperties.get("GA");
606 public String getNoiseCutoff()
609 value = fileProperties.get("NC");
613 public String getTrustedCutoff()
616 value = fileProperties.get("TC");
620 public String getViterbi()
623 value = fileProperties.get(VITERBI);
627 public String getMSV()
630 value = fileProperties.get(MSV);
634 public String getForward()
637 value = fileProperties.get(FORWARD);
641 public void setMAPStatus(boolean status)
645 fileProperties.put(MAP, YES);
649 fileProperties.put(MAP, NO);
653 public void setReferenceAnnotationStatus(boolean status)
657 fileProperties.put(REFERENCE_ANNOTATION, YES);
661 fileProperties.put(REFERENCE_ANNOTATION, NO);
665 public void setMaskedValueStatus(boolean status)
669 fileProperties.put(MASKED_VALUE, YES);
673 fileProperties.put(MASKED_VALUE, NO);
677 public void setConsensusResidueStatus(boolean status)
681 fileProperties.put(CONSENSUS_RESIDUE, YES);
685 fileProperties.put(CONSENSUS_RESIDUE, NO);
689 public void setConsensusStructureStatus(boolean status)
693 fileProperties.put(CONSENSUS_STRUCTURE, YES);
697 fileProperties.put(CONSENSUS_STRUCTURE, NO);
 * type of transition occurring
705 * @return index value representing position along stateTransition array.
707 public Integer getTransitionType(String transition)
740 * find the index of the node in a hidden Markov model based on the column in
743 * @param alignmentColumn
746 public Integer findNodeIndex(int alignmentColumn)
749 index = nodeLookup.get(alignmentColumn);
753 public static String findStringFromBoolean(boolean value)
766 * creates the HMM annotation
770 public AlignmentAnnotation createAnnotation(int length)
772 Annotation[] annotations = new Annotation[length];
774 for (int i = 0; i < length; i++)
776 Float content = getInformationContent(i);
781 String description = String.format("%.3f", content);
782 description += " bits";
783 annotations[i] = new Annotation(null, description, ' ', content);
786 AlignmentAnnotation annotation = new AlignmentAnnotation(
787 "Information Content",
788 "The information content of each column, measured in bits",
790 0f, max, AlignmentAnnotation.BAR_GRAPH);
794 public float getInformationContent(int column)
796 float informationContent = 0f;
798 for (char symbol : symbols)
801 if (symbols.size() == 20)
803 freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
805 if (symbols.size() == 4)
807 freq = ResidueProperties.nucleotideBackgroundFrequencies
810 Double hmmProb = getMatchEmissionProbability(column, symbol);
811 float prob = hmmProb.floatValue();
812 informationContent += prob * (Math.log(prob / freq) / Math.log(2));
816 return informationContent;