1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
5 import java.util.ArrayList;
6 import java.util.HashMap;
9 import java.util.Scanner;
* Data structure which stores a hidden Markov model. It currently also holds the properties read from the HMM file; whether these should be moved to the HMMFile class is still undecided.
17 public class HiddenMarkovModel
// Stores file properties. Do not access this field directly, as it holds
// only string values - use the getter methods. For example, to find the
// length of the HMM, use getLength() to obtain an Integer value
24 Map<String, String> fileProperties = new HashMap<>();
// contains all of the symbols used in this model. The index of each symbol represents its lookup value
27 List<Character> symbols = new ArrayList<>();
29 // contains information for each node in the model. The begin node is at index
30 // 0. Node 0 contains average emission probabilities for each symbol
31 List<HMMNode> nodes = new ArrayList<>();
33 // contains the HMM node for each alignment column
34 Map<Integer, Integer> nodeLookup = new HashMap<>();
36 //contains the symbol index for each symbol
37 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
39 Map<Character, Double> backgroundFrequencies = new HashMap();
44 final static String YES = "yes";
46 final static String NO = "no";
50 //keys for file properties hashmap
51 private final String NAME = "NAME";
53 private final String ACCESSION_NUMBER = "ACC";
55 private final String DESCRIPTION = "DESC";
57 private final String LENGTH = "LENG";
59 private final String MAX_LENGTH = "MAXL";
61 private final String ALPHABET = "ALPH";
63 private final String DATE = "DATE";
65 private final String COMMAND_LOG = "COM";
67 private final String NUMBER_OF_SEQUENCES = "NSEQ";
69 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
71 private final String CHECK_SUM = "CKSUM";
73 private final String GATHERING_THRESHOLDS = "GA";
75 private final String TRUSTED_CUTOFFS = "TC";
77 private final String NOISE_CUTOFFS = "NC";
79 private final String STATISTICS = "STATS";
81 private final String COMPO = "COMPO";
83 private final String GATHERING_THRESHOLD = "GA";
85 private final String TRUSTED_CUTOFF = "TC";
87 private final String NOISE_CUTOFF = "NC";
89 private final String VITERBI = "VITERBI";
91 private final String MSV = "MSV";
93 private final String FORWARD = "FORWARD";
95 private final String MAP = "MAP";
97 private final String REFERENCE_ANNOTATION = "RF";
99 private final String CONSENSUS_RESIDUE = "CONS";
101 private final String CONSENSUS_STRUCTURE = "CS";
103 private final String MASKED_VALUE = "MM";
105 final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i",
106 "m->d", "i->m", "i->i", "d->m", "d->d" };
108 public String getTransitionType(int index)
110 return TRANSITION_TYPES[index];
113 public Map<Integer, Integer> getNodeLookup()
118 public void setNodeLookup(Map<Integer, Integer> nodeLookup)
120 this.nodeLookup = nodeLookup;
123 public String[] getTransitionTypes()
125 return TRANSITION_TYPES;
128 public List<Character> getSymbols()
133 public Map<String, String> getFileProperties()
135 return fileProperties;
138 public HMMNode getNode(int nodeIndex)
140 return getNodes().get(nodeIndex);
143 public void setSymbols(List<Character> symbolsL)
145 this.symbols = symbolsL;
148 public String getName()
150 return fileProperties.get(NAME);
152 public String getAccessionNumber()
154 return fileProperties.get(ACCESSION_NUMBER);
157 public void setAccessionNumber(String value)
159 fileProperties.put(ACCESSION_NUMBER, value);
162 public String getDescription()
164 return fileProperties.get(DESCRIPTION);
167 public void setDescription(String value)
169 fileProperties.put(DESCRIPTION, value);
172 public Integer getLength()
174 if (fileProperties.get(LENGTH) == null)
178 return Integer.parseInt(fileProperties.get(LENGTH));
181 public void setLength(int value)
183 fileProperties.put(LENGTH, String.valueOf(value));
186 public Integer getMaxInstanceLength()
188 if (fileProperties.get(MAX_LENGTH) == null)
192 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
195 public void setMaxInstanceLength(int value)
197 fileProperties.put(MAX_LENGTH, String.valueOf(value));
200 // gets type of symbol alphabet - "amino", "DNA", "RNA"
201 public String getAlphabetType()
203 return fileProperties.get(ALPHABET);
206 public void setAlphabetType(String value)
208 fileProperties.put(ALPHABET, value);
211 // not sure whether to implement this with Date object
212 public String getDate()
214 return fileProperties.get(DATE);
217 public void setDate(String value)
219 fileProperties.put(DATE, value);
222 // not sure whether to implement this
223 public String getCommandLineLog()
225 return fileProperties.get(COMMAND_LOG);
228 public void setCommandLineLog(String value)
230 fileProperties.put(COMMAND_LOG, value);
233 // gets the number of sequences that the HMM was trained on
234 public Integer getNumberOfSequences()
236 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
240 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
243 public void setNumberOfSequences(int value)
245 fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value));
248 // gets the effective number determined during sequence weighting
249 public Double getEffectiveNumberOfSequences()
251 if (fileProperties.get(LENGTH) == null)
255 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
258 public void setEffectiveNumberOfSequences(double value)
260 fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value));
263 public Long getCheckSum()
265 if (fileProperties.get(LENGTH) == null)
269 return Long.parseLong(fileProperties.get(CHECK_SUM));
272 public void setCheckSum(long value)
274 fileProperties.put(CHECK_SUM, String.valueOf(value));
277 public List<HMMNode> getNodes()
282 public void setNodes(List<HMMNode> nodes)
288 * get match emission probability for a given symbol at a column in the
296 public Double getMatchEmissionProbability(int alignColumn, char symbol)
301 if (!symbolIndexLookup.containsKey(symbol))
305 symbolIndex = symbolIndexLookup.get(symbol);
306 if (nodeLookup.containsKey(alignColumn + 1))
308 nodeIndex = nodeLookup.get(alignColumn + 1);
309 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
320 * get insert emission probability for a given symbol at a column in the
327 public Double getInsertEmissionProbability(int alignColumn, char symbol)
332 if (!symbolIndexLookup.containsKey(symbol))
336 symbolIndex = symbolIndexLookup.get(symbol);
337 if (nodeLookup.containsKey(alignColumn + 1))
339 nodeIndex = nodeLookup.get(alignColumn + 1);
340 probability = getNode(nodeIndex).getInsertEmissions()
352 * get state transition probability for a given transition type at a column in
359 public Double getStateTransitionProbability(int alignColumn,
365 transitionIndex = getTransitionType(transition);
366 if (nodeLookup.containsKey(alignColumn + 1))
368 nodeIndex = nodeLookup.get(alignColumn + 1);
369 probability = getNode(nodeIndex).getStateTransitions()
370 .get(transitionIndex);
380 public Integer getNodeAlignmentColumn(int nodeIndex)
382 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
386 public char getConsensusResidue(int nodeIndex)
388 char value = nodes.get(nodeIndex).getConsensusResidue();
392 public char getConsensusAtAlignColumn(int columnIndex)
395 Integer index = findNodeIndex(columnIndex + 1);
400 value = getNodes().get(index).getConsensusResidue();
404 public char getReferenceAnnotation(int nodeIndex)
406 char value = nodes.get(nodeIndex).getReferenceAnnotation();
410 public char getMaskedValue(int nodeIndex)
412 char value = nodes.get(nodeIndex).getMaskValue();
416 public char getConsensusStructure(int nodeIndex)
418 char value = nodes.get(nodeIndex).getConsensusStructure();
423 * returns the average match emission for a given symbol
427 * average negative log propbability of a match emission of the given symbol
429 public double getAverageMatchEmission(int symbolIndex)
431 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
435 public int getNumberOfSymbols()
437 return numberOfSymbols;
440 public void setNumberOfSymbols(int numberOfSymbols)
442 this.numberOfSymbols = numberOfSymbols;
448 * fills symbol array and also finds numberOfSymbols
451 * scanner scanning symbol line in file
453 public void fillSymbols(Scanner parser)
456 while (parser.hasNext())
458 String strSymbol = parser.next();
459 char[] symbol = strSymbol.toCharArray();
460 symbols.add(symbol[0]);
461 symbolIndexLookup.put(symbol[0], i);
464 numberOfSymbols = symbols.size();
473 public void addFileProperty(String key, String value)
475 fileProperties.put(key, value);
478 public boolean referenceAnnotationIsActive()
481 status = fileProperties.get(REFERENCE_ANNOTATION);
498 public boolean maskValueIsActive()
501 status = fileProperties.get(MASKED_VALUE);
518 public boolean consensusResidueIsActive()
521 status = fileProperties.get(CONSENSUS_RESIDUE);
538 public boolean consensusStructureIsActive()
541 status = fileProperties.get(CONSENSUS_STRUCTURE);
558 public boolean mapIsActive()
561 status = fileProperties.get(MAP);
578 public void setAlignmentColumn(int nodeIndex, int column)
580 nodes.get(nodeIndex).setAlignmentColumn(column);
583 public void setReferenceAnnotation(int nodeIndex, char value)
585 nodes.get(nodeIndex).setReferenceAnnotation(value);
588 public void setConsensusResidue(int nodeIndex, char value)
590 nodes.get(nodeIndex).setConsensusResidue(value);
593 public void setConsensusStructure(int nodeIndex, char value)
595 nodes.get(nodeIndex).setConsensusStructure(value);
598 public void setMaskValue(int nodeIndex, char value)
600 nodes.get(nodeIndex).setMaskValue(value);
603 public String getGatheringThreshold()
606 value = fileProperties.get("GA");
610 public String getNoiseCutoff()
613 value = fileProperties.get("NC");
617 public String getTrustedCutoff()
620 value = fileProperties.get("TC");
624 public String getViterbi()
627 value = fileProperties.get(VITERBI);
631 public String getMSV()
634 value = fileProperties.get(MSV);
638 public String getForward()
641 value = fileProperties.get(FORWARD);
645 public void setMAPStatus(boolean status)
649 fileProperties.put(MAP, YES);
653 fileProperties.put(MAP, NO);
657 public void setReferenceAnnotationStatus(boolean status)
661 fileProperties.put(REFERENCE_ANNOTATION, YES);
665 fileProperties.put(REFERENCE_ANNOTATION, NO);
669 public void setMaskedValueStatus(boolean status)
673 fileProperties.put(MASKED_VALUE, YES);
677 fileProperties.put(MASKED_VALUE, NO);
681 public void setConsensusResidueStatus(boolean status)
685 fileProperties.put(CONSENSUS_RESIDUE, YES);
689 fileProperties.put(CONSENSUS_RESIDUE, NO);
693 public void setConsensusStructureStatus(boolean status)
697 fileProperties.put(CONSENSUS_STRUCTURE, YES);
701 fileProperties.put(CONSENSUS_STRUCTURE, NO);
* type of transition occurring
709 * @return index value representing position along stateTransition array.
711 public Integer getTransitionType(String transition)
744 * find the index of the node in a hidden Markov model based on the column in
747 * @param alignmentColumn
750 public Integer findNodeIndex(int alignmentColumn)
753 index = nodeLookup.get(alignmentColumn);
757 public static String findStringFromBoolean(boolean value)
770 * creates the HMM annotation
774 public AlignmentAnnotation createAnnotation(int length)
776 Annotation[] annotations = new Annotation[length];
778 for (int alignPos = 0; alignPos < length; alignPos++)
780 Float content = getInformationContent(alignPos);
787 cons = getConsensusAtAlignColumn(alignPos);
788 cons = Character.toUpperCase(cons);
790 String description = String.format("%.3f", content);
791 description += " bits";
792 annotations[alignPos] = new Annotation(cons.toString(), description,
797 AlignmentAnnotation annotation = new AlignmentAnnotation(
798 "Information Content",
799 "The information content of each column, measured in bits",
801 0f, max, AlignmentAnnotation.BAR_GRAPH);
805 public float getInformationContent(int column)
807 float informationContent = 0f;
809 for (char symbol : symbols)
812 if (symbols.size() == 20)
814 freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
816 if (symbols.size() == 4)
818 freq = ResidueProperties.nucleotideBackgroundFrequencies
821 Double hmmProb = getMatchEmissionProbability(column, symbol);
822 float prob = hmmProb.floatValue();
823 informationContent += prob * (Math.log(prob / freq) / Math.log(2));
827 return informationContent;