1 package jalview.datamodel;
3 import java.util.ArrayList;
4 import java.util.HashMap;
7 import java.util.Scanner;
10 * Data structure which stores a hidden Markov model. It currently holds the file properties as well; it is not yet decided whether these should be transferred to the HMMFile class
15 public class HiddenMarkovModel
17 // Stores file properties. Do not directly access this field as it contains
18 // only string values - use the getter methods. For example, to find the length
19 // of the HMM, use getModelLength() to return an int value
20 Map<String, String> fileProperties = new HashMap<>();
22 //contains all of the symbols used in this model. The index of each symbol represents its lookup value
23 List<Character> symbols = new ArrayList<>();
25 // contains information for each node in the model. The begin node is at index
26 // 0. Node 0 contains average emission probabilities for each symbol
27 List<HMMNode> nodes = new ArrayList<>();
29 // contains the HMM node for each alignment column
30 Map<Integer, Integer> nodeLookup = new HashMap<>();
32 //contains the symbol index for each symbol
33 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
36 final static String YES = "yes";
38 final static String NO = "no";
42 //keys for file properties hashmap
43 private final String NAME = "NAME";
45 private final String ACCESSION_NUMBER = "ACC";
47 private final String DESCRIPTION = "DESC";
49 private final String LENGTH = "LENG";
51 private final String MAX_LENGTH = "MAXL";
53 private final String ALPHABET = "ALPH";
55 private final String DATE = "DATE";
57 private final String COMMAND_LOG = "COM";
59 private final String NUMBER_OF_SEQUENCES = "NSEQ";
61 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
63 private final String CHECK_SUM = "CKSUM";
65 private final String GATHERING_THRESHOLDS = "GA";
67 private final String TRUSTED_CUTOFFS = "TC";
69 private final String NOISE_CUTOFFS = "NC";
71 private final String STATISTICS = "STATS";
73 private final String COMPO = "COMPO";
75 private final String GATHERING_THRESHOLD = "GA";
77 private final String TRUSTED_CUTOFF = "TC";
79 private final String NOISE_CUTOFF = "NC";
81 private final String VITERBI = "VITERBI";
83 private final String MSV = "MSV";
85 private final String FORWARD = "FORWARD";
87 private final String MAP = "MAP";
89 private final String REFERENCE_ANNOTATION = "RF";
91 private final String CONSENSUS_RESIDUE = "CONS";
93 private final String CONSENSUS_STRUCTURE = "CS";
95 private final String MASKED_VALUE = "MM";
97 final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i",
98 "m->d", "i->m", "i->i", "d->m", "d->d" };
100 public String getTransitionType(int index)
102 return TRANSITION_TYPES[index];
105 public Map<Integer, Integer> getNodeLookup()
110 public void setNodeLookup(Map<Integer, Integer> nodeLookup)
112 this.nodeLookup = nodeLookup;
115 public String[] getTransitionTypes()
117 return TRANSITION_TYPES;
120 public List<Character> getSymbols()
125 public Map<String, String> getFileProperties()
127 return fileProperties;
130 public HMMNode getNode(int nodeIndex)
132 return getNodes().get(nodeIndex);
135 public void setSymbols(List<Character> symbolsL)
137 this.symbols = symbolsL;
140 public String getName()
142 return fileProperties.get(NAME);
144 public String getAccessionNumber()
146 return fileProperties.get(ACCESSION_NUMBER);
149 public void setAccessionNumber(String value)
151 fileProperties.put(ACCESSION_NUMBER, value);
154 public String getDescription()
156 return fileProperties.get(DESCRIPTION);
159 public void setDescription(String value)
161 fileProperties.put(DESCRIPTION, value);
164 public Integer getLength()
166 if (fileProperties.get(LENGTH) == null)
170 return Integer.parseInt(fileProperties.get(LENGTH));
173 public void setLength(int value)
175 fileProperties.put(LENGTH, String.valueOf(value));
178 public Integer getMaxInstanceLength()
180 if (fileProperties.get(MAX_LENGTH) == null)
184 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
187 public void setMaxInstanceLength(int value)
189 fileProperties.put(MAX_LENGTH, String.valueOf(value));
192 // gets type of symbol alphabet - "amino", "DNA", "RNA"
193 public String getAlphabetType()
195 return fileProperties.get(ALPHABET);
198 public void setAlphabetType(String value)
200 fileProperties.put(ALPHABET, value);
203 // not sure whether to implement this with Date object
204 public String getDate()
206 return fileProperties.get(DATE);
209 public void setDate(String value)
211 fileProperties.put(DATE, value);
214 // not sure whether to implement this
215 public String getCommandLineLog()
217 return fileProperties.get(COMMAND_LOG);
220 public void setCommandLineLog(String value)
222 fileProperties.put(COMMAND_LOG, value);
225 // gets the number of sequences that the HMM was trained on
226 public Integer getNumberOfSequences()
228 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
232 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
235 public void setNumberOfSequences(int value)
237 fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value));
240 // gets the effective number determined during sequence weighting
241 public Double getEffectiveNumberOfSequences()
243 if (fileProperties.get(LENGTH) == null)
247 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
250 public void setEffectiveNumberOfSequences(double value)
252 fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value));
255 public Long getCheckSum()
257 if (fileProperties.get(LENGTH) == null)
261 return Long.parseLong(fileProperties.get(CHECK_SUM));
264 public void setCheckSum(long value)
266 fileProperties.put(CHECK_SUM, String.valueOf(value));
269 public List<HMMNode> getNodes()
274 public void setNodes(List<HMMNode> nodes)
280 * get match emission probability for a given symbol at a column in the
288 public Double getMatchEmissionProbability(int alignColumn, char symbol)
297 symbolIndex = symbolIndexLookup.get(symbol);
298 if (nodeLookup.containsKey(alignColumn + 1))
300 nodeIndex = nodeLookup.get(alignColumn + 1);
301 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
302 probability = Math.pow(Math.E, -probability);
313 * get insert emission probability for a given symbol at a column in the
320 public Double getInsertEmissionProbability(int alignColumn, char symbol)
329 symbolIndex = symbolIndexLookup.get(symbol);
330 if (nodeLookup.containsKey(alignColumn + 1))
332 nodeIndex = nodeLookup.get(alignColumn + 1);
333 probability = getNode(nodeIndex).getInsertEmissions()
335 probability = Math.pow(Math.E, -probability);
346 * get state transition probability for a given transition type at a column in
353 public Double getStateTransitionProbability(int alignColumn,
359 transitionIndex = getTransitionType(transition);
360 if (nodeLookup.containsKey(alignColumn + 1))
362 nodeIndex = nodeLookup.get(alignColumn + 1);
363 probability = getNode(nodeIndex).getStateTransitions()
364 .get(transitionIndex);
365 probability = Math.pow(Math.E, -probability);
375 public Integer getNodeAlignmentColumn(int nodeIndex)
377 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
381 public char getConsensusResidue(int nodeIndex)
383 char value = nodes.get(nodeIndex).getConsensusResidue();
387 public char getReferenceAnnotation(int nodeIndex)
389 char value = nodes.get(nodeIndex).getReferenceAnnotation();
393 public char getMaskedValue(int nodeIndex)
395 char value = nodes.get(nodeIndex).getMaskValue();
399 public char getConsensusStructure(int nodeIndex)
401 char value = nodes.get(nodeIndex).getConsensusStructure();
406 * returns the average match emission for a given symbol
410 * average negative log propbability of a match emission of the given symbol
412 public double getAverageMatchEmission(int symbolIndex)
414 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
418 public int getNumberOfSymbols()
420 return numberOfSymbols;
423 public void setNumberOfSymbols(int numberOfSymbols)
425 this.numberOfSymbols = numberOfSymbols;
431 * fills symbol array and also finds numberOfSymbols
434 * scanner scanning symbol line in file
436 public void fillSymbols(Scanner parser)
439 while (parser.hasNext())
441 String strSymbol = parser.next();
442 char[] symbol = strSymbol.toCharArray();
443 symbols.add(symbol[0]);
444 symbolIndexLookup.put(symbol[0], i);
447 numberOfSymbols = symbols.size();
456 public void addFileProperty(String key, String value)
458 fileProperties.put(key, value);
461 public boolean referenceAnnotationIsActive()
464 status = fileProperties.get(REFERENCE_ANNOTATION);
481 public boolean maskValueIsActive()
484 status = fileProperties.get(MASKED_VALUE);
501 public boolean consensusResidueIsActive()
504 status = fileProperties.get(CONSENSUS_RESIDUE);
521 public boolean consensusStructureIsActive()
524 status = fileProperties.get(CONSENSUS_STRUCTURE);
541 public boolean mapIsActive()
544 status = fileProperties.get(MAP);
561 public void setAlignmentColumn(int nodeIndex, int column)
563 nodes.get(nodeIndex).setAlignmentColumn(column);
566 public void setReferenceAnnotation(int nodeIndex, char value)
568 nodes.get(nodeIndex).setReferenceAnnotation(value);
571 public void setConsensusResidue(int nodeIndex, char value)
573 nodes.get(nodeIndex).setConsensusResidue(value);
576 public void setConsensusStructure(int nodeIndex, char value)
578 nodes.get(nodeIndex).setConsensusStructure(value);
581 public void setMaskValue(int nodeIndex, char value)
583 nodes.get(nodeIndex).setMaskValue(value);
586 public String getGatheringThreshold()
589 value = fileProperties.get("GA");
593 public String getNoiseCutoff()
596 value = fileProperties.get("NC");
600 public String getTrustedCutoff()
603 value = fileProperties.get("TC");
607 public String getViterbi()
610 value = fileProperties.get(VITERBI);
614 public String getMSV()
617 value = fileProperties.get(MSV);
621 public String getForward()
624 value = fileProperties.get(FORWARD);
628 public void setMAPStatus(boolean status)
632 fileProperties.put(MAP, YES);
636 fileProperties.put(MAP, NO);
640 public void setReferenceAnnotationStatus(boolean status)
644 fileProperties.put(REFERENCE_ANNOTATION, YES);
648 fileProperties.put(REFERENCE_ANNOTATION, NO);
652 public void setMaskedValueStatus(boolean status)
656 fileProperties.put(MASKED_VALUE, YES);
660 fileProperties.put(MASKED_VALUE, NO);
664 public void setConsensusResidueStatus(boolean status)
668 fileProperties.put(CONSENSUS_RESIDUE, YES);
672 fileProperties.put(CONSENSUS_RESIDUE, NO);
676 public void setConsensusStructureStatus(boolean status)
680 fileProperties.put(CONSENSUS_STRUCTURE, YES);
684 fileProperties.put(CONSENSUS_STRUCTURE, NO);
691 * type of transition occuring
692 * @return index value representing position along stateTransition array.
694 public Integer getTransitionType(String transition)
727 * find the index of the node in a hidden Markov model based on the column in
730 * @param alignmentColumn
733 public Integer findNodeIndex(int alignmentColumn)
736 index = nodeLookup.get(alignmentColumn);
740 public static String findStringFromBoolean(boolean value)