1 package jalview.datamodel;
3 import jalview.schemes.ResidueProperties;
5 import java.util.ArrayList;
6 import java.util.HashMap;
9 import java.util.Scanner;
12 * Data structure which stores a hidden Markov model. It currently also holds the HMM file properties; it is undecided whether these should be moved to the HMMFile class.
17 public class HiddenMarkovModel
21 // Stores file properties. Do not access this field directly as it contains
22 // only string values - use the getter methods. For example, to find the length
23 // of the HMM, use getLength() to obtain it as an Integer
24 Map<String, String> fileProperties = new HashMap<>();
26 //contains all of the symbols used in this model. The index of each symbol represents its lookup value
27 List<Character> symbols = new ArrayList<>();
29 // contains information for each node in the model. The begin node is at index
30 // 0. Node 0 contains average emission probabilities for each symbol
31 List<HMMNode> nodes = new ArrayList<>();
33 // contains the HMM node for each alignment column
34 Map<Integer, Integer> nodeLookup = new HashMap<>();
36 //contains the symbol index for each symbol
37 Map<Character, Integer> symbolIndexLookup = new HashMap<>();
39 Map<Character, Double> backgroundFrequencies = new HashMap();
44 final static String YES = "yes";
46 final static String NO = "no";
50 //keys for file properties hashmap
51 private final String NAME = "NAME";
53 private final String ACCESSION_NUMBER = "ACC";
55 private final String DESCRIPTION = "DESC";
57 private final String LENGTH = "LENG";
59 private final String MAX_LENGTH = "MAXL";
61 private final String ALPHABET = "ALPH";
63 private final String DATE = "DATE";
65 private final String COMMAND_LOG = "COM";
67 private final String NUMBER_OF_SEQUENCES = "NSEQ";
69 private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
71 private final String CHECK_SUM = "CKSUM";
73 private final String GATHERING_THRESHOLDS = "GA";
75 private final String TRUSTED_CUTOFFS = "TC";
77 private final String NOISE_CUTOFFS = "NC";
79 private final String STATISTICS = "STATS";
81 private final String COMPO = "COMPO";
83 private final String GATHERING_THRESHOLD = "GA";
85 private final String TRUSTED_CUTOFF = "TC";
87 private final String NOISE_CUTOFF = "NC";
89 private final String VITERBI = "VITERBI";
91 private final String MSV = "MSV";
93 private final String FORWARD = "FORWARD";
95 private final String MAP = "MAP";
97 private final String REFERENCE_ANNOTATION = "RF";
99 private final String CONSENSUS_RESIDUE = "CONS";
101 private final String CONSENSUS_STRUCTURE = "CS";
103 private final String MASKED_VALUE = "MM";
105 public static final int MATCHTOMATCH = 0;
107 public static final int MATCHTOINSERT = 1;
109 public static final int MATCHTODELETE = 2;
111 public static final int INSERTTOMATCH = 3;
113 public static final int INSERTTOINSERT = 4;
115 public static final int DELETETOMATCH = 5;
117 public static final int DELETETODELETE = 6;
119 public Map<Integer, Integer> getNodeLookup()
124 public void setNodeLookup(Map<Integer, Integer> nodeLookup)
126 this.nodeLookup = nodeLookup;
129 public List<Character> getSymbols()
134 public Map<String, String> getFileProperties()
136 return fileProperties;
139 public HMMNode getNode(int nodeIndex)
141 return getNodes().get(nodeIndex);
144 public void setSymbols(List<Character> symbolsL)
146 this.symbols = symbolsL;
149 public String getName()
151 return fileProperties.get(NAME);
153 public String getAccessionNumber()
155 return fileProperties.get(ACCESSION_NUMBER);
158 public void setAccessionNumber(String value)
160 fileProperties.put(ACCESSION_NUMBER, value);
163 public String getDescription()
165 return fileProperties.get(DESCRIPTION);
168 public void setDescription(String value)
170 fileProperties.put(DESCRIPTION, value);
173 public Integer getLength()
175 if (fileProperties.get(LENGTH) == null)
179 return Integer.parseInt(fileProperties.get(LENGTH));
182 public void setLength(int value)
184 fileProperties.put(LENGTH, String.valueOf(value));
187 public Integer getMaxInstanceLength()
189 if (fileProperties.get(MAX_LENGTH) == null)
193 return Integer.parseInt(fileProperties.get(MAX_LENGTH));
196 public void setMaxInstanceLength(int value)
198 fileProperties.put(MAX_LENGTH, String.valueOf(value));
201 // gets type of symbol alphabet - "amino", "DNA", "RNA"
202 public String getAlphabetType()
204 return fileProperties.get(ALPHABET);
207 public void setAlphabetType(String value)
209 fileProperties.put(ALPHABET, value);
212 // not sure whether to implement this with Date object
213 public String getDate()
215 return fileProperties.get(DATE);
218 public void setDate(String value)
220 fileProperties.put(DATE, value);
223 // not sure whether to implement this
224 public String getCommandLineLog()
226 return fileProperties.get(COMMAND_LOG);
229 public void setCommandLineLog(String value)
231 fileProperties.put(COMMAND_LOG, value);
234 // gets the number of sequences that the HMM was trained on
235 public Integer getNumberOfSequences()
237 if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
241 return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
244 public void setNumberOfSequences(int value)
246 fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value));
249 // gets the effective number determined during sequence weighting
250 public Double getEffectiveNumberOfSequences()
252 if (fileProperties.get(LENGTH) == null)
256 return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
259 public void setEffectiveNumberOfSequences(double value)
261 fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value));
264 public Long getCheckSum()
266 if (fileProperties.get(LENGTH) == null)
270 return Long.parseLong(fileProperties.get(CHECK_SUM));
273 public void setCheckSum(long value)
275 fileProperties.put(CHECK_SUM, String.valueOf(value));
278 public List<HMMNode> getNodes()
283 public void setNodes(List<HMMNode> nodes)
289 * get match emission probability for a given symbol at a column in the
297 public Double getMatchEmissionProbability(int alignColumn, char symbol)
302 if (!symbolIndexLookup.containsKey(symbol))
306 symbolIndex = symbolIndexLookup.get(symbol);
307 if (nodeLookup.containsKey(alignColumn + 1))
309 nodeIndex = nodeLookup.get(alignColumn + 1);
310 probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
321 * get insert emission probability for a given symbol at a column in the
328 public Double getInsertEmissionProbability(int alignColumn, char symbol)
333 if (!symbolIndexLookup.containsKey(symbol))
337 symbolIndex = symbolIndexLookup.get(symbol);
338 if (nodeLookup.containsKey(alignColumn + 1))
340 nodeIndex = nodeLookup.get(alignColumn + 1);
341 probability = getNode(nodeIndex).getInsertEmissions()
353 * get state transition probability for a given transition type at a column in
360 public Double getStateTransitionProbability(int alignColumn,
366 if (nodeLookup.containsKey(alignColumn + 1))
368 nodeIndex = nodeLookup.get(alignColumn + 1);
369 probability = getNode(nodeIndex).getStateTransitions()
380 public Integer getNodeAlignmentColumn(int nodeIndex)
382 Integer value = nodes.get(nodeIndex).getAlignmentColumn();
386 public char getConsensusResidue(int nodeIndex)
388 char value = nodes.get(nodeIndex).getConsensusResidue();
392 public char getConsensusAtAlignColumn(int columnIndex)
395 Integer index = findNodeIndex(columnIndex + 1);
400 value = getNodes().get(index).getConsensusResidue();
404 public char getReferenceAnnotation(int nodeIndex)
406 char value = nodes.get(nodeIndex).getReferenceAnnotation();
410 public char getMaskedValue(int nodeIndex)
412 char value = nodes.get(nodeIndex).getMaskValue();
416 public char getConsensusStructure(int nodeIndex)
418 char value = nodes.get(nodeIndex).getConsensusStructure();
423 * returns the average match emission for a given symbol
427 * average negative log propbability of a match emission of the given symbol
429 public double getAverageMatchEmission(int symbolIndex)
431 double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
435 public int getNumberOfSymbols()
437 return numberOfSymbols;
440 public void setNumberOfSymbols(int numberOfSymbols)
442 this.numberOfSymbols = numberOfSymbols;
448 * fills symbol array and also finds numberOfSymbols
451 * scanner scanning symbol line in file
453 public void fillSymbols(Scanner parser)
456 while (parser.hasNext())
458 String strSymbol = parser.next();
459 char[] symbol = strSymbol.toCharArray();
460 symbols.add(symbol[0]);
461 symbolIndexLookup.put(symbol[0], i);
464 numberOfSymbols = symbols.size();
473 public void addFileProperty(String key, String value)
475 fileProperties.put(key, value);
478 public boolean referenceAnnotationIsActive()
481 status = fileProperties.get(REFERENCE_ANNOTATION);
498 public boolean maskValueIsActive()
501 status = fileProperties.get(MASKED_VALUE);
518 public boolean consensusResidueIsActive()
521 status = fileProperties.get(CONSENSUS_RESIDUE);
538 public boolean consensusStructureIsActive()
541 status = fileProperties.get(CONSENSUS_STRUCTURE);
558 public boolean mapIsActive()
561 status = fileProperties.get(MAP);
578 public void setAlignmentColumn(int nodeIndex, int column)
580 nodes.get(nodeIndex).setAlignmentColumn(column);
583 public void setReferenceAnnotation(int nodeIndex, char value)
585 nodes.get(nodeIndex).setReferenceAnnotation(value);
588 public void setConsensusResidue(int nodeIndex, char value)
590 nodes.get(nodeIndex).setConsensusResidue(value);
593 public void setConsensusStructure(int nodeIndex, char value)
595 nodes.get(nodeIndex).setConsensusStructure(value);
598 public void setMaskValue(int nodeIndex, char value)
600 nodes.get(nodeIndex).setMaskValue(value);
603 public String getGatheringThreshold()
606 value = fileProperties.get("GA");
610 public String getNoiseCutoff()
613 value = fileProperties.get("NC");
617 public String getTrustedCutoff()
620 value = fileProperties.get("TC");
624 public String getViterbi()
627 value = fileProperties.get(VITERBI);
631 public String getMSV()
634 value = fileProperties.get(MSV);
638 public String getForward()
641 value = fileProperties.get(FORWARD);
645 public void setMAPStatus(boolean status)
649 fileProperties.put(MAP, YES);
653 fileProperties.put(MAP, NO);
657 public void setReferenceAnnotationStatus(boolean status)
661 fileProperties.put(REFERENCE_ANNOTATION, YES);
665 fileProperties.put(REFERENCE_ANNOTATION, NO);
669 public void setMaskedValueStatus(boolean status)
673 fileProperties.put(MASKED_VALUE, YES);
677 fileProperties.put(MASKED_VALUE, NO);
681 public void setConsensusResidueStatus(boolean status)
685 fileProperties.put(CONSENSUS_RESIDUE, YES);
689 fileProperties.put(CONSENSUS_RESIDUE, NO);
693 public void setConsensusStructureStatus(boolean status)
697 fileProperties.put(CONSENSUS_STRUCTURE, YES);
701 fileProperties.put(CONSENSUS_STRUCTURE, NO);
706 * find the index of the node in a hidden Markov model based on the column in
709 * @param alignmentColumn
712 public Integer findNodeIndex(int alignmentColumn)
715 index = nodeLookup.get(alignmentColumn);
719 public static String findStringFromBoolean(boolean value)
734 public AlignmentAnnotation createAnnotation(int length)
736 Annotation[] annotations = new Annotation[length];
738 for (int alignPos = 0; alignPos < length; alignPos++)
740 Float content = getInformationContent(alignPos);
747 cons = getConsensusAtAlignColumn(alignPos);
748 cons = Character.toUpperCase(cons);
750 String description = String.format("%.3f", content);
751 description += " bits";
752 annotations[alignPos] = new Annotation(cons.toString(), description,
757 AlignmentAnnotation annotation = new AlignmentAnnotation(
758 "Information Content",
759 "The information content of each column, measured in bits",
761 0f, max, AlignmentAnnotation.BAR_GRAPH);
765 public float getInformationContent(int column)
767 float informationContent = 0f;
769 for (char symbol : symbols)
772 if (symbols.size() == 20)
774 freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
776 if (symbols.size() == 4)
778 freq = ResidueProperties.nucleotideBackgroundFrequencies
781 Double hmmProb = getMatchEmissionProbability(column, symbol);
782 float prob = hmmProb.floatValue();
783 informationContent += prob * (Math.log(prob / freq) / Math.log(2));
787 return informationContent;