package jalview.datamodel;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Data structure which stores a hidden Markov model. Currently contains file
 * properties as well; it is undecided whether these should be moved to the
 * HMMFile class.
 *
 * @author TZVanaalten
 *
 */
public class HiddenMarkovModel
{
  /**
   * File properties, keyed by the property tag and stored as raw strings.
   * Prefer the typed getters over reading this map directly; for example, use
   * {@link #getLength()} to obtain the model length as a number.
   */
  Map<String, String> fileProperties = new HashMap<>();

  /**
   * All symbols used in this model. The position of a symbol in the list is
   * its lookup value.
   */
  List<Character> symbols = new ArrayList<>();

  /**
   * The nodes of the model. The begin node is at index 0 and holds the average
   * emission probabilities for each symbol.
   */
  List<HMMNode> nodes = new ArrayList<>();

  /**
   * Maps an alignment column (starting at 0) to the index of its HMM node.
   */
  Map<Integer, Integer> nodeLookup = new HashMap<>();

  /**
   * Maps each symbol to its index.
   */
  Map<Character, Integer> symbolIndexLookup = new HashMap<>();

  static final String YES = "yes";

  static final String NO = "no";

  // keys for the file properties map
  private static final String NAME = "NAME";

  private static final String ACCESSION_NUMBER = "ACC";

  private static final String DESCRIPTION = "DESC";

  private static final String LENGTH = "LENG";

  private static final String MAX_LENGTH = "MAXL";

  private static final String ALPHABET = "ALPH";

  private static final String DATE = "DATE";

  private static final String COMMAND_LOG = "COM";

  private static final String NUMBER_OF_SEQUENCES = "NSEQ";

  private static final String EFF_NUMBER_OF_SEQUENCES = "EFFN";

  private static final String CHECK_SUM = "CKSUM";

  private static final String STATISTICS = "STATS";

  private static final String COMPO = "COMPO";

  private static final String GATHERING_THRESHOLD = "GA";

  private static final String TRUSTED_CUTOFF = "TC";

  private static final String NOISE_CUTOFF = "NC";

  private static final String VITERBI = "VITERBI";

  private static final String MSV = "MSV";

  private static final String FORWARD = "FORWARD";

  private static final String MAP = "MAP";

  private static final String REFERENCE_ANNOTATION = "RF";

  private static final String CONSENSUS_RESIDUE = "CONS";

  private static final String CONSENSUS_STRUCTURE = "CS";

  private static final String MASKED_VALUE = "MM";

  // indexes of the state transitions, as accepted by
  // getStateTransitionProbability
  public static final int MATCHTOMATCH = 0;

  public static final int MATCHTOINSERT = 1;

  public static final int MATCHTODELETE = 2;

  public static final int INSERTTOMATCH = 3;

  public static final int INSERTTOINSERT = 4;

  public static final int DELETETOMATCH = 5;

  public static final int DELETETODELETE = 6;

  String fileHeader;

  /**
   * Creates an empty model with no properties, symbols or nodes.
   */
  public HiddenMarkovModel()
  {
  }

  /**
   * Copy constructor. The maps and lists are copied so that adding to or
   * removing from the copy does not affect the original, but the copy is
   * shallow: the node objects themselves are shared between both models.
   *
   * @param hmm
   *          The model to copy.
   */
  public HiddenMarkovModel(HiddenMarkovModel hmm)
  {
    this.fileProperties = new HashMap<>(hmm.fileProperties);
    this.symbols = new ArrayList<>(hmm.symbols);
    this.nodes = new ArrayList<>(hmm.nodes);
    this.nodeLookup = new HashMap<>(hmm.nodeLookup);
    this.symbolIndexLookup = new HashMap<>(hmm.symbolIndexLookup);
    // Strings are immutable, so the reference can be shared; this also copes
    // with a source model whose header was never set (null).
    this.fileHeader = hmm.fileHeader;
  }

  /**
   * Gets the file header of the .hmm file this model came from.
   *
   * @return the header, or null if none has been set
   */
  public String getFileHeader()
  {
    return fileHeader;
  }

  /**
   * Sets the file header of this model.
   *
   * @param header
   *          The new header.
   */
  public void setFileHeader(String header)
  {
    fileHeader = header;
  }

  /**
   * Returns the map containing the matches between alignment column indexes
   * and node indexes. The returned map is the live internal map, not a copy.
   *
   * @return the column to node lookup
   */
  public Map<Integer, Integer> getNodeLookup()
  {
    return nodeLookup;
  }

  /**
   * Returns the list of symbols used in this hidden Markov model. The returned
   * list is the live internal list, not a copy.
   *
   * @return the symbols
   */
  public List<Character> getSymbols()
  {
    return symbols;
  }

  /**
   * Returns the file properties. The returned map is the live internal map,
   * not a copy.
   *
   * @return the file properties, keyed by property tag
   */
  public Map<String, String> getFileProperties()
  {
    return fileProperties;
  }

  /**
   * Gets the node in the hidden Markov model at the specified position.
   *
   * @param nodeIndex
   *          The index of the node requested. Node 0 optionally contains the
   *          average match emission probabilities across the entire model, and
   *          always contains the insert emission probabilities and state
   *          transition probabilities for the begin node. Node 1 contains the
   *          first node in the HMM that can correspond to a column in the
   *          alignment.
   * @return the node at the given index
   */
  public HMMNode getNode(int nodeIndex)
  {
    return getNodes().get(nodeIndex);
  }

  /**
   * Sets the list of symbols used in the hidden Markov model to the list
   * specified.
   *
   * @param symbolsL
   *          The list of symbols to which the current list is to be changed.
   *
   */
  public void setSymbols(List<Character> symbolsL)
  {
    this.symbols = symbolsL;
  }

  /**
   * Returns the name of the sequence alignment on which the HMM is based.
   *
   * @return the name, or null if not set
   */
  public String getName()
  {
    return fileProperties.get(NAME);
  }

  /**
   * Returns the accession number.
   *
   * @return the accession number, or null if not set
   */
  public String getAccessionNumber()
  {
    return fileProperties.get(ACCESSION_NUMBER);
  }

  /**
   * Returns a description of the sequence alignment on which the hidden Markov
   * model is based.
   *
   * @return the description, or null if not set
   */
  public String getDescription()
  {
    return fileProperties.get(DESCRIPTION);
  }

  /**
   * Returns the length of the hidden Markov model.
   *
   * @return the length, or null if not set
   * @throws NumberFormatException
   *           if the stored value is not a valid integer
   */
  public Integer getLength()
  {
    return getIntegerProperty(LENGTH);
  }

  /**
   * Returns the max instance length within the hidden Markov model.
   *
   * @return the max instance length, or null if not set
   * @throws NumberFormatException
   *           if the stored value is not a valid integer
   */
  public Integer getMaxInstanceLength()
  {
    return getIntegerProperty(MAX_LENGTH);
  }

  /**
   * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
   * options. Other alphabets may be added.
   *
   * @return the alphabet type, or null if not set
   */
  public String getAlphabetType()
  {
    return fileProperties.get(ALPHABET);
  }

  /**
   * Returns the date as a String.
   *
   * @return the date, or null if not set
   */
  public String getDate()
  {
    return fileProperties.get(DATE);
  }

  /**
   * Returns the command line log.
   *
   * @return the command line log, or null if not set
   */
  public String getCommandLineLog()
  {
    return fileProperties.get(COMMAND_LOG);
  }

  /**
   * Returns the number of sequences on which the HMM was trained.
   *
   * @return the number of sequences, or null if not set
   * @throws NumberFormatException
   *           if the stored value is not a valid integer
   */
  public Integer getNumberOfSequences()
  {
    return getIntegerProperty(NUMBER_OF_SEQUENCES);
  }

  /**
   * Returns the effective number of sequences on which the HMM was based.
   *
   * @return the effective number of sequences, or null if not set
   * @throws NumberFormatException
   *           if the stored value is not a valid number
   */
  public Double getEffectiveNumberOfSequences()
  {
    // The presence check must look at the same key that is parsed.
    String value = fileProperties.get(EFF_NUMBER_OF_SEQUENCES);
    if (value == null)
    {
      return null;
    }
    return Double.valueOf(value);
  }

  /**
   * Returns the checksum.
   *
   * @return the checksum, or null if not set
   * @throws NumberFormatException
   *           if the stored value is not a valid long
   */
  public Long getCheckSum()
  {
    // The presence check must look at the same key that is parsed.
    String value = fileProperties.get(CHECK_SUM);
    if (value == null)
    {
      return null;
    }
    return Long.valueOf(value);
  }

  /**
   * Returns the list of nodes in this HMM. The returned list is the live
   * internal list, not a copy.
   *
   * @return the nodes
   */
  public List<HMMNode> getNodes()
  {
    return nodes;
  }

  /**
   * Sets the list of nodes in this HMM to the given list.
   *
   * @param nodes
   *          The list of nodes to which the current list of nodes is being
   *          changed.
   */
  public void setNodes(List<HMMNode> nodes)
  {
    this.nodes = nodes;
  }

  /**
   * Gets the match emission probability for a given symbol at a column in the
   * alignment.
   *
   * @param alignColumn
   *          The index of the alignment column, starting at index 0. Index 0
   *          usually corresponds to index 1 in the HMM.
   * @param symbol
   *          The symbol for which the desired probability is being requested.
   * @return the probability, or 0 if the symbol is unknown or the column has
   *         no node
   *
   */
  public Double getMatchEmissionProbability(int alignColumn, char symbol)
  {
    Integer symbolIndex = symbolIndexLookup.get(symbol);
    Integer nodeIndex = nodeLookup.get(alignColumn);
    if (symbolIndex == null || nodeIndex == null)
    {
      return 0d;
    }
    int symbolPos = symbolIndex;
    int nodePos = nodeIndex;
    return getNode(nodePos).getMatchEmissions().get(symbolPos);
  }

  /**
   * Gets the insert emission probability for a given symbol at a column in the
   * alignment.
   *
   * @param alignColumn
   *          The index of the alignment column, starting at index 0. Index 0
   *          usually corresponds to index 1 in the HMM.
   * @param symbol
   *          The symbol for which the desired probability is being requested.
   * @return the probability, or 0 if the symbol is unknown or the column has
   *         no node
   *
   */
  public Double getInsertEmissionProbability(int alignColumn, char symbol)
  {
    Integer symbolIndex = symbolIndexLookup.get(symbol);
    Integer nodeIndex = nodeLookup.get(alignColumn);
    if (symbolIndex == null || nodeIndex == null)
    {
      return 0d;
    }
    int symbolPos = symbolIndex;
    int nodePos = nodeIndex;
    return getNode(nodePos).getInsertEmissions().get(symbolPos);
  }

  /**
   * Gets the probability of a given state transition at a column in the
   * alignment.
   *
   * @param alignColumn
   *          The index of the alignment column, starting at index 0. Index 0
   *          usually corresponds to index 1 in the HMM.
   * @param transition
   *          The index of the transition, e.g. {@link #MATCHTOMATCH}.
   * @return the probability, or 0 if the column has no node
   *
   */
  public Double getStateTransitionProbability(int alignColumn,
          int transition)
  {
    Integer nodeIndex = nodeLookup.get(alignColumn);
    if (nodeIndex == null)
    {
      return 0d;
    }
    int nodePos = nodeIndex;
    return getNode(nodePos).getStateTransitions().get(transition);
  }

  /**
   * Returns the alignment column linked to the node at the given index.
   *
   * @param nodeIndex
   *          The index of the node, starting from index 1. Index 0 is the begin
   *          node, which does not correspond to a column in the alignment.
   * @return the alignment column stored on the node
   */
  public Integer getNodeAlignmentColumn(int nodeIndex)
  {
    return nodes.get(nodeIndex).getAlignmentColumn();
  }

  /**
   * Returns the consensus residue at the specified node.
   *
   * @param nodeIndex
   *          The index of the specified node.
   * @return the consensus residue stored on the node
   */
  public char getConsensusResidue(int nodeIndex)
  {
    return nodes.get(nodeIndex).getConsensusResidue();
  }

  /**
   * Returns the consensus at a given alignment column. When the consensus
   * residue annotation is active the residue stored on the matching node is
   * returned. Otherwise the symbol with the highest match emission probability
   * is returned, in lower case if that probability is less than 0.5.
   *
   * @param columnIndex
   *          The index of the column in the alignment for which the consensus
   *          is desired. The list of columns starts at index 0.
   * @return the consensus character, or '-' if none could be determined
   */
  public char getConsensusAtAlignColumn(int columnIndex)
  {
    if (consensusResidueIsActive())
    {
      Integer index = findNodeIndex(columnIndex);
      if (index == null)
      {
        return '-';
      }
      return getNodes().get(index).getConsensusResidue();
    }

    char mostLikely = '-';
    double highestProb = 0;
    for (char character : symbols)
    {
      double prob = getMatchEmissionProbability(columnIndex, character);
      if (prob > highestProb)
      {
        highestProb = prob;
        mostLikely = character;
      }
    }
    if (highestProb < 0.5)
    {
      mostLikely = Character.toLowerCase(mostLikely);
    }
    return mostLikely;
  }

  /**
   * Returns the reference annotation at the specified node.
   *
   * @param nodeIndex
   *          The index of the specified node.
   * @return the reference annotation stored on the node
   */
  public char getReferenceAnnotation(int nodeIndex)
  {
    return nodes.get(nodeIndex).getReferenceAnnotation();
  }

  /**
   * Returns the mask value at the specified node.
   *
   * @param nodeIndex
   *          The index of the specified node.
   * @return the mask value stored on the node
   */
  public char getMaskedValue(int nodeIndex)
  {
    return nodes.get(nodeIndex).getMaskValue();
  }

  /**
   * Returns the consensus structure at the specified node.
   *
   * @param nodeIndex
   *          The index of the specified node.
   * @return the consensus structure stored on the node
   */
  public char getConsensusStructure(int nodeIndex)
  {
    return nodes.get(nodeIndex).getConsensusStructure();
  }

  /**
   * Returns the average match emission probability for a given symbol, read
   * from node 0.
   *
   * @param symbolIndex
   *          The index of the symbol.
   * @return the average match emission probability
   *
   */
  public double getAverageMatchEmission(int symbolIndex)
  {
    return nodes.get(0).getMatchEmissions().get(symbolIndex);
  }

  /**
   * Returns the number of symbols in the alphabet used in this HMM.
   *
   * @return the number of symbols
   */
  public int getNumberOfSymbols()
  {
    return symbols.size();
  }

  /**
   * Adds a file property, replacing any existing value for the same key.
   *
   * @param key
   *          The property tag.
   * @param value
   *          The property value.
   */
  public void addFileProperty(String key, String value)
  {
    fileProperties.put(key, value);
  }

  /**
   * Returns a boolean indicating whether the reference annotation is active.
   *
   * @return true only if the RF property is "yes"
   */
  public boolean referenceAnnotationIsActive()
  {
    return isAnnotationActive(REFERENCE_ANNOTATION);
  }

  /**
   * Returns a boolean indicating whether the mask value annotation is active.
   *
   * @return true only if the MM property is "yes"
   */
  public boolean maskValueIsActive()
  {
    return isAnnotationActive(MASKED_VALUE);
  }

  /**
   * Returns a boolean indicating whether the consensus residue annotation is
   * active.
   *
   * @return true only if the CONS property is "yes"
   */
  public boolean consensusResidueIsActive()
  {
    return isAnnotationActive(CONSENSUS_RESIDUE);
  }

  /**
   * Returns a boolean indicating whether the consensus structure annotation is
   * active.
   *
   * @return true only if the CS property is "yes"
   */
  public boolean consensusStructureIsActive()
  {
    return isAnnotationActive(CONSENSUS_STRUCTURE);
  }

  /**
   * Returns a boolean indicating whether the MAP annotation is active.
   *
   * @return true only if the MAP property is "yes"
   */
  public boolean mapIsActive()
  {
    return isAnnotationActive(MAP);
  }

  /**
   * Sets the alignment column of the specified node.
   *
   * @param nodeIndex
   *          The index of the node to update.
   * @param column
   *          The alignment column to store on the node.
   */
  public void setAlignmentColumn(int nodeIndex, int column)
  {
    nodes.get(nodeIndex).setAlignmentColumn(column);
  }

  /**
   * Sets the reference annotation at a given node.
   *
   * @param nodeIndex
   *          The index of the node to update.
   * @param value
   *          The reference annotation to store on the node.
   */
  public void setReferenceAnnotation(int nodeIndex, char value)
  {
    nodes.get(nodeIndex).setReferenceAnnotation(value);
  }

  /**
   * Sets the consensus residue at a given node.
   *
   * @param nodeIndex
   *          The index of the node to update.
   * @param value
   *          The consensus residue to store on the node.
   */
  public void setConsensusResidue(int nodeIndex, char value)
  {
    nodes.get(nodeIndex).setConsensusResidue(value);
  }

  /**
   * Sets the consensus structure at a given node.
   *
   * @param nodeIndex
   *          The index of the node to update.
   * @param value
   *          The consensus structure to store on the node.
   */
  public void setConsensusStructure(int nodeIndex, char value)
  {
    nodes.get(nodeIndex).setConsensusStructure(value);
  }

  /**
   * Sets the mask value at a given node.
   *
   * @param nodeIndex
   *          The index of the node to update.
   * @param value
   *          The mask value to store on the node.
   */
  public void setMaskValue(int nodeIndex, char value)
  {
    nodes.get(nodeIndex).setMaskValue(value);
  }

  /**
   * Temporary implementation, should not be used.
   *
   * @return the raw GA property, or null if not set
   */
  public String getGatheringThreshold()
  {
    return fileProperties.get(GATHERING_THRESHOLD);
  }

  /**
   * Temporary implementation, should not be used.
   *
   * @return the raw NC property, or null if not set
   */
  public String getNoiseCutoff()
  {
    return fileProperties.get(NOISE_CUTOFF);
  }

  /**
   * Temporary implementation, should not be used.
   *
   * @return the raw TC property, or null if not set
   */
  public String getTrustedCutoff()
  {
    return fileProperties.get(TRUSTED_CUTOFF);
  }

  /**
   * Temporary implementation, should not be used.
   *
   * @return the raw VITERBI property, or null if not set
   */
  public String getViterbi()
  {
    return fileProperties.get(VITERBI);
  }

  /**
   * Temporary implementation, should not be used.
   *
   * @return the raw MSV property, or null if not set
   */
  public String getMSV()
  {
    return fileProperties.get(MSV);
  }

  /**
   * Temporary implementation, should not be used.
   *
   * @return the raw FORWARD property, or null if not set
   */
  public String getForward()
  {
    return fileProperties.get(FORWARD);
  }

  /**
   * Sets the activation status of the MAP annotation.
   *
   * @param status
   *          true to store "yes", false to store "no".
   */
  public void setMAPStatus(boolean status)
  {
    fileProperties.put(MAP, findStringFromBoolean(status));
  }

  /**
   * Sets the activation status of the reference annotation.
   *
   * @param status
   *          true to store "yes", false to store "no".
   */
  public void setReferenceAnnotationStatus(boolean status)
  {
    fileProperties.put(REFERENCE_ANNOTATION, findStringFromBoolean(status));
  }

  /**
   * Sets the activation status of the mask value annotation.
   *
   * @param status
   *          true to store "yes", false to store "no".
   */
  public void setMaskedValueStatus(boolean status)
  {
    fileProperties.put(MASKED_VALUE, findStringFromBoolean(status));
  }

  /**
   * Sets the activation status of the consensus residue annotation.
   *
   * @param status
   *          true to store "yes", false to store "no".
   */
  public void setConsensusResidueStatus(boolean status)
  {
    fileProperties.put(CONSENSUS_RESIDUE, findStringFromBoolean(status));
  }

  /**
   * Sets the activation status of the consensus structure annotation.
   *
   * @param status
   *          true to store "yes", false to store "no".
   */
  public void setConsensusStructureStatus(boolean status)
  {
    fileProperties.put(CONSENSUS_STRUCTURE, findStringFromBoolean(status));
  }

  /**
   * Finds the index of the node in a hidden Markov model based on the column in
   * the alignment.
   *
   * @param alignmentColumn
   *          The index of the column in the alignment, with the indexes
   *          starting from 0.
   * @return the node index, or null if the column has no node
   */
  public Integer findNodeIndex(int alignmentColumn)
  {
    return nodeLookup.get(alignmentColumn);
  }

  /**
   * Finds the String value of a boolean: "yes" for true and "no" for false.
   *
   * @param value
   *          The boolean to convert.
   * @return "yes" or "no"
   */
  public static String findStringFromBoolean(boolean value)
  {
    return value ? YES : NO;
  }

  /**
   * Returns the consensus sequence based on the consensus character at each
   * alignment column, from column 0 up to and including the column of the last
   * node. Columns with no consensus are filled with gap characters, and all
   * other characters are converted to upper case.
   *
   * @return the consensus sequence, named after this model with the suffix
   *         "_HMM"
   * @throws NullPointerException
   *           if the model length property has not been set
   */
  public Sequence getConsensusSequence()
  {
    int start = getNodeAlignmentColumn(1);
    // unboxing: fails here if the LENG property is missing
    int modelLength = getLength();
    int end = getNodeAlignmentColumn(modelLength);

    char[] sequence = new char[end + 1];
    for (int index = 0; index < sequence.length; index++)
    {
      char character = getConsensusAtAlignColumn(index);
      sequence[index] = (character == '-') ? '-'
              : Character.toUpperCase(character);
    }

    return new Sequence(getName() + "_HMM", sequence, start, end);
  }

  /**
   * Initiates a HMM consensus sequence: builds the consensus sequence, flags
   * it as an HMM consensus sequence and attaches this model to it.
   *
   * @return A new HMM consensus sequence
   */
  public SequenceI initHMMSequence()
  {
    Sequence consensus = getConsensusSequence();
    consensus.setIsHMMConsensusSequence(true);
    consensus.setHMM(this);
    return consensus;
  }

  /**
   * Returns the index registered for the given symbol.
   *
   * @param c
   *          The symbol to look up.
   * @return the index of the symbol
   * @throws NullPointerException
   *           if no index has been registered for the symbol
   */
  public int getSymbolIndex(char c)
  {
    return symbolIndexLookup.get(c);
  }

  /**
   * Registers the index of a symbol, replacing any existing index for it.
   *
   * @param c
   *          The symbol.
   * @param i
   *          The index of the symbol.
   */
  public void setSymbolIndex(Character c, Integer i)
  {
    symbolIndexLookup.put(c, i);
  }

  /**
   * Reads a file property as an integer.
   *
   * @param key
   *          The property tag.
   * @return the parsed value, or null if the property is not set
   * @throws NumberFormatException
   *           if the stored value is not a valid integer
   */
  private Integer getIntegerProperty(String key)
  {
    String value = fileProperties.get(key);
    if (value == null)
    {
      return null;
    }
    return Integer.valueOf(value);
  }

  /**
   * Tests whether the annotation flag stored under the given key is switched
   * on. A missing property, "no" and any unrecognised value all count as off.
   *
   * @param key
   *          The property tag of the annotation flag.
   * @return true only if the property is "yes"
   */
  private boolean isAnnotationActive(String key)
  {
    return YES.equals(fileProperties.get(key));
  }
}