X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FHiddenMarkovModel.java;h=e74d8261693634fadb0a5d2627d16343ccfb8fd2;hb=d6cace53173ae859bfd93f5e8a13be427864afd1;hp=5331f3d7c5da0336db9c7258732f9331639d907b;hpb=293afc4e4e080e8d48cfa99760667b2b1dfe1da0;p=jalview.git diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index 5331f3d..e74d826 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -1,5 +1,7 @@ package jalview.datamodel; +import jalview.gui.AlignFrame; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -7,39 +9,44 @@ import java.util.Map; import java.util.Scanner; /** - * Data structure which stores a hidden Markov model. Currently contains file properties as well, not sure whether these should be transferred to the HMMFile class + * Data structure which stores a hidden Markov model. Currently contains file + * properties as well, not sure whether these should be transferred to the + * HMMFile class * * @author TZVanaalten * */ public class HiddenMarkovModel { + + // Stores file properties. Do not directly access this field as it contains // only string value - use the getter methods. For example, to find the length // of theHMM, use getModelLength()to return an int value Map fileProperties = new HashMap<>(); - //contains all of the symbols used in this model. The index of each symbol represents its lookup value + // contains all of the symbols used in this model. The index of each symbol + // represents its lookup value List symbols = new ArrayList<>(); // contains information for each node in the model. The begin node is at index // 0. Node 0 contains average emission probabilities for each symbol List nodes = new ArrayList<>(); - // contains the HMM node for each alignment column + // contains the HMM node for each alignment column, alignment columns start at + // index 0; Map nodeLookup = new HashMap<>(); - //contains the symbol index for each symbol + // contains the symbol index for each symbol Map symbolIndexLookup = new HashMap<>(); - final static String YES = "yes"; final static String NO = "no"; int numberOfSymbols; - //keys for file properties hashmap + // keys for file properties hashmap private final String NAME = "NAME"; private final String ACCESSION_NUMBER = "ACC"; @@ -94,73 +101,157 @@ public class HiddenMarkovModel private final String MASKED_VALUE = "MM"; - final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i", - "m->d", "i->m", "i->i", "d->m", "d->d" }; + public static final int MATCHTOMATCH = 0; + + public static final int MATCHTOINSERT = 1; + + public static final int MATCHTODELETE = 2; + + public static final int INSERTTOMATCH = 3; + + public static final int INSERTTOINSERT = 4; + + public static final int DELETETOMATCH = 5; + + public static final int DELETETODELETE = 6; + + String fileHeader; - public String getTransitionType(int index) + public HiddenMarkovModel() { - return TRANSITION_TYPES[index]; + } - public Map getNodeLookup() + public HiddenMarkovModel(HiddenMarkovModel hmm) { - return nodeLookup; + super(); + this.fileProperties = new HashMap<>(hmm.fileProperties); + this.symbols = new ArrayList<>(hmm.symbols); + this.nodes = new ArrayList<>(hmm.nodes); + this.nodeLookup = new HashMap<>(hmm.nodeLookup); + this.symbolIndexLookup = new HashMap<>( + hmm.symbolIndexLookup); + this.numberOfSymbols = hmm.numberOfSymbols; + this.fileHeader = new String(hmm.fileHeader); } - public void setNodeLookup(Map nodeLookup) + /** + * Gets the file header of the .hmm file this model came from. + * + * @return + */ + public String getFileHeader() { - this.nodeLookup = nodeLookup; + return fileHeader; } - public String[] getTransitionTypes() + /** + * Sets the file header of this model. + * + * @param header + */ + public void setFileHeader(String header) { - return TRANSITION_TYPES; + fileHeader = header; } + /** + * Returns the map containing the matches between nodes and alignment column + * indexes. + * + * @return + * + */ + public Map getNodeLookup() + { + return nodeLookup; + } + + /** + * Returns the list of symbols used in this hidden Markov model. + * + * @return + */ public List getSymbols() { return symbols; } - + + /** + * Returns the file properties. + * + * @return + */ public Map getFileProperties() { return fileProperties; } + /** + * Gets the node in the hidden Markov model at the specified position. + * + * @param nodeIndex + * The index of the node requested. Node 0 optionally contains the + * average match emission probabilities across the entire model, and + * always contains the insert emission probabilities and state + * transition probabilities for the begin node. Node 1 contains the + * first node in the HMM that can correspond to a column in the + * alignment. + * @return + */ public HMMNode getNode(int nodeIndex) { return getNodes().get(nodeIndex); } + /** + * Sets the list of symbols used in the hidden Markov model to the list + * specified. + * + * @param symbolsL + * The list of symbols to which the current list is to be changed. + * + */ public void setSymbols(List symbolsL) { this.symbols = symbolsL; } + /** + * Returns the name of the sequence alignment on which the HMM is based. + * + * @return + */ public String getName() { return fileProperties.get(NAME); } + + /** + * Returns the accession number. + * @return + */ public String getAccessionNumber() { return fileProperties.get(ACCESSION_NUMBER); } - public void setAccessionNumber(String value) - { - fileProperties.put(ACCESSION_NUMBER, value); - } - + /** + * Returns a description of the sequence alignment on which the hidden Markov + * model is based. + * + * @return + */ public String getDescription() { return fileProperties.get(DESCRIPTION); } - public void setDescription(String value) - { - fileProperties.put(DESCRIPTION, value); - } - + /** + * Returns the length of the hidden Markov model. + * + * @return + */ public Integer getLength() { if (fileProperties.get(LENGTH) == null) @@ -170,11 +261,11 @@ public class HiddenMarkovModel return Integer.parseInt(fileProperties.get(LENGTH)); } - public void setLength(int value) - { - fileProperties.put(LENGTH, String.valueOf(value)); - } - + /** + * Returns the max instance length within the hidden Markov model. + * + * @return + */ public Integer getMaxInstanceLength() { if (fileProperties.get(MAX_LENGTH) == null) @@ -184,45 +275,42 @@ public class HiddenMarkovModel return Integer.parseInt(fileProperties.get(MAX_LENGTH)); } - public void setMaxInstanceLength(int value) - { - fileProperties.put(MAX_LENGTH, String.valueOf(value)); - } - - // gets type of symbol alphabet - "amino", "DNA", "RNA" + /** + * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the + * options. Other alphabets may be added. + * + * @return + */ public String getAlphabetType() { return fileProperties.get(ALPHABET); } - public void setAlphabetType(String value) - { - fileProperties.put(ALPHABET, value); - } - - // not sure whether to implement this with Date object + /** + * Returns the date as a String. + * + * @return + */ public String getDate() { return fileProperties.get(DATE); } - public void setDate(String value) - { - fileProperties.put(DATE, value); - } - - // not sure whether to implement this + /** + * Returns the command line log. + * + * @return + */ public String getCommandLineLog() { return fileProperties.get(COMMAND_LOG); } - public void setCommandLineLog(String value) - { - fileProperties.put(COMMAND_LOG, value); - } - - // gets the number of sequences that the HMM was trained on + /** + * Returns the number of sequences on which the HMM was trained. + * + * @return + */ public Integer getNumberOfSequences() { if (fileProperties.get(NUMBER_OF_SEQUENCES) == null) @@ -232,12 +320,11 @@ public class HiddenMarkovModel return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES)); } - public void setNumberOfSequences(int value) - { - fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value)); - } - - // gets the effective number determined during sequence weighting + /** + * Returns the effective number of sequences on which the HMM was based. + * + * @param value + */ public Double getEffectiveNumberOfSequences() { if (fileProperties.get(LENGTH) == null) @@ -247,11 +334,11 @@ public class HiddenMarkovModel return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES)); } - public void setEffectiveNumberOfSequences(double value) - { - fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value)); - } - + /** + * Returns the checksum. + * + * @return + */ public Long getCheckSum() { if (fileProperties.get(LENGTH) == null) @@ -261,27 +348,37 @@ public class HiddenMarkovModel return Long.parseLong(fileProperties.get(CHECK_SUM)); } - public void setCheckSum(long value) - { - fileProperties.put(CHECK_SUM, String.valueOf(value)); - } - + /** + * Returns the list of nodes in this HMM. + * + * @return + */ public List getNodes() { return nodes; } + /** + * Sets the list of nodes in this HMM to the given list. + * + * @param nodes + * The list of nodes to which the current list of nodes is being + * changed. + */ public void setNodes(List nodes) { this.nodes = nodes; } /** - * get match emission probability for a given symbol at a column in the - * alignment + * Gets the match emission probability for a given symbol at a column in the + * alignment. * * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. * @param symbol + * The symbol for which the desired probability is being requested. * @return * */ @@ -295,11 +392,10 @@ public class HiddenMarkovModel return 0d; } symbolIndex = symbolIndexLookup.get(symbol); - if (nodeLookup.containsKey(alignColumn + 1)) + if (nodeLookup.containsKey(alignColumn)) { - nodeIndex = nodeLookup.get(alignColumn + 1); + nodeIndex = nodeLookup.get(alignColumn); probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex); - probability = Math.pow(Math.E, -probability); return probability; } else @@ -310,12 +406,16 @@ public class HiddenMarkovModel } /** - * get insert emission probability for a given symbol at a column in the - * alignment + * Gets the insert emission probability for a given symbol at a column in the + * alignment. * * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. * @param symbol + * The symbol for which the desired probability is being requested. * @return + * */ public Double getInsertEmissionProbability(int alignColumn, char symbol) { @@ -327,12 +427,11 @@ public class HiddenMarkovModel return 0d; } symbolIndex = symbolIndexLookup.get(symbol); - if (nodeLookup.containsKey(alignColumn + 1)) + if (nodeLookup.containsKey(alignColumn)) { - nodeIndex = nodeLookup.get(alignColumn + 1); + nodeIndex = nodeLookup.get(alignColumn); probability = getNode(nodeIndex).getInsertEmissions() .get(symbolIndex); - probability = Math.pow(Math.E, -probability); return probability; } else @@ -343,26 +442,28 @@ public class HiddenMarkovModel } /** - * get state transition probability for a given transition type at a column in - * the alignment + * Gets the state transition probability for a given symbol at a column in the + * alignment. * * @param alignColumn - * @param transition + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. + * @param symbol + * The symbol for which the desired probability is being requested. * @return + * */ public Double getStateTransitionProbability(int alignColumn, - String transition) + int transition) { int transitionIndex; int nodeIndex; Double probability; - transitionIndex = getTransitionType(transition); - if (nodeLookup.containsKey(alignColumn + 1)) + if (nodeLookup.containsKey(alignColumn)) { - nodeIndex = nodeLookup.get(alignColumn + 1); + nodeIndex = nodeLookup.get(alignColumn); probability = getNode(nodeIndex).getStateTransitions() - .get(transitionIndex); - probability = Math.pow(Math.E, -probability); + .get(transition); return probability; } else @@ -372,30 +473,105 @@ public class HiddenMarkovModel } + /** + * Returns the alignment column linked to the node at the given index. + * + * @param nodeIndex + * The index of the node, starting from index 1. Index 0 is the begin + * node, which does not correspond to a column in the alignment. + * @return + */ public Integer getNodeAlignmentColumn(int nodeIndex) { Integer value = nodes.get(nodeIndex).getAlignmentColumn(); - return value; + return value; } + /** + * Returns the consensus residue at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getConsensusResidue(int nodeIndex) { char value = nodes.get(nodeIndex).getConsensusResidue(); return value; } + /** + * Returns the consensus at a given alignment column. + * + * @param columnIndex + * The index of the column in the alignment for which the consensus + * is desired. The list of columns starts at index 0. + * @return + */ + public char getConsensusAtAlignColumn(int columnIndex) + { + char mostLikely = '-'; + if (consensusResidueIsActive()) + { + + Integer index = findNodeIndex(columnIndex); + if (index == null) + { + return '-'; + } + mostLikely = getNodes().get(index).getConsensusResidue(); + return mostLikely; + } + else + { + double highestProb = 0; + for (char character : symbols) + { + Double prob = getMatchEmissionProbability(columnIndex, character); + if (prob > highestProb) + { + highestProb = prob; + mostLikely = character; + } + } + return mostLikely; + } + + } + + /** + * Returns the reference annotation at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getReferenceAnnotation(int nodeIndex) { char value = nodes.get(nodeIndex).getReferenceAnnotation(); return value; } + /** + * Returns the mask value at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getMaskedValue(int nodeIndex) { char value = nodes.get(nodeIndex).getMaskValue(); return value; } + /** + * Returns the consensus structure at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getConsensusStructure(int nodeIndex) { char value = nodes.get(nodeIndex).getConsensusStructure(); @@ -403,11 +579,12 @@ public class HiddenMarkovModel } /** - * returns the average match emission for a given symbol + * Returns the average match emission probability for a given symbol + * * @param symbolIndex - * index of symbol + * The index of the symbol. * @return - * average negative log propbability of a match emission of the given symbol + * */ public double getAverageMatchEmission(int symbolIndex) { @@ -415,23 +592,22 @@ public class HiddenMarkovModel return value; } + /** + * Returns the number of symbols in the alphabet used in this HMM. + * + * @return + */ public int getNumberOfSymbols() { return numberOfSymbols; } - public void setNumberOfSymbols(int numberOfSymbols) - { - this.numberOfSymbols = numberOfSymbols; - } - - - /** - * fills symbol array and also finds numberOfSymbols + * Fills symbol array and whilst doing so, updates the value of the number of + * symbols. * * @param parser - * scanner scanning symbol line in file + * The scanner scanning the symbol line in the file. */ public void fillSymbols(Scanner parser) { @@ -448,7 +624,7 @@ public class HiddenMarkovModel } /** - * adds file property + * Adds a file property. * * @param key * @param value @@ -458,6 +634,11 @@ public class HiddenMarkovModel fileProperties.put(key, value); } + /** + * Returns a boolean indicating whether the reference annotation is active. + * + * @return + */ public boolean referenceAnnotationIsActive() { String status; @@ -478,6 +659,11 @@ public class HiddenMarkovModel } + /** + * Returns a boolean indicating whether the mask value annotation is active. + * + * @return + */ public boolean maskValueIsActive() { String status; @@ -498,6 +684,12 @@ public class HiddenMarkovModel } + /** + * Returns a boolean indicating whether the consensus residue annotation is + * active. + * + * @return + */ public boolean consensusResidueIsActive() { String status; @@ -518,6 +710,12 @@ public class HiddenMarkovModel } + /** + * Returns a boolean indicating whether the consensus structure annotation is + * active. + * + * @return + */ public boolean consensusStructureIsActive() { String status; @@ -538,6 +736,11 @@ public class HiddenMarkovModel } + /** + * Returns a boolean indicating whether the MAP annotation is active. + * + * @return + */ public boolean mapIsActive() { String status; @@ -558,31 +761,68 @@ public class HiddenMarkovModel } + /** + * Sets the alignment column of the specified node. + * + * @param nodeIndex + * + * @param column + * + */ public void setAlignmentColumn(int nodeIndex, int column) { nodes.get(nodeIndex).setAlignmentColumn(column); } + /** + * Sets the reference annotation at a given node. + * + * @param nodeIndex + * @param value + */ public void setReferenceAnnotation(int nodeIndex, char value) { nodes.get(nodeIndex).setReferenceAnnotation(value); } + /** + * Sets the consensus residue at a given node. + * + * @param nodeIndex + * @param value + */ public void setConsensusResidue(int nodeIndex, char value) { nodes.get(nodeIndex).setConsensusResidue(value); } + /** + * Sets the consensus structure at a given node. + * + * @param nodeIndex + * @param value + */ public void setConsensusStructure(int nodeIndex, char value) { nodes.get(nodeIndex).setConsensusStructure(value); } + /** + * Sets the mask value at a given node. + * + * @param nodeIndex + * @param value + */ public void setMaskValue(int nodeIndex, char value) { nodes.get(nodeIndex).setMaskValue(value); } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getGatheringThreshold() { String value; @@ -590,6 +830,11 @@ public class HiddenMarkovModel return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getNoiseCutoff() { String value; @@ -597,6 +842,11 @@ public class HiddenMarkovModel return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getTrustedCutoff() { String value; @@ -604,6 +854,11 @@ public class HiddenMarkovModel return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getViterbi() { String value; @@ -611,6 +866,11 @@ public class HiddenMarkovModel return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getMSV() { String value; @@ -618,6 +878,11 @@ public class HiddenMarkovModel return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getForward() { String value; @@ -625,109 +890,63 @@ public class HiddenMarkovModel return value; } + /** + * Sets the activation status of the MAP annotation. + * + * @param status + */ public void setMAPStatus(boolean status) { - if (status == true) - { - fileProperties.put(MAP, YES); - } - else - { - fileProperties.put(MAP, NO); - } + fileProperties.put(MAP, status ? YES : NO); } + /** + * Sets the activation status of the reference annotation. + * + * @param status + */ public void setReferenceAnnotationStatus(boolean status) { - if (status == true) - { - fileProperties.put(REFERENCE_ANNOTATION, YES); - } - else - { - fileProperties.put(REFERENCE_ANNOTATION, NO); - } + fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO); } + /** + * Sets the activation status of the mask value annotation. + * + * @param status + */ public void setMaskedValueStatus(boolean status) { - if (status == true) - { - fileProperties.put(MASKED_VALUE, YES); - } - else - { - fileProperties.put(MASKED_VALUE, NO); - } + fileProperties.put(MASKED_VALUE, status ? YES : NO); } + /** + * Sets the activation status of the consensus residue annotation. + * + * @param status + */ public void setConsensusResidueStatus(boolean status) { - if (status == true) - { - fileProperties.put(CONSENSUS_RESIDUE, YES); - } - else - { - fileProperties.put(CONSENSUS_RESIDUE, NO); - } - } - - public void setConsensusStructureStatus(boolean status) - { - if (status == true) - { - fileProperties.put(CONSENSUS_STRUCTURE, YES); - } - else - { - fileProperties.put(CONSENSUS_STRUCTURE, NO); - } + fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO); } /** + * Sets the activation status of the consensus structure annotation. * - * @param transition - * type of transition occuring - * @return index value representing position along stateTransition array. + * @param status */ - public Integer getTransitionType(String transition) + public void setConsensusStructureStatus(boolean status) { - Integer index; - switch (transition) - { - case "mm": - index = 0; - break; - case "mi": - index = 1; - break; - case "md": - index = 2; - break; - case "im": - index = 3; - break; - case "ii": - index = 4; - break; - case "dm": - index = 5; - break; - case "dd": - index = 6; - break; - default: - index = null; - } - return index; + fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO); } /** - * find the index of the node in a hidden Markov model based on the column in + * Finds the index of the node in a hidden Markov model based on the column in * the alignment * * @param alignmentColumn + * The index of the column in the alignment, with the indexes + * starting from 0. */ public Integer findNodeIndex(int alignmentColumn) @@ -737,6 +956,12 @@ public class HiddenMarkovModel return index; } + /** + * Finds the String values of a boolean. "yes" for true and "no" for false. + * + * @param value + * @return + */ public static String findStringFromBoolean(boolean value) { if (value) @@ -748,5 +973,164 @@ public class HiddenMarkovModel return NO; } } + + + + /** + * Returns the consensus sequence based on the most probable symbol at each + * position. The sequence is adjusted to match the length of the existing + * sequence alignment. Gap characters are used as padding. + * + * @param length + * The length of the longest sequence in the existing alignment. + * @return + */ + public Sequence getConsensusSequence() + { + int start; + int end; + int modelLength; + start = getNodeAlignmentColumn(1); + modelLength = getLength(); + end = getNodeAlignmentColumn(modelLength); + char[] sequence = new char[end]; + for (int index = 0; index < end; index++) + { + Character character; + + character = getConsensusAtAlignColumn(index); + + if (character == null || character == '-') + { + sequence[index] = '-'; + } + else + { + sequence[index] = Character.toUpperCase(character); + } + } + + + Sequence seq = new Sequence(getName() + "_HMM", sequence, start, end); + return seq; + } + + + /** + * Maps the nodes of the hidden Markov model to the reference annotation and + * then deletes this annotation. + */ + public void mapToReferenceAnnotation(AlignFrame af, SequenceI seq) + { + AlignmentAnnotation annotArray[] = af.getViewport().getAlignment() + .getAlignmentAnnotation(); + + AlignmentAnnotation reference = null; + for (AlignmentAnnotation annot : annotArray) + { + if (annot.label.contains("Reference")) + { + reference = annot; + } + } + + if (reference == null) + { + return; + } + + mapToReferenceAnnotation(reference, seq); + af.getViewport().getAlignment().deleteAnnotation(reference); + } + + public void mapToReferenceAnnotation(AlignmentAnnotation reference, + SequenceI seq) + { + HiddenMarkovModel hmm = seq.getHMM(); + Annotation[] annots = reference.annotations; + { + int nodeIndex = 0; + for (int col = 0; col < annots.length; col++) + { + String character = annots[col].displayCharacter; + if ("x".equals(character) || "X".equals(character)) + { + nodeIndex++; + if (nodeIndex < hmm.getNodes().size()) + { + HMMNode node = hmm.getNode(nodeIndex); + int alignPos = getNodeAlignmentColumn(nodeIndex); + char seqCharacter = seq.getCharAt(alignPos); + if (alignPos >= seq.getLength() || col >= seq.getLength()) + { + seq.insertCharAt(seq.getLength(), + (alignPos + 1) - seq.getLength(), + '-'); + } + seq.getSequence()[alignPos] = '-'; + seq.getSequence()[col] = seqCharacter; + node.setAlignmentColumn(col); + hmm.nodeLookup.put(col, nodeIndex); + } + else + { + System.out.println( + "The reference annotation contains more consensus columns than the hidden Markov model"); + break; + } + } + else + { + hmm.nodeLookup.remove(col); + } + } + + } + + } + + public void mapToReferenceAnnotation(AlignmentAnnotation reference) + { + Annotation[] annots = reference.annotations; + { + int nodeIndex = 0; + for (int col = 0; col < annots.length; col++) + { + String character = annots[col].displayCharacter; + if ("x".equals(character) || "X".equals(character)) + { + nodeIndex++; + if (nodeIndex < nodes.size()) + { + HMMNode node = nodes.get(nodeIndex); + node.setAlignmentColumn(col + 1); + nodeLookup.put(col, nodeIndex); + } + else + { + System.out.println( + "The reference annotation contains more consensus columns than the hidden Markov model"); + break; + } + } + else + { + nodeLookup.remove(col); + } + } + + } + + } + + public SequenceI initHMMSequence() + { + Sequence consensus = getConsensusSequence(); + consensus.setIsHMMConsensusSequence(true); + consensus.setHMM(this); + return consensus; + } + + }