X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FHiddenMarkovModel.java;h=6b8b09562dcdc9011c9d910201630d068ed37c78;hb=refs%2Fheads%2Fmerge%2FJAL-3285_mchmmer_with_211_develop;hp=b422ef1b4325d4d55eeee1cfe08db4bdb1c43b98;hpb=76a84b9909fa7d0f6b74c6fb6d30cac296d59586;p=jalview.git diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index b422ef1..6b8b095 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -1,143 +1,672 @@ package jalview.datamodel; +import jalview.io.HMMFile; +import jalview.schemes.ResidueProperties; +import jalview.util.Comparison; +import jalview.util.MapList; + +import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; /** - * Data structure to hold a HMM file - */ -/** + * Data structure which stores a hidden Markov model + * * @author TZVanaalten * */ public class HiddenMarkovModel { + private static final char GAP_DASH = '-'; + + public final static String YES = "yes"; + + public final static String NO = "no"; + + public static final int MATCHTOMATCH = 0; + + public static final int MATCHTOINSERT = 1; + + public static final int MATCHTODELETE = 2; + + public static final int INSERTTOMATCH = 3; + + public static final int INSERTTOINSERT = 4; + + public static final int DELETETOMATCH = 5; + + public static final int DELETETODELETE = 6; + + private static final double LOG2 = Math.log(2); - // Stores file properties + /* + * properties read from HMM file header lines + */ private Map fileProperties = new HashMap<>(); - public String getAccessionNumber() + private String fileHeader; + + /* + * the symbols used in this model e.g. "ACGT" + */ + private String alphabet; + + /* + * symbol lookup index into the alphabet for 'A' to 'Z' + */ + private int[] symbolIndexLookup = new int['Z' - 'A' + 1]; + + /* + * Nodes in the model. The begin node is at index 0, and contains + * average emission probabilities for each symbol. + */ + private List nodes = new ArrayList<>(); + + /* + * the aligned HMM consensus sequence extracted from the HMM profile + */ + private SequenceI hmmSeq; + + /* + * mapping from HMM nodes to residues of the hmm consensus sequence + */ + private Mapping mapToHmmConsensus; + + // stores background frequencies of alignment from which this model came + private Map backgroundFrequencies; + + /** + * Constructor + */ + public HiddenMarkovModel() { - return fileProperties.get("ACC"); } - public String getDescription() + /** + * Copy constructor given a new aligned sequence with which to associate the + * HMM profile + * + * @param hmm + * @param sq + */ + public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq) { - return fileProperties.get("DESC"); + super(); + this.fileProperties = new HashMap<>(hmm.fileProperties); + this.alphabet = hmm.alphabet; + this.nodes = new ArrayList<>(hmm.nodes); + this.symbolIndexLookup = hmm.symbolIndexLookup; + this.fileHeader = new String(hmm.fileHeader); + this.hmmSeq = sq; + this.backgroundFrequencies = hmm.getBackgroundFrequencies(); + if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo()) + { + // same dataset sequence e.g. after realigning search results + this.mapToHmmConsensus = hmm.mapToHmmConsensus; + } + else + { + // different dataset sequence e.g. after loading HMM from project + this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(), + hmm.mapToHmmConsensus.getMap()); + } } - public int getModelLength() + /** + * Returns the information content at a specified column, calculated as the + * sum (over possible symbols) of the log ratio + * + *
+   *  log(emission probability / background probability) / log(2)
+   * 
+ * + * @param column + * column position (base 0) + * @return + */ + public float getInformationContent(int column) { - return Integer.parseInt(fileProperties.get("LENG")); + float informationContent = 0f; + + for (char symbol : getSymbols().toCharArray()) + { + float freq = ResidueProperties.backgroundFrequencies + .get(getAlphabetType()).get(symbol); + float prob = (float) getMatchEmissionProbability(column, symbol); + informationContent += prob * Math.log(prob / freq); + } + + informationContent = informationContent / (float) LOG2; + + return informationContent; } - public int getMaxInstanceLength() + /** + * Gets the file header of the .hmm file this model came from + * + * @return + */ + public String getFileHeader() { - return Integer.parseInt(fileProperties.get("MAXL")); + return fileHeader; } - // gets type of symbol alphabet - "amino", "DNA", "RNA" - public String getAlphabetType() + /** + * Sets the file header of this model. + * + * @param header + */ + public void setFileHeader(String header) + { + fileHeader = header; + } + + /** + * Returns the symbols used in this hidden Markov model + * + * @return + */ + public String getSymbols() + { + return alphabet; + } + + /** + * Gets the node in the hidden Markov model at the specified position. + * + * @param nodeIndex + * The index of the node requested. Node 0 optionally contains the + * average match emission probabilities across the entire model, and + * always contains the insert emission probabilities and state + * transition probabilities for the begin node. Node 1 contains the + * first node in the HMM that can correspond to a column in the + * alignment. + * @return + */ + public HMMNode getNode(int nodeIndex) + { + return nodes.get(nodeIndex); + } + + /** + * Returns the name of the sequence alignment on which the HMM is based. + * + * @return + */ + public String getName() + { + return fileProperties.get(HMMFile.NAME); + } + + /** + * Answers the string value of the property (parsed from an HMM file) for the + * given key, or null if the property is not present + * + * @param key + * @return + */ + public String getProperty(String key) { - return fileProperties.get("ALPH"); + return fileProperties.get(key); } - // returns boolean indicating whether the reference annotation character field - // for each match state is valid or ignored - public boolean getReferenceAnnotationFlag() + /** + * Answers true if the property with the given key is present with a value of + * "yes" (not case-sensitive), else false + * + * @param key + * @return + */ + public boolean getBooleanProperty(String key) { - if (fileProperties.get("RF") == "yes") + return YES.equalsIgnoreCase(fileProperties.get(key)); + } + + /** + * Returns the length of the hidden Markov model. The value returned is the + * LENG property if specified, else the number of nodes, excluding the begin + * node (which should be the same thing). + * + * @return + */ + public int getLength() + { + if (fileProperties.get(HMMFile.LENGTH) == null) { - return true; + return nodes.size() - 1; // not counting BEGIN node } - return false; + return Integer.parseInt(fileProperties.get(HMMFile.LENGTH)); + } + + /** + * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are + * the options. Other alphabets may be added. + * + * @return + */ + public String getAlphabetType() + { + return fileProperties.get(HMMFile.ALPHABET); } - // returns boolean indicating whether the model mask annotation character - // field - // for each match state is valid or ignored - public boolean getModelMaskedFlag() + /** + * Sets the model alphabet to the symbols in the given string (ignoring any + * whitespace), and returns the number of symbols + * + * @param symbols + */ + public int setAlphabet(String symbols) { - if (fileProperties.get("MM") == "yes") + String trimmed = symbols.toUpperCase().replaceAll("\\s", ""); + int count = trimmed.length(); + alphabet = trimmed; + symbolIndexLookup = new int['Z' - 'A' + 1]; + Arrays.fill(symbolIndexLookup, -1); + int ignored = 0; + + /* + * save the symbols in order, and a quick lookup of symbol position + */ + for (short i = 0; i < count; i++) { - return true; + char symbol = trimmed.charAt(i); + if (symbol >= 'A' && symbol <= 'Z' + && symbolIndexLookup[symbol - 'A'] == -1) + { + symbolIndexLookup[symbol - 'A'] = i; + } + else + { + System.err + .println( + "Unexpected or duplicated character in HMM ALPHabet: " + + symbol); + ignored++; + } } - return false; + return count - ignored; } - // returns boolean indicating whether the consensus residue field - // for each match state is valid or ignored - public boolean getConsensusResidueAnnotationFlag() + /** + * Answers the node of the model corresponding to an aligned column position + * (0...), or null if there is no such node + * + * @param column + * @return + */ + HMMNode getNodeForColumn(int column) { - if (fileProperties.get("CONS") == "yes") + /* + * if the hmm consensus is gapped at the column, + * there is no corresponding node + */ + if (Comparison.isGap(hmmSeq.getCharAt(column))) + { + return null; + } + + /* + * find the node (if any) that is mapped to the + * consensus sequence residue position at the column + */ + int seqPos = hmmSeq.findPosition(column); + int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos); + if (nodeNo != null) { - return true; + return getNode(nodeNo[0]); } - return false; + return null; } - // returns boolean indicating whether the consensus structure character field - // for each match state is valid or ignored - public boolean getConsensusStructureAnnotationFlag() + /** + * Gets the match emission probability for a given symbol at a column in the + * alignment. + * + * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. + * @param symbol + * The symbol for which the desired probability is being requested. + * @return + * + */ + public double getMatchEmissionProbability(int alignColumn, char symbol) { - if (fileProperties.get("CS") == "yes") + HMMNode node = getNodeForColumn(alignColumn); + int symbolIndex = getSymbolIndex(symbol); + if (node != null && symbolIndex != -1) { - return true; + return node.getMatchEmission(symbolIndex); } - return false; + return 0D; } - // returns boolean indicating whether the model mask annotation character - // field - // for each match state is valid or ignored - public boolean getMapAnnotationFlag() + /** + * Gets the insert emission probability for a given symbol at a column in the + * alignment. + * + * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. + * @param symbol + * The symbol for which the desired probability is being requested. + * @return + * + */ + public double getInsertEmissionProbability(int alignColumn, char symbol) { - if (fileProperties.get("MAP") == "yes") + HMMNode node = getNodeForColumn(alignColumn); + int symbolIndex = getSymbolIndex(symbol); + if (node != null && symbolIndex != -1) { - return true; + return node.getInsertEmission(symbolIndex); } - return false; + return 0D; } + + /** + * Gets the state transition probability for a given symbol at a column in the + * alignment. + * + * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. + * @param symbol + * The symbol for which the desired probability is being requested. + * @return + * + */ + public double getStateTransitionProbability(int alignColumn, + int transition) + { + HMMNode node = getNodeForColumn(alignColumn); + if (node != null) + { + return node.getStateTransition(transition); + } + return 0D; + } + + /** + * Returns the sequence position linked to the node at the given index. This + * corresponds to an aligned column position (counting from 1). + * + * @param nodeIndex + * The index of the node, starting from index 1. Index 0 is the begin + * node, which does not correspond to a column in the alignment. + * @return + */ + public int getNodeMapPosition(int nodeIndex) + { + return nodes.get(nodeIndex).getResidueNumber(); + } + + /** + * Returns the consensus residue at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ + public char getConsensusResidue(int nodeIndex) + { + char value = nodes.get(nodeIndex).getConsensusResidue(); + return value; + } + + /** + * Returns the reference annotation at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ + public char getReferenceAnnotation(int nodeIndex) + { + char value = nodes.get(nodeIndex).getReferenceAnnotation(); + return value; + } + + /** + * Returns the mask value at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ + public char getMaskedValue(int nodeIndex) + { + char value = nodes.get(nodeIndex).getMaskValue(); + return value; + } + + /** + * Returns the consensus structure at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ + public char getConsensusStructure(int nodeIndex) + { + char value = nodes.get(nodeIndex).getConsensusStructure(); + return value; + } + + /** + * Sets a property read from an HMM file + * + * @param key + * @param value + */ + public void setProperty(String key, String value) + { + fileProperties.put(key, value); + } + + /** + * Temporary implementation, should not be used. + * + * @return + */ + public String getViterbi() + { + String value; + value = fileProperties.get(HMMFile.VITERBI); + return value; + } + + /** + * Temporary implementation, should not be used. + * + * @return + */ + public String getMSV() + { + String value; + value = fileProperties.get(HMMFile.MSV); + return value; + } + + /** + * Temporary implementation, should not be used. + * + * @return + */ + public String getForward() + { + String value; + value = fileProperties.get(HMMFile.FORWARD); + return value; + } + + /** + * Constructs the consensus sequence based on the most probable symbol at each + * position. Gap characters are inserted for discontinuities in the node map + * numbering (if provided), else an ungapped sequence is generated. + *

+ * A mapping between the HMM nodes and residue positions of the sequence is + * also built and saved. + * + * @return + */ + void buildConsensusSequence() + { + List toResidues = new ArrayList<>(); - // not sure whether to implement this - // public Date getDate() - // { + /* + * if the HMM provided a map to sequence, use those start/end values, + * else just treat it as for a contiguous sequence numbered from 1 + */ + boolean hasMap = getBooleanProperty(HMMFile.MAP); + int start = hasMap ? getNode(1).getResidueNumber() : 1; + int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber() + : (start + getLength() - 1); + char[] sequence = new char[endResNo]; - // } + int lastResNo = start - 1; + int seqOffset = -1; + int gapCount = 0; - // not sure whether to implement this - // public String getCommandLineLog() - // { - // } + for (int seqN = 0; seqN < start; seqN++) + { + sequence[seqN] = GAP_DASH; + seqOffset++; + } + + for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++) + { + HMMNode node = nodes.get(nodeNo); + final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1; + + /* + * insert gaps if map numbering is not continuous + */ + while (resNo > lastResNo + 1) + { + sequence[seqOffset++] = GAP_DASH; + lastResNo++; + gapCount++; + } + char consensusResidue = node.getConsensusResidue(); + if (GAP_DASH == consensusResidue) + { + /* + * no residue annotation in HMM - scan for the symbol + * with the highest match emission probability + */ + int symbolIndex = node.getMaxMatchEmissionIndex(); + consensusResidue = alphabet.charAt(symbolIndex); + if (node.getMatchEmission(symbolIndex) < 0.5D) + { + // follow convention of lower case if match emission prob < 0.5 + consensusResidue = Character.toLowerCase(consensusResidue); + } + } + sequence[seqOffset++] = consensusResidue; + lastResNo = resNo; + } + + Sequence seq = new Sequence(getName(), sequence, start, + lastResNo - gapCount); + seq.createDatasetSequence(); + seq.setHMM(this); + this.hmmSeq = seq; + + /* + * construct and store Mapping of nodes to residues + * note as constructed this is just an identity mapping, + * but it allows for greater flexibility in future + */ + List fromNodes = new ArrayList<>(); + fromNodes.add(new int[] { 1, getLength() }); + toResidues.add(new int[] { seq.getStart(), seq.getEnd() }); + MapList mapList = new MapList(fromNodes, toResidues, 1, 1); + mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList); + } + + + /** + * Answers the aligned consensus sequence for the profile. Note this will + * return null if called before setNodes has been called. + * + * @return + */ + public SequenceI getConsensusSequence() + { + return hmmSeq; + } + + /** + * Answers the index position (0...) of the given symbol, or -1 if not a valid + * symbol for this HMM + * + * @param symbol + * @return + */ + private int getSymbolIndex(char symbol) + { + /* + * symbolIndexLookup holds the index for 'A' to 'Z' + */ + char c = Character.toUpperCase(symbol); + if ('A' <= c && c <= 'Z') + { + return symbolIndexLookup[c - 'A']; + } + return -1; + } - // gets the number of sequences that the HMM was trained on - public int getSequenceNumber() + /** + * Sets the nodes of this HMM, and also extracts the HMM consensus sequence + * and a mapping between node numbers and sequence positions + * + * @param nodeList + */ + public void setNodes(List nodeList) { - return Integer.parseInt(fileProperties.get("NSEQ")); + nodes = nodeList; + if (nodes.size() > 1) + { + buildConsensusSequence(); + } } - // gets the effective number determined during sequence weighting - public int getEffectiveSequenceNumber() + /** + * Sets the aligned consensus sequence this HMM is the model for + * + * @param hmmSeq + */ + public void setHmmSeq(SequenceI hmmSeq) { - return Integer.parseInt(fileProperties.get("EFFN")); + this.hmmSeq = hmmSeq; } - public int getCheckSum() + public void setBackgroundFrequencies(Map bkgdFreqs) { - return Integer.parseInt(fileProperties.get("CKSUM")); + backgroundFrequencies = bkgdFreqs; } - // need to ask if BigDecimal is best decimal type for this purpose - // and how to limit number of decimals - public double getGatheringThresholdGA1() + public void setBackgroundFrequencies(ResidueCount bkgdFreqs) { - return Double.parseDouble((fileProperties.get("GA1"))); + backgroundFrequencies = new HashMap<>(); + + int total = bkgdFreqs.getTotalResidueCount(); + + for (char c : bkgdFreqs.getSymbolCounts().symbols) + { + backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total); + } + } - public void put(String key, String value) + public Map getBackgroundFrequencies() { - fileProperties.put(key, value); + return backgroundFrequencies; } } +