X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FHiddenMarkovModel.java;h=6b8b09562dcdc9011c9d910201630d068ed37c78;hb=b3eead416d4a16141910b7dae1eda4eaf2272b6a;hp=5331f3d7c5da0336db9c7258732f9331639d907b;hpb=293afc4e4e080e8d48cfa99760667b2b1dfe1da0;p=jalview.git diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index 5331f3d..6b8b095 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -1,752 +1,672 @@ package jalview.datamodel; +import jalview.io.HMMFile; +import jalview.schemes.ResidueProperties; +import jalview.util.Comparison; +import jalview.util.MapList; + import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Scanner; /** - * Data structure which stores a hidden Markov model. Currently contains file properties as well, not sure whether these should be transferred to the HMMFile class + * Data structure which stores a hidden Markov model * * @author TZVanaalten * */ public class HiddenMarkovModel { - // Stores file properties. Do not directly access this field as it contains - // only string value - use the getter methods. For example, to find the length - // of theHMM, use getModelLength()to return an int value - Map fileProperties = new HashMap<>(); - - //contains all of the symbols used in this model. The index of each symbol represents its lookup value - List symbols = new ArrayList<>(); - - // contains information for each node in the model. The begin node is at index - // 0. Node 0 contains average emission probabilities for each symbol - List nodes = new ArrayList<>(); - - // contains the HMM node for each alignment column - Map nodeLookup = new HashMap<>(); - - //contains the symbol index for each symbol - Map symbolIndexLookup = new HashMap<>(); - - - final static String YES = "yes"; - - final static String NO = "no"; - - int numberOfSymbols; - - //keys for file properties hashmap - private final String NAME = "NAME"; - - private final String ACCESSION_NUMBER = "ACC"; + private static final char GAP_DASH = '-'; - private final String DESCRIPTION = "DESC"; + public final static String YES = "yes"; - private final String LENGTH = "LENG"; + public final static String NO = "no"; - private final String MAX_LENGTH = "MAXL"; + public static final int MATCHTOMATCH = 0; - private final String ALPHABET = "ALPH"; + public static final int MATCHTOINSERT = 1; - private final String DATE = "DATE"; + public static final int MATCHTODELETE = 2; - private final String COMMAND_LOG = "COM"; + public static final int INSERTTOMATCH = 3; - private final String NUMBER_OF_SEQUENCES = "NSEQ"; + public static final int INSERTTOINSERT = 4; - private final String EFF_NUMBER_OF_SEQUENCES = "EFFN"; + public static final int DELETETOMATCH = 5; - private final String CHECK_SUM = "CKSUM"; + public static final int DELETETODELETE = 6; - private final String GATHERING_THRESHOLDS = "GA"; + private static final double LOG2 = Math.log(2); - private final String TRUSTED_CUTOFFS = "TC"; - - private final String NOISE_CUTOFFS = "NC"; - - private final String STATISTICS = "STATS"; + /* + * properties read from HMM file header lines + */ + private Map fileProperties = new HashMap<>(); - private final String COMPO = "COMPO"; + private String fileHeader; - private final String GATHERING_THRESHOLD = "GA"; - - private final String TRUSTED_CUTOFF = "TC"; - - private final String NOISE_CUTOFF = "NC"; - - private final String VITERBI = "VITERBI"; - - private final String MSV = "MSV"; - - private final String FORWARD = "FORWARD"; - - private final String MAP = "MAP"; + /* + * the symbols used in this model e.g. "ACGT" + */ + private String alphabet; - private final String REFERENCE_ANNOTATION = "RF"; + /* + * symbol lookup index into the alphabet for 'A' to 'Z' + */ + private int[] symbolIndexLookup = new int['Z' - 'A' + 1]; - private final String CONSENSUS_RESIDUE = "CONS"; + /* + * Nodes in the model. The begin node is at index 0, and contains + * average emission probabilities for each symbol. + */ + private List nodes = new ArrayList<>(); - private final String CONSENSUS_STRUCTURE = "CS"; + /* + * the aligned HMM consensus sequence extracted from the HMM profile + */ + private SequenceI hmmSeq; - private final String MASKED_VALUE = "MM"; - - final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i", - "m->d", "i->m", "i->i", "d->m", "d->d" }; + /* + * mapping from HMM nodes to residues of the hmm consensus sequence + */ + private Mapping mapToHmmConsensus; - public String getTransitionType(int index) - { - return TRANSITION_TYPES[index]; - } + // stores background frequencies of alignment from which this model came + private Map backgroundFrequencies; - public Map getNodeLookup() + /** + * Constructor + */ + public HiddenMarkovModel() { - return nodeLookup; } - public void setNodeLookup(Map nodeLookup) + /** + * Copy constructor given a new aligned sequence with which to associate the + * HMM profile + * + * @param hmm + * @param sq + */ + public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq) { - this.nodeLookup = nodeLookup; + super(); + this.fileProperties = new HashMap<>(hmm.fileProperties); + this.alphabet = hmm.alphabet; + this.nodes = new ArrayList<>(hmm.nodes); + this.symbolIndexLookup = hmm.symbolIndexLookup; + this.fileHeader = new String(hmm.fileHeader); + this.hmmSeq = sq; + this.backgroundFrequencies = hmm.getBackgroundFrequencies(); + if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo()) + { + // same dataset sequence e.g. after realigning search results + this.mapToHmmConsensus = hmm.mapToHmmConsensus; + } + else + { + // different dataset sequence e.g. after loading HMM from project + this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(), + hmm.mapToHmmConsensus.getMap()); + } } - public String[] getTransitionTypes() + /** + * Returns the information content at a specified column, calculated as the + * sum (over possible symbols) of the log ratio + * + *
+   *  log(emission probability / background probability) / log(2)
+   * 
+ * + * @param column + * column position (base 0) + * @return + */ + public float getInformationContent(int column) { - return TRANSITION_TYPES; - } + float informationContent = 0f; - public List getSymbols() - { - return symbols; - } + for (char symbol : getSymbols().toCharArray()) + { + float freq = ResidueProperties.backgroundFrequencies + .get(getAlphabetType()).get(symbol); + float prob = (float) getMatchEmissionProbability(column, symbol); + informationContent += prob * Math.log(prob / freq); + } - public Map getFileProperties() - { - return fileProperties; - } + informationContent = informationContent / (float) LOG2; - public HMMNode getNode(int nodeIndex) - { - return getNodes().get(nodeIndex); + return informationContent; } - public void setSymbols(List symbolsL) + /** + * Gets the file header of the .hmm file this model came from + * + * @return + */ + public String getFileHeader() { - this.symbols = symbolsL; + return fileHeader; } - public String getName() - { - return fileProperties.get(NAME); - } - public String getAccessionNumber() + /** + * Sets the file header of this model. + * + * @param header + */ + public void setFileHeader(String header) { - return fileProperties.get(ACCESSION_NUMBER); + fileHeader = header; } - public void setAccessionNumber(String value) + /** + * Returns the symbols used in this hidden Markov model + * + * @return + */ + public String getSymbols() { - fileProperties.put(ACCESSION_NUMBER, value); + return alphabet; } - - public String getDescription() + + /** + * Gets the node in the hidden Markov model at the specified position. + * + * @param nodeIndex + * The index of the node requested. Node 0 optionally contains the + * average match emission probabilities across the entire model, and + * always contains the insert emission probabilities and state + * transition probabilities for the begin node. Node 1 contains the + * first node in the HMM that can correspond to a column in the + * alignment. + * @return + */ + public HMMNode getNode(int nodeIndex) { - return fileProperties.get(DESCRIPTION); + return nodes.get(nodeIndex); } - public void setDescription(String value) + /** + * Returns the name of the sequence alignment on which the HMM is based. + * + * @return + */ + public String getName() { - fileProperties.put(DESCRIPTION, value); + return fileProperties.get(HMMFile.NAME); } - - public Integer getLength() + + /** + * Answers the string value of the property (parsed from an HMM file) for the + * given key, or null if the property is not present + * + * @param key + * @return + */ + public String getProperty(String key) { - if (fileProperties.get(LENGTH) == null) - { - return null; - } - return Integer.parseInt(fileProperties.get(LENGTH)); + return fileProperties.get(key); } - public void setLength(int value) + /** + * Answers true if the property with the given key is present with a value of + * "yes" (not case-sensitive), else false + * + * @param key + * @return + */ + public boolean getBooleanProperty(String key) { - fileProperties.put(LENGTH, String.valueOf(value)); + return YES.equalsIgnoreCase(fileProperties.get(key)); } - public Integer getMaxInstanceLength() + /** + * Returns the length of the hidden Markov model. The value returned is the + * LENG property if specified, else the number of nodes, excluding the begin + * node (which should be the same thing). + * + * @return + */ + public int getLength() { - if (fileProperties.get(MAX_LENGTH) == null) + if (fileProperties.get(HMMFile.LENGTH) == null) { - return null; + return nodes.size() - 1; // not counting BEGIN node } - return Integer.parseInt(fileProperties.get(MAX_LENGTH)); + return Integer.parseInt(fileProperties.get(HMMFile.LENGTH)); } - public void setMaxInstanceLength(int value) - { - fileProperties.put(MAX_LENGTH, String.valueOf(value)); - } - - // gets type of symbol alphabet - "amino", "DNA", "RNA" + /** + * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are + * the options. Other alphabets may be added. + * + * @return + */ public String getAlphabetType() { - return fileProperties.get(ALPHABET); - } - - public void setAlphabetType(String value) - { - fileProperties.put(ALPHABET, value); - } - - // not sure whether to implement this with Date object - public String getDate() - { - return fileProperties.get(DATE); - } - - public void setDate(String value) - { - fileProperties.put(DATE, value); - } - - // not sure whether to implement this - public String getCommandLineLog() - { - return fileProperties.get(COMMAND_LOG); - } - - public void setCommandLineLog(String value) - { - fileProperties.put(COMMAND_LOG, value); + return fileProperties.get(HMMFile.ALPHABET); } - // gets the number of sequences that the HMM was trained on - public Integer getNumberOfSequences() - { - if (fileProperties.get(NUMBER_OF_SEQUENCES) == null) - { - return null; - } - return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES)); - } - - public void setNumberOfSequences(int value) - { - fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value)); + /** + * Sets the model alphabet to the symbols in the given string (ignoring any + * whitespace), and returns the number of symbols + * + * @param symbols + */ + public int setAlphabet(String symbols) + { + String trimmed = symbols.toUpperCase().replaceAll("\\s", ""); + int count = trimmed.length(); + alphabet = trimmed; + symbolIndexLookup = new int['Z' - 'A' + 1]; + Arrays.fill(symbolIndexLookup, -1); + int ignored = 0; + + /* + * save the symbols in order, and a quick lookup of symbol position + */ + for (short i = 0; i < count; i++) + { + char symbol = trimmed.charAt(i); + if (symbol >= 'A' && symbol <= 'Z' + && symbolIndexLookup[symbol - 'A'] == -1) + { + symbolIndexLookup[symbol - 'A'] = i; + } + else + { + System.err + .println( + "Unexpected or duplicated character in HMM ALPHabet: " + + symbol); + ignored++; + } + } + return count - ignored; } - // gets the effective number determined during sequence weighting - public Double getEffectiveNumberOfSequences() + /** + * Answers the node of the model corresponding to an aligned column position + * (0...), or null if there is no such node + * + * @param column + * @return + */ + HMMNode getNodeForColumn(int column) { - if (fileProperties.get(LENGTH) == null) + /* + * if the hmm consensus is gapped at the column, + * there is no corresponding node + */ + if (Comparison.isGap(hmmSeq.getCharAt(column))) { return null; } - return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES)); - } - public void setEffectiveNumberOfSequences(double value) - { - fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value)); - } - - public Long getCheckSum() - { - if (fileProperties.get(LENGTH) == null) + /* + * find the node (if any) that is mapped to the + * consensus sequence residue position at the column + */ + int seqPos = hmmSeq.findPosition(column); + int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos); + if (nodeNo != null) { - return null; + return getNode(nodeNo[0]); } - return Long.parseLong(fileProperties.get(CHECK_SUM)); - } - - public void setCheckSum(long value) - { - fileProperties.put(CHECK_SUM, String.valueOf(value)); - } - - public List getNodes() - { - return nodes; + return null; } - public void setNodes(List nodes) - { - this.nodes = nodes; - } - /** - * get match emission probability for a given symbol at a column in the - * alignment + * Gets the match emission probability for a given symbol at a column in the + * alignment. * * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. * @param symbol + * The symbol for which the desired probability is being requested. * @return * */ - public Double getMatchEmissionProbability(int alignColumn, char symbol) + public double getMatchEmissionProbability(int alignColumn, char symbol) { - int symbolIndex; - int nodeIndex; - Double probability; - if (!symbolIndexLookup.containsKey(symbol)) - { - return 0d; - } - symbolIndex = symbolIndexLookup.get(symbol); - if (nodeLookup.containsKey(alignColumn + 1)) + HMMNode node = getNodeForColumn(alignColumn); + int symbolIndex = getSymbolIndex(symbol); + if (node != null && symbolIndex != -1) { - nodeIndex = nodeLookup.get(alignColumn + 1); - probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex); - probability = Math.pow(Math.E, -probability); - return probability; + return node.getMatchEmission(symbolIndex); } - else - { - return 0d; - } - + return 0D; } /** - * get insert emission probability for a given symbol at a column in the - * alignment + * Gets the insert emission probability for a given symbol at a column in the + * alignment. * * @param alignColumn + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. * @param symbol + * The symbol for which the desired probability is being requested. * @return + * */ - public Double getInsertEmissionProbability(int alignColumn, char symbol) + public double getInsertEmissionProbability(int alignColumn, char symbol) { - int symbolIndex; - int nodeIndex; - Double probability; - if (!symbolIndexLookup.containsKey(symbol)) + HMMNode node = getNodeForColumn(alignColumn); + int symbolIndex = getSymbolIndex(symbol); + if (node != null && symbolIndex != -1) { - return 0d; + return node.getInsertEmission(symbolIndex); } - symbolIndex = symbolIndexLookup.get(symbol); - if (nodeLookup.containsKey(alignColumn + 1)) - { - nodeIndex = nodeLookup.get(alignColumn + 1); - probability = getNode(nodeIndex).getInsertEmissions() - .get(symbolIndex); - probability = Math.pow(Math.E, -probability); - return probability; - } - else - { - return 0d; - } - + return 0D; } /** - * get state transition probability for a given transition type at a column in - * the alignment + * Gets the state transition probability for a given symbol at a column in the + * alignment. * * @param alignColumn - * @param transition + * The index of the alignment column, starting at index 0. Index 0 + * usually corresponds to index 1 in the HMM. + * @param symbol + * The symbol for which the desired probability is being requested. * @return + * */ - public Double getStateTransitionProbability(int alignColumn, - String transition) + public double getStateTransitionProbability(int alignColumn, + int transition) { - int transitionIndex; - int nodeIndex; - Double probability; - transitionIndex = getTransitionType(transition); - if (nodeLookup.containsKey(alignColumn + 1)) + HMMNode node = getNodeForColumn(alignColumn); + if (node != null) { - nodeIndex = nodeLookup.get(alignColumn + 1); - probability = getNode(nodeIndex).getStateTransitions() - .get(transitionIndex); - probability = Math.pow(Math.E, -probability); - return probability; + return node.getStateTransition(transition); } - else - { - return 0d; - } - + return 0D; } - public Integer getNodeAlignmentColumn(int nodeIndex) + /** + * Returns the sequence position linked to the node at the given index. This + * corresponds to an aligned column position (counting from 1). + * + * @param nodeIndex + * The index of the node, starting from index 1. Index 0 is the begin + * node, which does not correspond to a column in the alignment. + * @return + */ + public int getNodeMapPosition(int nodeIndex) { - Integer value = nodes.get(nodeIndex).getAlignmentColumn(); - return value; + return nodes.get(nodeIndex).getResidueNumber(); } + /** + * Returns the consensus residue at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getConsensusResidue(int nodeIndex) { char value = nodes.get(nodeIndex).getConsensusResidue(); return value; } + /** + * Returns the reference annotation at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getReferenceAnnotation(int nodeIndex) { char value = nodes.get(nodeIndex).getReferenceAnnotation(); return value; } + /** + * Returns the mask value at the specified node. + * + * @param nodeIndex + * The index of the specified node. + * @return + */ public char getMaskedValue(int nodeIndex) { char value = nodes.get(nodeIndex).getMaskValue(); return value; } - public char getConsensusStructure(int nodeIndex) - { - char value = nodes.get(nodeIndex).getConsensusStructure(); - return value; - } - /** - * returns the average match emission for a given symbol - * @param symbolIndex - * index of symbol + * Returns the consensus structure at the specified node. + * + * @param nodeIndex + * The index of the specified node. * @return - * average negative log propbability of a match emission of the given symbol */ - public double getAverageMatchEmission(int symbolIndex) - { - double value = nodes.get(0).getMatchEmissions().get(symbolIndex); - return value; - } - - public int getNumberOfSymbols() - { - return numberOfSymbols; - } - - public void setNumberOfSymbols(int numberOfSymbols) + public char getConsensusStructure(int nodeIndex) { - this.numberOfSymbols = numberOfSymbols; + char value = nodes.get(nodeIndex).getConsensusStructure(); + return value; } - - - /** - * fills symbol array and also finds numberOfSymbols - * - * @param parser - * scanner scanning symbol line in file - */ - public void fillSymbols(Scanner parser) - { - int i = 0; - while (parser.hasNext()) - { - String strSymbol = parser.next(); - char[] symbol = strSymbol.toCharArray(); - symbols.add(symbol[0]); - symbolIndexLookup.put(symbol[0], i); - i++; - } - numberOfSymbols = symbols.size(); - } - /** - * adds file property + * Sets a property read from an HMM file * * @param key * @param value */ - public void addFileProperty(String key, String value) + public void setProperty(String key, String value) { fileProperties.put(key, value); } - public boolean referenceAnnotationIsActive() - { - String status; - status = fileProperties.get(REFERENCE_ANNOTATION); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - public boolean maskValueIsActive() - { - String status; - status = fileProperties.get(MASKED_VALUE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - public boolean consensusResidueIsActive() - { - String status; - status = fileProperties.get(CONSENSUS_RESIDUE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - public boolean consensusStructureIsActive() - { - String status; - status = fileProperties.get(CONSENSUS_STRUCTURE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - public boolean mapIsActive() - { - String status; - status = fileProperties.get(MAP); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - public void setAlignmentColumn(int nodeIndex, int column) - { - nodes.get(nodeIndex).setAlignmentColumn(column); - } - - public void setReferenceAnnotation(int nodeIndex, char value) - { - nodes.get(nodeIndex).setReferenceAnnotation(value); - } - - public void setConsensusResidue(int nodeIndex, char value) - { - nodes.get(nodeIndex).setConsensusResidue(value); - } - - public void setConsensusStructure(int nodeIndex, char value) - { - nodes.get(nodeIndex).setConsensusStructure(value); - } - - public void setMaskValue(int nodeIndex, char value) - { - nodes.get(nodeIndex).setMaskValue(value); - } - - public String getGatheringThreshold() - { - String value; - value = fileProperties.get("GA"); - return value; - } - - public String getNoiseCutoff() - { - String value; - value = fileProperties.get("NC"); - return value; - } - - public String getTrustedCutoff() - { - String value; - value = fileProperties.get("TC"); - return value; - } - + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getViterbi() { String value; - value = fileProperties.get(VITERBI); + value = fileProperties.get(HMMFile.VITERBI); return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getMSV() { String value; - value = fileProperties.get(MSV); + value = fileProperties.get(HMMFile.MSV); return value; } + /** + * Temporary implementation, should not be used. + * + * @return + */ public String getForward() { String value; - value = fileProperties.get(FORWARD); + value = fileProperties.get(HMMFile.FORWARD); return value; } - public void setMAPStatus(boolean status) - { - if (status == true) - { - fileProperties.put(MAP, YES); - } - else - { - fileProperties.put(MAP, NO); - } - } - - public void setReferenceAnnotationStatus(boolean status) - { - if (status == true) - { - fileProperties.put(REFERENCE_ANNOTATION, YES); - } - else - { - fileProperties.put(REFERENCE_ANNOTATION, NO); - } + /** + * Constructs the consensus sequence based on the most probable symbol at each + * position. Gap characters are inserted for discontinuities in the node map + * numbering (if provided), else an ungapped sequence is generated. + *

+ * A mapping between the HMM nodes and residue positions of the sequence is + * also built and saved. + * + * @return + */ + void buildConsensusSequence() + { + List toResidues = new ArrayList<>(); + + /* + * if the HMM provided a map to sequence, use those start/end values, + * else just treat it as for a contiguous sequence numbered from 1 + */ + boolean hasMap = getBooleanProperty(HMMFile.MAP); + int start = hasMap ? getNode(1).getResidueNumber() : 1; + int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber() + : (start + getLength() - 1); + char[] sequence = new char[endResNo]; + + int lastResNo = start - 1; + int seqOffset = -1; + int gapCount = 0; + + + for (int seqN = 0; seqN < start; seqN++) + { + sequence[seqN] = GAP_DASH; + seqOffset++; + } + + for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++) + { + HMMNode node = nodes.get(nodeNo); + final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1; + + /* + * insert gaps if map numbering is not continuous + */ + while (resNo > lastResNo + 1) + { + sequence[seqOffset++] = GAP_DASH; + lastResNo++; + gapCount++; + } + char consensusResidue = node.getConsensusResidue(); + if (GAP_DASH == consensusResidue) + { + /* + * no residue annotation in HMM - scan for the symbol + * with the highest match emission probability + */ + int symbolIndex = node.getMaxMatchEmissionIndex(); + consensusResidue = alphabet.charAt(symbolIndex); + if (node.getMatchEmission(symbolIndex) < 0.5D) + { + // follow convention of lower case if match emission prob < 0.5 + consensusResidue = Character.toLowerCase(consensusResidue); + } + } + sequence[seqOffset++] = consensusResidue; + lastResNo = resNo; + } + + Sequence seq = new Sequence(getName(), sequence, start, + lastResNo - gapCount); + seq.createDatasetSequence(); + seq.setHMM(this); + this.hmmSeq = seq; + + /* + * construct and store Mapping of nodes to residues + * note as constructed this is just an identity mapping, + * but it allows for greater flexibility in future + */ + List fromNodes = new ArrayList<>(); + fromNodes.add(new int[] { 1, getLength() }); + toResidues.add(new int[] { seq.getStart(), seq.getEnd() }); + MapList mapList = new MapList(fromNodes, toResidues, 1, 1); + mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList); } - public void setMaskedValueStatus(boolean status) - { - if (status == true) - { - fileProperties.put(MASKED_VALUE, YES); - } - else - { - fileProperties.put(MASKED_VALUE, NO); - } - } - public void setConsensusResidueStatus(boolean status) + /** + * Answers the aligned consensus sequence for the profile. Note this will + * return null if called before setNodes has been called. + * + * @return + */ + public SequenceI getConsensusSequence() { - if (status == true) - { - fileProperties.put(CONSENSUS_RESIDUE, YES); - } - else - { - fileProperties.put(CONSENSUS_RESIDUE, NO); - } + return hmmSeq; } - public void setConsensusStructureStatus(boolean status) + /** + * Answers the index position (0...) of the given symbol, or -1 if not a valid + * symbol for this HMM + * + * @param symbol + * @return + */ + private int getSymbolIndex(char symbol) { - if (status == true) + /* + * symbolIndexLookup holds the index for 'A' to 'Z' + */ + char c = Character.toUpperCase(symbol); + if ('A' <= c && c <= 'Z') { - fileProperties.put(CONSENSUS_STRUCTURE, YES); - } - else - { - fileProperties.put(CONSENSUS_STRUCTURE, NO); + return symbolIndexLookup[c - 'A']; } + return -1; } /** + * Sets the nodes of this HMM, and also extracts the HMM consensus sequence + * and a mapping between node numbers and sequence positions * - * @param transition - * type of transition occuring - * @return index value representing position along stateTransition array. + * @param nodeList */ - public Integer getTransitionType(String transition) + public void setNodes(List nodeList) { - Integer index; - switch (transition) + nodes = nodeList; + if (nodes.size() > 1) { - case "mm": - index = 0; - break; - case "mi": - index = 1; - break; - case "md": - index = 2; - break; - case "im": - index = 3; - break; - case "ii": - index = 4; - break; - case "dm": - index = 5; - break; - case "dd": - index = 6; - break; - default: - index = null; + buildConsensusSequence(); } - return index; } /** - * find the index of the node in a hidden Markov model based on the column in - * the alignment + * Sets the aligned consensus sequence this HMM is the model for * - * @param alignmentColumn + * @param hmmSeq */ + public void setHmmSeq(SequenceI hmmSeq) + { + this.hmmSeq = hmmSeq; + } - public Integer findNodeIndex(int alignmentColumn) + public void setBackgroundFrequencies(Map bkgdFreqs) { - Integer index; - index = nodeLookup.get(alignmentColumn); - return index; + backgroundFrequencies = bkgdFreqs; } - public static String findStringFromBoolean(boolean value) + public void setBackgroundFrequencies(ResidueCount bkgdFreqs) { - if (value) - { - return YES; - } - else + backgroundFrequencies = new HashMap<>(); + + int total = bkgdFreqs.getTotalResidueCount(); + + for (char c : bkgdFreqs.getSymbolCounts().symbols) { - return NO; + backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total); } + } + + public Map getBackgroundFrequencies() + { + return backgroundFrequencies; + } + }