X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fjalview%2Fdatamodel%2FHiddenMarkovModel.java;h=e3eb160f38fc5d76247efe6d8506716a14851d1e;hb=4edae2ab642aedc93fd044d901df988e8c2bc83a;hp=c96ad8b1b5d13f590a5da953fcce2deaed5f6613;hpb=b48749857263e9c85c93fce54f764453fbce9696;p=jalview.git diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index c96ad8b..e3eb160 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -1,106 +1,30 @@ package jalview.datamodel; -import jalview.gui.AlignFrame; +import jalview.io.HMMFile; +import jalview.schemes.ResidueProperties; +import jalview.util.Comparison; +import jalview.util.MapList; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Scanner; /** - * Data structure which stores a hidden Markov model. Currently contains file - * properties as well, not sure whether these should be transferred to the - * HMMFile class + * Data structure which stores a hidden Markov model * * @author TZVanaalten * */ public class HiddenMarkovModel { + private static final char GAP_DASH = '-'; + public final static String YES = "yes"; - // Stores file properties. Do not directly access this field as it contains - // only string value - use the getter methods. For example, to find the length - // of theHMM, use getModelLength()to return an int value - Map fileProperties = new HashMap<>(); - - // contains all of the symbols used in this model. The index of each symbol - // represents its lookup value - List symbols = new ArrayList<>(); - - // contains information for each node in the model. The begin node is at index - // 0. 
Node 0 contains average emission probabilities for each symbol - List nodes = new ArrayList<>(); - - // contains the HMM node for each alignment column, alignment columns start at - // index 0; - Map nodeLookup = new HashMap<>(); - - // contains the symbol index for each symbol - Map symbolIndexLookup = new HashMap<>(); - - final static String YES = "yes"; - - final static String NO = "no"; - - int numberOfSymbols; - - // keys for file properties hashmap - private final String NAME = "NAME"; - - private final String ACCESSION_NUMBER = "ACC"; - - private final String DESCRIPTION = "DESC"; - - private final String LENGTH = "LENG"; - - private final String MAX_LENGTH = "MAXL"; - - private final String ALPHABET = "ALPH"; - - private final String DATE = "DATE"; - - private final String COMMAND_LOG = "COM"; - - private final String NUMBER_OF_SEQUENCES = "NSEQ"; - - private final String EFF_NUMBER_OF_SEQUENCES = "EFFN"; - - private final String CHECK_SUM = "CKSUM"; - - private final String GATHERING_THRESHOLDS = "GA"; - - private final String TRUSTED_CUTOFFS = "TC"; - - private final String NOISE_CUTOFFS = "NC"; - - private final String STATISTICS = "STATS"; - - private final String COMPO = "COMPO"; - - private final String GATHERING_THRESHOLD = "GA"; - - private final String TRUSTED_CUTOFF = "TC"; - - private final String NOISE_CUTOFF = "NC"; - - private final String VITERBI = "VITERBI"; - - private final String MSV = "MSV"; + public final static String NO = "no"; - private final String FORWARD = "FORWARD"; - - private final String MAP = "MAP"; - - private final String REFERENCE_ANNOTATION = "RF"; - - private final String CONSENSUS_RESIDUE = "CONS"; - - private final String CONSENSUS_STRUCTURE = "CS"; - - private final String MASKED_VALUE = "MM"; - public static final int MATCHTOMATCH = 0; public static final int MATCHTOINSERT = 1; @@ -115,78 +39,136 @@ public class HiddenMarkovModel public static final int DELETETODELETE = 6; - String fileHeader; + private static final 
double LOG2 = Math.log(2); + + /* + * properties read from HMM file header lines + */ + private Map fileProperties = new HashMap<>(); + + private String fileHeader; + + /* + * the symbols used in this model e.g. "ACGT" + */ + private String alphabet; + /* + * symbol lookup index into the alphabet for 'A' to 'Z' + */ + private int[] symbolIndexLookup = new int['Z' - 'A' + 1]; + + /* + * Nodes in the model. The begin node is at index 0, and contains + * average emission probabilities for each symbol. + */ + private List nodes = new ArrayList<>(); + + /* + * the aligned HMM consensus sequence extracted from the HMM profile + */ + private SequenceI hmmSeq; + + /* + * mapping from HMM nodes to residues of the hmm consensus sequence + */ + private Mapping mapToHmmConsensus; + + /** + * Constructor + */ public HiddenMarkovModel() { - } - public HiddenMarkovModel(HiddenMarkovModel hmm) + /** + * Copy constructor given a new aligned sequence with which to associate the + * HMM profile + * + * @param hmm + * @param sq + */ + public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq) { super(); this.fileProperties = new HashMap<>(hmm.fileProperties); - this.symbols = new ArrayList<>(hmm.symbols); + this.alphabet = hmm.alphabet; this.nodes = new ArrayList<>(hmm.nodes); - this.nodeLookup = new HashMap<>(hmm.nodeLookup); - this.symbolIndexLookup = new HashMap<>( - hmm.symbolIndexLookup); - this.numberOfSymbols = hmm.numberOfSymbols; + this.symbolIndexLookup = hmm.symbolIndexLookup; this.fileHeader = new String(hmm.fileHeader); + this.hmmSeq = sq; + if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo()) + { + // same dataset sequence e.g. after realigning search results + this.mapToHmmConsensus = hmm.mapToHmmConsensus; + } + else + { + // different dataset sequence e.g. after loading HMM from project + this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(), + hmm.mapToHmmConsensus.getMap()); + } } /** - * Gets the file header of the .hmm file this model came from. 
+ * Returns the information content at a specified column, calculated as the + * sum (over possible symbols) of the log ratio + * + *
+   *  emission probability * log(emission probability / background probability) / log(2)
+   * 
* + * @param column + * column position (base 0) * @return */ - public String getFileHeader() + public float getInformationContent(int column) { - return fileHeader; - } + float informationContent = 0f; - /** - * Sets the file header of this model. - * - * @param header - */ - public void setFileHeader(String header) - { - fileHeader = header; + for (char symbol : getSymbols().toCharArray()) + { + float freq = ResidueProperties.backgroundFrequencies + .get(getAlphabetType()).get(symbol); + float prob = (float) getMatchEmissionProbability(column, symbol); + informationContent += prob * Math.log(prob / freq); + } + + informationContent = informationContent / (float) LOG2; + + return informationContent; } /** - * Returns the map containing the matches between nodes and alignment column - * indexes. + * Gets the file header of the .hmm file this model came from * * @return - * */ - public Map getNodeLookup() + public String getFileHeader() { - return nodeLookup; + return fileHeader; } /** - * Returns the list of symbols used in this hidden Markov model. + * Sets the file header of this model. * - * @return + * @param header */ - public List getSymbols() + public void setFileHeader(String header) { - return symbols; + fileHeader = header; } - + /** - * Returns the file properties. + * Returns the symbols used in this hidden Markov model * * @return */ - public Map getFileProperties() + public String getSymbols() { - return fileProperties; + return alphabet; } - + /** * Gets the node in the hidden Markov model at the specified position. * @@ -201,20 +183,7 @@ public class HiddenMarkovModel */ public HMMNode getNode(int nodeIndex) { - return getNodes().get(nodeIndex); - } - - /** - * Sets the list of symbols used in the hidden Markov model to the list - * specified. - * - * @param symbolsL - * The list of symbols to which the current list is to be changed. 
- * - */ - public void setSymbols(List symbolsL) - { - this.symbols = symbolsL; + return nodes.get(nodeIndex); } /** @@ -224,153 +193,130 @@ public class HiddenMarkovModel */ public String getName() { - return fileProperties.get(NAME); + return fileProperties.get(HMMFile.NAME); } /** - * Returns the accession number. - * @return - */ - public String getAccessionNumber() - { - return fileProperties.get(ACCESSION_NUMBER); - } - - /** - * Returns a description of the sequence alignment on which the hidden Markov - * model is based. + * Answers the string value of the property (parsed from an HMM file) for the + * given key, or null if the property is not present * + * @param key * @return */ - public String getDescription() + public String getProperty(String key) { - return fileProperties.get(DESCRIPTION); + return fileProperties.get(key); } /** - * Returns the length of the hidden Markov model. + * Answers true if the property with the given key is present with a value of + * "yes" (not case-sensitive), else false * + * @param key * @return */ - public Integer getLength() + public boolean getBooleanProperty(String key) { - if (fileProperties.get(LENGTH) == null) - { - return null; - } - return Integer.parseInt(fileProperties.get(LENGTH)); + return YES.equalsIgnoreCase(fileProperties.get(key)); } /** - * Returns the max instance length within the hidden Markov model. + * Returns the length of the hidden Markov model. The value returned is the + * LENG property if specified, else the number of nodes, excluding the begin + * node (which should be the same thing). 
* * @return */ - public Integer getMaxInstanceLength() + public int getLength() { - if (fileProperties.get(MAX_LENGTH) == null) + if (fileProperties.get(HMMFile.LENGTH) == null) { - return null; + return nodes.size() - 1; // not counting BEGIN node } - return Integer.parseInt(fileProperties.get(MAX_LENGTH)); + return Integer.parseInt(fileProperties.get(HMMFile.LENGTH)); } /** - * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the - * options. Other alphabets may be added. + * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are + * the options. Other alphabets may be added. * * @return */ public String getAlphabetType() { - return fileProperties.get(ALPHABET); - } - - /** - * Returns the date as a String. - * - * @return - */ - public String getDate() - { - return fileProperties.get(DATE); + return fileProperties.get(HMMFile.ALPHABET); } /** - * Returns the command line log. + * Sets the model alphabet to the symbols in the given string (ignoring any + * whitespace), and returns the number of symbols * - * @return + * @param symbols */ - public String getCommandLineLog() + public int setAlphabet(String symbols) { - return fileProperties.get(COMMAND_LOG); - } + String trimmed = symbols.toUpperCase().replaceAll("\\s", ""); + int count = trimmed.length(); + alphabet = trimmed; + symbolIndexLookup = new int['Z' - 'A' + 1]; + Arrays.fill(symbolIndexLookup, -1); + int ignored = 0; - /** - * Returns the number of sequences on which the HMM was trained. 
- * - * @return - */ - public Integer getNumberOfSequences() - { - if (fileProperties.get(NUMBER_OF_SEQUENCES) == null) + /* + * save the symbols in order, and a quick lookup of symbol position + */ + for (short i = 0; i < count; i++) { - return null; + char symbol = trimmed.charAt(i); + if (symbol >= 'A' && symbol <= 'Z' + && symbolIndexLookup[symbol - 'A'] == -1) + { + symbolIndexLookup[symbol - 'A'] = i; + } + else + { + System.err + .println( + "Unexpected or duplicated character in HMM ALPHabet: " + + symbol); + ignored++; + } } - return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES)); + return count - ignored; } /** - * Returns the effective number of sequences on which the HMM was based. + * Answers the node of the model corresponding to an aligned column position + * (0...), or null if there is no such node * - * @param value + * @param column + * @return */ - public Double getEffectiveNumberOfSequences() + HMMNode getNodeForColumn(int column) { - if (fileProperties.get(LENGTH) == null) + /* + * if the hmm consensus is gapped at the column, + * there is no corresponding node + */ + if (Comparison.isGap(hmmSeq.getCharAt(column))) { return null; } - return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES)); - } - /** - * Returns the checksum. - * - * @return - */ - public Long getCheckSum() - { - if (fileProperties.get(LENGTH) == null) + /* + * find the node (if any) that is mapped to the + * consensus sequence residue position at the column + */ + int seqPos = hmmSeq.findPosition(column); + int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos); + if (nodeNo != null) { - return null; + return getNode(nodeNo[0]); } - return Long.parseLong(fileProperties.get(CHECK_SUM)); + return null; } /** - * Returns the list of nodes in this HMM. - * - * @return - */ - public List getNodes() - { - return nodes; - } - - /** - * Sets the list of nodes in this HMM to the given list. 
- * - * @param nodes - * The list of nodes to which the current list of nodes is being - * changed. - */ - public void setNodes(List nodes) - { - this.nodes = nodes; - } - - /** * Gets the match emission probability for a given symbol at a column in the * alignment. * @@ -382,27 +328,15 @@ public class HiddenMarkovModel * @return * */ - public Double getMatchEmissionProbability(int alignColumn, char symbol) + public double getMatchEmissionProbability(int alignColumn, char symbol) { - int symbolIndex; - int nodeIndex; - Double probability; - if (!symbolIndexLookup.containsKey(symbol)) - { - return 0d; - } - symbolIndex = symbolIndexLookup.get(symbol); - if (nodeLookup.containsKey(alignColumn)) + HMMNode node = getNodeForColumn(alignColumn); + int symbolIndex = getSymbolIndex(symbol); + if (node != null && symbolIndex != -1) { - nodeIndex = nodeLookup.get(alignColumn); - probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex); - return probability; + return node.getMatchEmission(symbolIndex); } - else - { - return 0d; - } - + return 0D; } /** @@ -417,28 +351,15 @@ public class HiddenMarkovModel * @return * */ - public Double getInsertEmissionProbability(int alignColumn, char symbol) + public double getInsertEmissionProbability(int alignColumn, char symbol) { - int symbolIndex; - int nodeIndex; - Double probability; - if (!symbolIndexLookup.containsKey(symbol)) + HMMNode node = getNodeForColumn(alignColumn); + int symbolIndex = getSymbolIndex(symbol); + if (node != null && symbolIndex != -1) { - return 0d; + return node.getInsertEmission(symbolIndex); } - symbolIndex = symbolIndexLookup.get(symbol); - if (nodeLookup.containsKey(alignColumn)) - { - nodeIndex = nodeLookup.get(alignColumn); - probability = getNode(nodeIndex).getInsertEmissions() - .get(symbolIndex); - return probability; - } - else - { - return 0d; - } - + return 0D; } /** @@ -453,38 +374,29 @@ public class HiddenMarkovModel * @return * */ - public Double getStateTransitionProbability(int 
alignColumn, + public double getStateTransitionProbability(int alignColumn, int transition) { - int transitionIndex; - int nodeIndex; - Double probability; - if (nodeLookup.containsKey(alignColumn)) - { - nodeIndex = nodeLookup.get(alignColumn); - probability = getNode(nodeIndex).getStateTransitions() - .get(transition); - return probability; - } - else + HMMNode node = getNodeForColumn(alignColumn); + if (node != null) { - return 0d; + return node.getStateTransition(transition); } - + return 0D; } /** - * Returns the alignment column linked to the node at the given index. + * Returns the sequence position linked to the node at the given index. This + * corresponds to an aligned column position (counting from 1). * * @param nodeIndex * The index of the node, starting from index 1. Index 0 is the begin * node, which does not correspond to a column in the alignment. * @return */ - public Integer getNodeAlignmentColumn(int nodeIndex) + public int getNodeMapPosition(int nodeIndex) { - Integer value = nodes.get(nodeIndex).getAlignmentColumn(); - return value; + return nodes.get(nodeIndex).getResidueNumber(); } /** @@ -501,45 +413,6 @@ public class HiddenMarkovModel } /** - * Returns the consensus at a given alignment column. - * - * @param columnIndex - * The index of the column in the alignment for which the consensus - * is desired. The list of columns starts at index 0. 
- * @return - */ - public char getConsensusAtAlignColumn(int columnIndex) - { - char mostLikely = '-'; - if (consensusResidueIsActive()) - { - - Integer index = findNodeIndex(columnIndex); - if (index == null) - { - return '-'; - } - mostLikely = getNodes().get(index).getConsensusResidue(); - return mostLikely; - } - else - { - double highestProb = 0; - for (char character : symbols) - { - Double prob = getMatchEmissionProbability(columnIndex, character); - if (prob > highestProb) - { - highestProb = prob; - mostLikely = character; - } - } - return mostLikely; - } - - } - - /** * Returns the reference annotation at the specified node. * * @param nodeIndex @@ -579,254 +452,25 @@ public class HiddenMarkovModel } /** - * Returns the average match emission probability for a given symbol - * - * @param symbolIndex - * The index of the symbol. - * @return - * - */ - public double getAverageMatchEmission(int symbolIndex) - { - double value = nodes.get(0).getMatchEmissions().get(symbolIndex); - return value; - } - - /** - * Returns the number of symbols in the alphabet used in this HMM. - * - * @return - */ - public int getNumberOfSymbols() - { - return numberOfSymbols; - } - - /** - * Fills symbol array and whilst doing so, updates the value of the number of - * symbols. - * - * @param parser - * The scanner scanning the symbol line in the file. - */ - public void fillSymbols(Scanner parser) - { - int i = 0; - while (parser.hasNext()) - { - String strSymbol = parser.next(); - char[] symbol = strSymbol.toCharArray(); - symbols.add(symbol[0]); - symbolIndexLookup.put(symbol[0], i); - i++; - } - numberOfSymbols = symbols.size(); - } - - /** - * Adds a file property. + * Sets a property read from an HMM file * * @param key * @param value */ - public void addFileProperty(String key, String value) + public void setProperty(String key, String value) { fileProperties.put(key, value); } /** - * Returns a boolean indicating whether the reference annotation is active. 
- * - * @return - */ - public boolean referenceAnnotationIsActive() - { - String status; - status = fileProperties.get(REFERENCE_ANNOTATION); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the mask value annotation is active. - * - * @return - */ - public boolean maskValueIsActive() - { - String status; - status = fileProperties.get(MASKED_VALUE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the consensus residue annotation is - * active. - * - * @return - */ - public boolean consensusResidueIsActive() - { - String status; - status = fileProperties.get(CONSENSUS_RESIDUE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the consensus structure annotation is - * active. - * - * @return - */ - public boolean consensusStructureIsActive() - { - String status; - status = fileProperties.get(CONSENSUS_STRUCTURE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the MAP annotation is active. - * - * @return - */ - public boolean mapIsActive() - { - String status; - status = fileProperties.get(MAP); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Sets the alignment column of the specified node. 
- * - * @param nodeIndex - * - * @param column - * - */ - public void setAlignmentColumn(int nodeIndex, int column) - { - nodes.get(nodeIndex).setAlignmentColumn(column); - } - - /** - * Sets the reference annotation at a given node. - * - * @param nodeIndex - * @param value - */ - public void setReferenceAnnotation(int nodeIndex, char value) - { - nodes.get(nodeIndex).setReferenceAnnotation(value); - } - - /** - * Sets the consensus residue at a given node. - * - * @param nodeIndex - * @param value - */ - public void setConsensusResidue(int nodeIndex, char value) - { - nodes.get(nodeIndex).setConsensusResidue(value); - } - - /** - * Sets the consensus structure at a given node. - * - * @param nodeIndex - * @param value - */ - public void setConsensusStructure(int nodeIndex, char value) - { - nodes.get(nodeIndex).setConsensusStructure(value); - } - - /** - * Sets the mask value at a given node. - * - * @param nodeIndex - * @param value - */ - public void setMaskValue(int nodeIndex, char value) - { - nodes.get(nodeIndex).setMaskValue(value); - } - - /** * Temporary implementation, should not be used. * * @return */ - public String getGatheringThreshold() + public String getViterbi() { String value; - value = fileProperties.get("GA"); + value = fileProperties.get(HMMFile.VITERBI); return value; } @@ -835,10 +479,10 @@ public class HiddenMarkovModel * * @return */ - public String getNoiseCutoff() + public String getMSV() { String value; - value = fileProperties.get("NC"); + value = fileProperties.get(HMMFile.MSV); return value; } @@ -847,294 +491,147 @@ public class HiddenMarkovModel * * @return */ - public String getTrustedCutoff() + public String getForward() { String value; - value = fileProperties.get("TC"); + value = fileProperties.get(HMMFile.FORWARD); return value; } /** - * Temporary implementation, should not be used. + * Constructs the consensus sequence based on the most probable symbol at each + * position. 
Gap characters are inserted for discontinuities in the node map + * numbering (if provided), else an ungapped sequence is generated. + *

+ * A mapping between the HMM nodes and residue positions of the sequence is + * also built and saved. * * @return */ - public String getViterbi() + void buildConsensusSequence() { - String value; - value = fileProperties.get(VITERBI); - return value; - } + List toResidues = new ArrayList<>(); - /** - * Temporary implementation, should not be used. - * - * @return - */ - public String getMSV() - { - String value; - value = fileProperties.get(MSV); - return value; - } + /* + * if the HMM provided a map to sequence, use those start/end values, + * else just treat it as for a contiguous sequence numbered from 1 + */ + boolean hasMap = getBooleanProperty(HMMFile.MAP); + int start = hasMap ? getNode(1).getResidueNumber() : 1; + int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber() + : (start + getLength() - 1); + char[] sequence = new char[endResNo - start + 1]; - /** - * Temporary implementation, should not be used. - * - * @return - */ - public String getForward() - { - String value; - value = fileProperties.get(FORWARD); - return value; - } + int lastResNo = start - 1; + int seqOffset = 0; + int gapCount = 0; - /** - * Sets the activation status of the MAP annotation. - * - * @param status - */ - public void setMAPStatus(boolean status) - { - fileProperties.put(MAP, status ? YES : NO); - } + for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++) + { + HMMNode node = nodes.get(nodeNo); + final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1; - /** - * Sets the activation status of the reference annotation. - * - * @param status - */ - public void setReferenceAnnotationStatus(boolean status) - { - fileProperties.put(REFERENCE_ANNOTATION, status ? 
YES : NO); - } + /* + * insert gaps if map numbering is not continuous + */ + while (resNo > lastResNo + 1) + { + sequence[seqOffset++] = '-'; + lastResNo++; + gapCount++; + } + char consensusResidue = node.getConsensusResidue(); + if (GAP_DASH == consensusResidue) + { + /* + * no residue annotation in HMM - scan for the symbol + * with the highest match emission probability + */ + int symbolIndex = node.getMaxMatchEmissionIndex(); + consensusResidue = alphabet.charAt(symbolIndex); + if (node.getMatchEmission(symbolIndex) < 0.5D) + { + // follow convention of lower case if match emission prob < 0.5 + consensusResidue = Character.toLowerCase(consensusResidue); + } + } + sequence[seqOffset++] = consensusResidue; + lastResNo = resNo; + } - /** - * Sets the activation status of the mask value annotation. - * - * @param status - */ - public void setMaskedValueStatus(boolean status) - { - fileProperties.put(MASKED_VALUE, status ? YES : NO); - } + Sequence seq = new Sequence(getName(), sequence, start, + lastResNo - gapCount); + seq.createDatasetSequence(); + seq.setHMM(this); + this.hmmSeq = seq; - /** - * Sets the activation status of the consensus residue annotation. - * - * @param status - */ - public void setConsensusResidueStatus(boolean status) - { - fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO); + /* + * construct and store Mapping of nodes to residues + * note as constructed this is just an identity mapping, + * but it allows for greater flexibility in future + */ + List fromNodes = new ArrayList<>(); + fromNodes.add(new int[] { 1, getLength() }); + toResidues.add(new int[] { seq.getStart(), seq.getEnd() }); + MapList mapList = new MapList(fromNodes, toResidues, 1, 1); + mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList); } - /** - * Sets the activation status of the consensus structure annotation. - * - * @param status - */ - public void setConsensusStructureStatus(boolean status) - { - fileProperties.put(CONSENSUS_STRUCTURE, status ? 
YES : NO); - } /** - * Finds the index of the node in a hidden Markov model based on the column in - * the alignment + * Answers the aligned consensus sequence for the profile. Note this will + * return null if called before setNodes has been called. * - * @param alignmentColumn - * The index of the column in the alignment, with the indexes - * starting from 0. + * @return */ - - public Integer findNodeIndex(int alignmentColumn) + public SequenceI getConsensusSequence() { - Integer index; - index = nodeLookup.get(alignmentColumn); - return index; + return hmmSeq; } /** - * Finds the String values of a boolean. "yes" for true and "no" for false. + * Answers the index position (0...) of the given symbol, or -1 if not a valid + * symbol for this HMM * - * @param value + * @param symbol * @return */ - public static String findStringFromBoolean(boolean value) + private int getSymbolIndex(char symbol) { - if (value) - { - return YES; - } - else + /* + * symbolIndexLookup holds the index for 'A' to 'Z' + */ + char c = Character.toUpperCase(symbol); + if ('A' <= c && c <= 'Z') { - return NO; + return symbolIndexLookup[c - 'A']; } + return -1; } - - /** - * Returns the consensus sequence based on the most probable symbol at each - * position. The sequence is adjusted to match the length of the existing - * sequence alignment. Gap characters are used as padding. + * Sets the nodes of this HMM, and also extracts the HMM consensus sequence + * and a mapping between node numbers and sequence positions * - * @param length - * The length of the longest sequence in the existing alignment. 
- * @return + * @param nodeList */ - public Sequence getConsensusSequence(int length) + public void setNodes(List nodeList) { - int start; - int end; - int modelLength; - start = getNodeAlignmentColumn(1); - modelLength = getLength(); - end = getNodeAlignmentColumn(modelLength); - char[] sequence = new char[length]; - for (int index = 0; index < length; index++) + nodes = nodeList; + if (nodes.size() > 1) { - Character character; - - character = getConsensusAtAlignColumn(index); - - if (character == null || character == '-') - { - sequence[index] = '-'; - } - else - { - sequence[index] = Character.toUpperCase(character); - } - } - - - Sequence seq = new Sequence(getName() + "_HMM", sequence, start, end); - return seq; + buildConsensusSequence(); + } } - /** - * Maps the nodes of the hidden Markov model to the reference annotation and - * then deletes this annotation. + * Sets the aligned consensus sequence this HMM is the model for + * + * @param hmmSeq */ - public void mapToReferenceAnnotation(AlignFrame af, SequenceI seq) - { - AlignmentAnnotation annotArray[] = af.getViewport().getAlignment() - .getAlignmentAnnotation(); - - AlignmentAnnotation reference = null; - for (AlignmentAnnotation annot : annotArray) - { - if (annot.label.contains("Reference")) - { - reference = annot; - } - } - - if (reference == null) - { - return; - } - - mapToReferenceAnnotation(reference, seq); - af.getViewport().getAlignment().deleteAnnotation(reference); - } - - public void mapToReferenceAnnotation(AlignmentAnnotation reference, - SequenceI seq) - { - HiddenMarkovModel hmm = seq.getHMM(); - Annotation[] annots = reference.annotations; - { - int nodeIndex = 0; - for (int col = 0; col < annots.length; col++) - { - String character = annots[col].displayCharacter; - if ("x".equals(character) || "X".equals(character)) - { - nodeIndex++; - if (nodeIndex < hmm.getNodes().size()) - { - HMMNode node = hmm.getNode(nodeIndex); - int alignPos = getNodeAlignmentColumn(nodeIndex); - char 
seqCharacter = seq.getCharAt(alignPos); - if (alignPos >= seq.getLength() || col >= seq.getLength()) - { - seq.insertCharAt(seq.getLength(), - (alignPos + 1) - seq.getLength(), - '-'); - } - seq.getSequence()[alignPos] = '-'; - seq.getSequence()[col] = seqCharacter; - node.setAlignmentColumn(col); - hmm.nodeLookup.put(col, nodeIndex); - } - else - { - System.out.println( - "The reference annotation contains more consensus columns than the hidden Markov model"); - break; - } - } - else - { - hmm.nodeLookup.remove(col); - } - } - - } - - } - - public void mapToReferenceAnnotation(AlignmentAnnotation reference) + public void setHmmSeq(SequenceI hmmSeq) { - Annotation[] annots = reference.annotations; - { - int nodeIndex = 0; - for (int col = 0; col < annots.length; col++) - { - String character = annots[col].displayCharacter; - if ("x".equals(character) || "X".equals(character)) - { - nodeIndex++; - if (nodeIndex < nodes.size()) - { - HMMNode node = nodes.get(nodeIndex); - node.setAlignmentColumn(col + 1); - nodeLookup.put(col, nodeIndex); - } - else - { - System.out.println( - "The reference annotation contains more consensus columns than the hidden Markov model"); - break; - } - } - else - { - nodeLookup.remove(col); - } - } - - } - - } - - public SequenceI initHMMSequence(AlignFrame af, int position) - { - AlignmentI alignment = af.getViewport().getAlignment(); - int length = alignment.getWidth(); - Sequence consensus = getConsensusSequence(length); - consensus.setIsHMMConsensusSequence(true); - consensus.setHMM(this); - SequenceI[] consensusArr = new Sequence[] { consensus }; - alignment.getSequences().add(position, consensus); - return consensus; + this.hmmSeq = hmmSeq; } - - }