package jalview.datamodel;
-import jalview.gui.AlignFrame;
+import jalview.io.HMMFile;
+import jalview.schemes.ResidueProperties;
+import jalview.util.Comparison;
+import jalview.util.MapList;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.Scanner;
/**
- * Data structure which stores a hidden Markov model. Currently contains file
- * properties as well, not sure whether these should be transferred to the
- * HMMFile class
+ * Data structure which stores a hidden Markov model
*
* @author TZVanaalten
*
*/
public class HiddenMarkovModel
{
+ private static final char GAP_DASH = '-';
+ public final static String YES = "yes";
- // Stores file properties. Do not directly access this field as it contains
- // only string value - use the getter methods. For example, to find the length
- // of theHMM, use getModelLength()to return an int value
- Map<String, String> fileProperties = new HashMap<>();
-
- // contains all of the symbols used in this model. The index of each symbol
- // represents its lookup value
- List<Character> symbols = new ArrayList<>();
-
- // contains information for each node in the model. The begin node is at index
- // 0. Node 0 contains average emission probabilities for each symbol
- List<HMMNode> nodes = new ArrayList<>();
-
- // contains the HMM node for each alignment column, alignment columns start at
- // index 0;
- Map<Integer, Integer> nodeLookup = new HashMap<>();
-
- // contains the symbol index for each symbol
- Map<Character, Integer> symbolIndexLookup = new HashMap<>();
-
- final static String YES = "yes";
-
- final static String NO = "no";
-
- int numberOfSymbols;
-
- // keys for file properties hashmap
- private final String NAME = "NAME";
-
- private final String ACCESSION_NUMBER = "ACC";
-
- private final String DESCRIPTION = "DESC";
-
- private final String LENGTH = "LENG";
-
- private final String MAX_LENGTH = "MAXL";
-
- private final String ALPHABET = "ALPH";
-
- private final String DATE = "DATE";
+ public final static String NO = "no";
- private final String COMMAND_LOG = "COM";
-
- private final String NUMBER_OF_SEQUENCES = "NSEQ";
-
- private final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
-
- private final String CHECK_SUM = "CKSUM";
-
- private final String GATHERING_THRESHOLDS = "GA";
-
- private final String TRUSTED_CUTOFFS = "TC";
-
- private final String NOISE_CUTOFFS = "NC";
-
- private final String STATISTICS = "STATS";
-
- private final String COMPO = "COMPO";
-
- private final String GATHERING_THRESHOLD = "GA";
-
- private final String TRUSTED_CUTOFF = "TC";
+ public static final int MATCHTOMATCH = 0;
- private final String NOISE_CUTOFF = "NC";
+ public static final int MATCHTOINSERT = 1;
- private final String VITERBI = "VITERBI";
+ public static final int MATCHTODELETE = 2;
- private final String MSV = "MSV";
+ public static final int INSERTTOMATCH = 3;
- private final String FORWARD = "FORWARD";
+ public static final int INSERTTOINSERT = 4;
- private final String MAP = "MAP";
+ public static final int DELETETOMATCH = 5;
- private final String REFERENCE_ANNOTATION = "RF";
+ public static final int DELETETODELETE = 6;
- private final String CONSENSUS_RESIDUE = "CONS";
+ private static final double LOG2 = Math.log(2);
- private final String CONSENSUS_STRUCTURE = "CS";
+ /*
+ * properties read from HMM file header lines
+ */
+ private Map<String, String> fileProperties = new HashMap<>();
- private final String MASKED_VALUE = "MM";
+ private String fileHeader;
- public static final int MATCHTOMATCH = 0;
-
- public static final int MATCHTOINSERT = 1;
-
- public static final int MATCHTODELETE = 2;
+ /*
+ * the symbols used in this model e.g. "ACGT"
+ */
+ private String alphabet;
- public static final int INSERTTOMATCH = 3;
+ /*
+ * symbol lookup index into the alphabet for 'A' to 'Z'
+ */
+ private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
- public static final int INSERTTOINSERT = 4;
+ /*
+ * Nodes in the model. The begin node is at index 0, and contains
+ * average emission probabilities for each symbol.
+ */
+ private List<HMMNode> nodes = new ArrayList<>();
- public static final int DELETETOMATCH = 5;
+ /*
+ * the aligned HMM consensus sequence extracted from the HMM profile
+ */
+ private SequenceI hmmSeq;
- public static final int DELETETODELETE = 6;
+ /*
+ * mapping from HMM nodes to residues of the hmm consensus sequence
+ */
+ private Mapping mapToHmmConsensus;
- String fileHeader;
+ // stores background frequencies of alignment from which this model came
+ private Map<Character, Float> backgroundFrequencies;
+ /**
+ * Constructor
+ */
public HiddenMarkovModel()
{
-
}
- public HiddenMarkovModel(HiddenMarkovModel hmm)
+ /**
+ * Copy constructor given a new aligned sequence with which to associate the
+ * HMM profile. The file properties and node list are copied into new
+ * collections; the alphabet, symbol lookup and background frequencies are
+ * shared with the source model.
+ *
+ * @param hmm
+ * the model to copy
+ * @param sq
+ * the aligned sequence to associate with the new model
+ */
+ public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq)
{
super();
this.fileProperties = new HashMap<>(hmm.fileProperties);
- this.symbols = new ArrayList<>(hmm.symbols);
+ this.alphabet = hmm.alphabet;
this.nodes = new ArrayList<>(hmm.nodes);
- this.nodeLookup = new HashMap<>(hmm.nodeLookup);
- this.symbolIndexLookup = new HashMap<>(
- hmm.symbolIndexLookup);
- this.numberOfSymbols = hmm.numberOfSymbols;
+ this.symbolIndexLookup = hmm.symbolIndexLookup;
- this.fileHeader = new String(hmm.fileHeader);
+ // Strings are immutable so the reference can be shared; unlike
+ // new String(...), this also tolerates a source model with no header
+ this.fileHeader = hmm.fileHeader;
+ this.hmmSeq = sq;
+ this.backgroundFrequencies = hmm.getBackgroundFrequencies();
+ Mapping sourceMap = hmm.mapToHmmConsensus;
+ if (sourceMap == null)
+ {
+ // source model has no consensus mapping (e.g. its nodes were never set)
+ this.mapToHmmConsensus = null;
+ }
+ else if (sq.getDatasetSequence() == sourceMap.getTo())
+ {
+ // same dataset sequence e.g. after realigning search results
+ this.mapToHmmConsensus = sourceMap;
+ }
+ else
+ {
+ // different dataset sequence e.g. after loading HMM from project
+ this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(),
+ sourceMap.getMap());
+ }
}
/**
- * Gets the file header of the .hmm file this model came from.
+ * Returns the information content at a specified column, calculated as the
+ * sum, over the symbols of the model alphabet, of
+ *
+ * <pre>
+ * p * log(p / background probability) / log(2)
+ * </pre>
+ *
+ * where p is the match emission probability of the symbol at the column.
+ * Symbols with zero emission probability contribute nothing to the sum (the
+ * limit of p * log(p) as p tends to zero).
*
+ * @param column
+ * column position (base 0)
* @return
+ * the information content, or NaN if no node of the model
+ * corresponds to the column
*/
- public String getFileHeader()
+ public float getInformationContent(int column)
{
- return fileHeader;
- }
+ if (getNodeForColumn(column) == null)
+ {
+ // no node, hence no emissions, at this column: the result is undefined
+ return Float.NaN;
+ }
+
+ float informationContent = 0f;
- /**
- * Sets the file header of this model.
- *
- * @param header
- */
- public void setFileHeader(String header)
- {
- fileHeader = header;
+ for (char symbol : getSymbols().toCharArray())
+ {
+ float prob = (float) getMatchEmissionProbability(column, symbol);
+ if (prob <= 0f)
+ {
+ // 0 * log(0) evaluates to NaN and would poison the whole sum
+ continue;
+ }
+ float freq = ResidueProperties.backgroundFrequencies
+ .get(getAlphabetType()).get(symbol);
+ informationContent += prob * Math.log(prob / freq);
+ }
+
+ // convert from natural log to log base 2
+ informationContent = informationContent / (float) LOG2;
+
+ return informationContent;
}
/**
- * Returns the map containing the matches between nodes and alignment column
- * indexes.
+ * Gets the file header of the .hmm file this model came from
*
* @return
- *
*/
- public Map<Integer, Integer> getNodeLookup()
+ public String getFileHeader()
{
- return nodeLookup;
+ return fileHeader;
}
/**
- * Returns the list of symbols used in this hidden Markov model.
+ * Sets the file header of this model.
*
- * @return
+ * @param header
*/
- public List<Character> getSymbols()
+ public void setFileHeader(String header)
{
- return symbols;
+ fileHeader = header;
}
-
+
/**
- * Returns the file properties.
+ * Returns the symbols used in this hidden Markov model
*
* @return
*/
- public Map<String, String> getFileProperties()
+ public String getSymbols()
{
- return fileProperties;
+ return alphabet;
}
-
+
/**
* Gets the node in the hidden Markov model at the specified position.
*
*/
public HMMNode getNode(int nodeIndex)
{
- return getNodes().get(nodeIndex);
- }
-
- /**
- * Sets the list of symbols used in the hidden Markov model to the list
- * specified.
- *
- * @param symbolsL
- * The list of symbols to which the current list is to be changed.
- *
- */
- public void setSymbols(List<Character> symbolsL)
- {
- this.symbols = symbolsL;
+ return nodes.get(nodeIndex);
}
/**
*/
public String getName()
{
- return fileProperties.get(NAME);
+ return fileProperties.get(HMMFile.NAME);
}
/**
- * Returns the accession number.
- * @return
- */
- public String getAccessionNumber()
- {
- return fileProperties.get(ACCESSION_NUMBER);
- }
-
- /**
- * Returns a description of the sequence alignment on which the hidden Markov
- * model is based.
+ * Answers the string value of the property (parsed from an HMM file) for the
+ * given key, or null if the property is not present
*
+ * @param key
* @return
*/
- public String getDescription()
+ public String getProperty(String key)
{
- return fileProperties.get(DESCRIPTION);
+ return fileProperties.get(key);
}
/**
- * Returns the length of the hidden Markov model.
+ * Answers true if the property with the given key is present with a value of
+ * "yes" (not case-sensitive), else false
*
+ * @param key
* @return
*/
- public Integer getLength()
+ public boolean getBooleanProperty(String key)
{
- if (fileProperties.get(LENGTH) == null)
- {
- return null;
- }
- return Integer.parseInt(fileProperties.get(LENGTH));
+ return YES.equalsIgnoreCase(fileProperties.get(key));
}
/**
- * Returns the max instance length within the hidden Markov model.
+ * Returns the length of the hidden Markov model. The value returned is the
+ * LENG property if specified, else the number of nodes, excluding the begin
+ * node (which should be the same thing).
*
* @return
*/
- public Integer getMaxInstanceLength()
+ public int getLength()
{
- if (fileProperties.get(MAX_LENGTH) == null)
+ if (fileProperties.get(HMMFile.LENGTH) == null)
{
- return null;
+ return nodes.size() - 1; // not counting BEGIN node
}
- return Integer.parseInt(fileProperties.get(MAX_LENGTH));
+ return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
}
/**
- * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
- * options. Other alphabets may be added.
+ * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
+ * the options. Other alphabets may be added.
*
* @return
*/
public String getAlphabetType()
{
- return fileProperties.get(ALPHABET);
- }
-
- /**
- * Returns the date as a String.
- *
- * @return
- */
- public String getDate()
- {
- return fileProperties.get(DATE);
+ return fileProperties.get(HMMFile.ALPHABET);
}
/**
- * Returns the command line log.
+ * Sets the model alphabet to the symbols in the given string (converted to
+ * upper case, ignoring any whitespace), and returns the number of symbols
+ * that were indexed. A character outside 'A' to 'Z', or a repeat of a symbol
+ * already seen, is reported on stderr and not counted; it still remains in
+ * the stored alphabet string, so the positions of other symbols are unchanged.
*
- * @return
+ * @param symbols
+ * the alphabet symbols, optionally separated by whitespace
+ * @return the count of distinct valid symbols
*/
- public String getCommandLineLog()
+ public int setAlphabet(String symbols)
{
- return fileProperties.get(COMMAND_LOG);
- }
+ String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
+ int count = trimmed.length();
+ alphabet = trimmed;
+ // a fresh lookup array, with -1 meaning 'not a symbol of this alphabet'
+ symbolIndexLookup = new int['Z' - 'A' + 1];
+ Arrays.fill(symbolIndexLookup, -1);
+ int ignored = 0;
- /**
- * Returns the number of sequences on which the HMM was trained.
- *
- * @return
- */
- public Integer getNumberOfSequences()
- {
- if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
+ /*
+ * save the symbols in order, and a quick lookup of symbol position
+ * (the position recorded is the index within the stored alphabet string)
+ */
+ for (short i = 0; i < count; i++)
{
- return null;
+ char symbol = trimmed.charAt(i);
+ if (symbol >= 'A' && symbol <= 'Z'
+ && symbolIndexLookup[symbol - 'A'] == -1)
+ {
+ symbolIndexLookup[symbol - 'A'] = i;
+ }
+ else
+ {
+ // not a letter, or a letter that already has a position
+ System.err
+ .println(
+ "Unexpected or duplicated character in HMM ALPHabet: "
+ + symbol);
+ ignored++;
+ }
}
- return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
+ return count - ignored;
}
/**
- * Returns the effective number of sequences on which the HMM was based.
+ * Answers the node of the model corresponding to an aligned column position
+ * (0...), or null if there is no such node. The consensus sequence and the
+ * node mapping are dereferenced without null checks, so both must have been
+ * set before this is called.
*
- * @param value
+ * @param column
+ * aligned column position (base 0)
+ * @return
+ * the node mapped to the consensus residue at the column, or null
+ * if the consensus is gapped there or no node is mapped
*/
- public Double getEffectiveNumberOfSequences()
+ HMMNode getNodeForColumn(int column)
{
- if (fileProperties.get(LENGTH) == null)
+ /*
+ * if the hmm consensus is gapped at the column,
+ * there is no corresponding node
+ */
+ if (Comparison.isGap(hmmSeq.getCharAt(column)))
{
return null;
}
- return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
- }
- /**
- * Returns the checksum.
- *
- * @return
- */
- public Long getCheckSum()
- {
- if (fileProperties.get(LENGTH) == null)
+ /*
+ * find the node (if any) that is mapped to the
+ * consensus sequence residue position at the column
+ */
+ int seqPos = hmmSeq.findPosition(column);
+ int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
+ if (nodeNo != null)
{
- return null;
+ // NOTE(review): presumably a [start, end] range of node numbers, of
+ // which only the start is used - confirm against MapList.locateInFrom
+ return getNode(nodeNo[0]);
}
- return Long.parseLong(fileProperties.get(CHECK_SUM));
- }
-
- /**
- * Returns the list of nodes in this HMM.
- *
- * @return
- */
- public List<HMMNode> getNodes()
- {
- return nodes;
+ return null;
}
/**
- * Sets the list of nodes in this HMM to the given list.
- *
- * @param nodes
- * The list of nodes to which the current list of nodes is being
- * changed.
- */
- public void setNodes(List<HMMNode> nodes)
- {
- this.nodes = nodes;
- }
-
- /**
* Gets the match emission probability for a given symbol at a column in the
* alignment.
*
* @return
*
*/
- public Double getMatchEmissionProbability(int alignColumn, char symbol)
+ public double getMatchEmissionProbability(int alignColumn, char symbol)
{
- int symbolIndex;
- int nodeIndex;
- Double probability;
- if (!symbolIndexLookup.containsKey(symbol))
+ HMMNode node = getNodeForColumn(alignColumn);
+ int symbolIndex = getSymbolIndex(symbol);
+ if (node != null && symbolIndex != -1)
{
- return 0d;
+ return node.getMatchEmission(symbolIndex);
}
- symbolIndex = symbolIndexLookup.get(symbol);
- if (nodeLookup.containsKey(alignColumn))
- {
- nodeIndex = nodeLookup.get(alignColumn);
- probability = getNode(nodeIndex).getMatchEmissions().get(symbolIndex);
- return probability;
- }
- else
- {
- return 0d;
- }
-
+ return 0D;
}
/**
* @return
*
*/
- public Double getInsertEmissionProbability(int alignColumn, char symbol)
+ public double getInsertEmissionProbability(int alignColumn, char symbol)
{
- int symbolIndex;
- int nodeIndex;
- Double probability;
- if (!symbolIndexLookup.containsKey(symbol))
+ HMMNode node = getNodeForColumn(alignColumn);
+ int symbolIndex = getSymbolIndex(symbol);
+ if (node != null && symbolIndex != -1)
{
- return 0d;
+ return node.getInsertEmission(symbolIndex);
}
- symbolIndex = symbolIndexLookup.get(symbol);
- if (nodeLookup.containsKey(alignColumn))
- {
- nodeIndex = nodeLookup.get(alignColumn);
- probability = getNode(nodeIndex).getInsertEmissions()
- .get(symbolIndex);
- return probability;
- }
- else
- {
- return 0d;
- }
-
+ return 0D;
}
/**
* @return
*
*/
- public Double getStateTransitionProbability(int alignColumn,
+ public double getStateTransitionProbability(int alignColumn,
int transition)
{
- int transitionIndex;
- int nodeIndex;
- Double probability;
- if (nodeLookup.containsKey(alignColumn))
+ HMMNode node = getNodeForColumn(alignColumn);
+ if (node != null)
{
- nodeIndex = nodeLookup.get(alignColumn);
- probability = getNode(nodeIndex).getStateTransitions()
- .get(transition);
- return probability;
+ return node.getStateTransition(transition);
}
- else
- {
- return 0d;
- }
-
+ return 0D;
}
/**
- * Returns the alignment column linked to the node at the given index.
+ * Returns the sequence position linked to the node at the given index. This
+ * corresponds to an aligned column position (counting from 1).
*
* @param nodeIndex
* The index of the node, starting from index 1. Index 0 is the begin
* node, which does not correspond to a column in the alignment.
* @return
*/
- public Integer getNodeAlignmentColumn(int nodeIndex)
+ public int getNodeMapPosition(int nodeIndex)
{
- Integer value = nodes.get(nodeIndex).getAlignmentColumn();
- return value;
+ return nodes.get(nodeIndex).getResidueNumber();
}
/**
}
/**
- * Returns the consensus at a given alignment column.
- *
- * @param columnIndex
- * The index of the column in the alignment for which the consensus
- * is desired. The list of columns starts at index 0.
- * @return
- */
- public char getConsensusAtAlignColumn(int columnIndex)
- {
- char mostLikely = '-';
- if (consensusResidueIsActive())
- {
-
- Integer index = findNodeIndex(columnIndex);
- if (index == null)
- {
- return '-';
- }
- mostLikely = getNodes().get(index).getConsensusResidue();
- return mostLikely;
- }
- else
- {
- double highestProb = 0;
- for (char character : symbols)
- {
- Double prob = getMatchEmissionProbability(columnIndex, character);
- if (prob > highestProb)
- {
- highestProb = prob;
- mostLikely = character;
- }
- }
- return mostLikely;
- }
-
- }
-
- /**
* Returns the reference annotation at the specified node.
*
* @param nodeIndex
}
/**
- * Returns the average match emission probability for a given symbol
- *
- * @param symbolIndex
- * The index of the symbol.
- * @return
- *
- */
- public double getAverageMatchEmission(int symbolIndex)
- {
- double value = nodes.get(0).getMatchEmissions().get(symbolIndex);
- return value;
- }
-
- /**
- * Returns the number of symbols in the alphabet used in this HMM.
- *
- * @return
- */
- public int getNumberOfSymbols()
- {
- return numberOfSymbols;
- }
-
- /**
- * Fills symbol array and whilst doing so, updates the value of the number of
- * symbols.
- *
- * @param parser
- * The scanner scanning the symbol line in the file.
- */
- public void fillSymbols(Scanner parser)
- {
- int i = 0;
- while (parser.hasNext())
- {
- String strSymbol = parser.next();
- char[] symbol = strSymbol.toCharArray();
- symbols.add(symbol[0]);
- symbolIndexLookup.put(symbol[0], i);
- i++;
- }
- numberOfSymbols = symbols.size();
- }
-
- /**
- * Adds a file property.
+ * Sets a property read from an HMM file
*
* @param key
* @param value
*/
- public void addFileProperty(String key, String value)
+ public void setProperty(String key, String value)
{
fileProperties.put(key, value);
}
/**
- * Returns a boolean indicating whether the reference annotation is active.
- *
- * @return
- */
- public boolean referenceAnnotationIsActive()
- {
- String status;
- status = fileProperties.get(REFERENCE_ANNOTATION);
- if (status == null)
- {
- return false;
- }
- switch (status)
- {
- case YES:
- return true;
- case NO:
- return false;
- default:
- return false;
- }
-
- }
-
- /**
- * Returns a boolean indicating whether the mask value annotation is active.
- *
- * @return
- */
- public boolean maskValueIsActive()
- {
- String status;
- status = fileProperties.get(MASKED_VALUE);
- if (status == null)
- {
- return false;
- }
- switch (status)
- {
- case YES:
- return true;
- case NO:
- return false;
- default:
- return false;
- }
-
- }
-
- /**
- * Returns a boolean indicating whether the consensus residue annotation is
- * active.
- *
- * @return
- */
- public boolean consensusResidueIsActive()
- {
- String status;
- status = fileProperties.get(CONSENSUS_RESIDUE);
- if (status == null)
- {
- return false;
- }
- switch (status)
- {
- case YES:
- return true;
- case NO:
- return false;
- default:
- return false;
- }
-
- }
-
- /**
- * Returns a boolean indicating whether the consensus structure annotation is
- * active.
- *
- * @return
- */
- public boolean consensusStructureIsActive()
- {
- String status;
- status = fileProperties.get(CONSENSUS_STRUCTURE);
- if (status == null)
- {
- return false;
- }
- switch (status)
- {
- case YES:
- return true;
- case NO:
- return false;
- default:
- return false;
- }
-
- }
-
- /**
- * Returns a boolean indicating whether the MAP annotation is active.
- *
- * @return
- */
- public boolean mapIsActive()
- {
- String status;
- status = fileProperties.get(MAP);
- if (status == null)
- {
- return false;
- }
- switch (status)
- {
- case YES:
- return true;
- case NO:
- return false;
- default:
- return false;
- }
-
- }
-
- /**
- * Sets the alignment column of the specified node.
- *
- * @param nodeIndex
- *
- * @param column
- *
- */
- public void setAlignmentColumn(int nodeIndex, int column)
- {
- nodes.get(nodeIndex).setAlignmentColumn(column);
- }
-
- /**
- * Sets the reference annotation at a given node.
- *
- * @param nodeIndex
- * @param value
- */
- public void setReferenceAnnotation(int nodeIndex, char value)
- {
- nodes.get(nodeIndex).setReferenceAnnotation(value);
- }
-
- /**
- * Sets the consensus residue at a given node.
- *
- * @param nodeIndex
- * @param value
- */
- public void setConsensusResidue(int nodeIndex, char value)
- {
- nodes.get(nodeIndex).setConsensusResidue(value);
- }
-
- /**
- * Sets the consensus structure at a given node.
- *
- * @param nodeIndex
- * @param value
- */
- public void setConsensusStructure(int nodeIndex, char value)
- {
- nodes.get(nodeIndex).setConsensusStructure(value);
- }
-
- /**
- * Sets the mask value at a given node.
- *
- * @param nodeIndex
- * @param value
- */
- public void setMaskValue(int nodeIndex, char value)
- {
- nodes.get(nodeIndex).setMaskValue(value);
- }
-
- /**
* Temporary implementation, should not be used.
*
* @return
*/
- public String getGatheringThreshold()
+ public String getViterbi()
{
String value;
- value = fileProperties.get("GA");
+ value = fileProperties.get(HMMFile.VITERBI);
return value;
}
*
* @return
*/
- public String getNoiseCutoff()
+ public String getMSV()
{
String value;
- value = fileProperties.get("NC");
+ value = fileProperties.get(HMMFile.MSV);
return value;
}
*
* @return
*/
- public String getTrustedCutoff()
+ public String getForward()
{
String value;
- value = fileProperties.get("TC");
+ value = fileProperties.get(HMMFile.FORWARD);
return value;
}
/**
- * Temporary implementation, should not be used.
+ * Constructs the consensus sequence based on the most probable symbol at each
+ * position. Gap characters are inserted for discontinuities in the node map
+ * numbering (if provided), else an ungapped sequence is generated.
+ * <p>
+ * A mapping between the HMM nodes and residue positions of the sequence is
+ * also built and saved.
+ * <p>
+ * The sequence built is stored as this model's consensus sequence, and this
+ * model is set as the HMM of that sequence. Node 0 (the begin node) is
+ * skipped; node 1 must exist when the MAP property is "yes".
*
* @return
*/
- public String getViterbi()
+ void buildConsensusSequence()
{
- String value;
- value = fileProperties.get(VITERBI);
- return value;
- }
+ // holds the residue range for the node-to-residue mapping built at the end
+ List<int[]> toResidues = new ArrayList<>();
- /**
- * Temporary implementation, should not be used.
- *
- * @return
- */
- public String getMSV()
- {
- String value;
- value = fileProperties.get(MSV);
- return value;
- }
+ /*
+ * if the HMM provided a map to sequence, use those start/end values,
+ * else just treat it as for a contiguous sequence numbered from 1
+ */
+ boolean hasMap = getBooleanProperty(HMMFile.MAP);
+ int start = hasMap ? getNode(1).getResidueNumber() : 1;
+ int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
+ : (start + getLength() - 1);
+ // one array slot for each residue number from 1 to endResNo
+ char[] sequence = new char[endResNo];
- /**
- * Temporary implementation, should not be used.
- *
- * @return
- */
- public String getForward()
- {
- String value;
- value = fileProperties.get(FORWARD);
- return value;
- }
+ int lastResNo = start - 1;
+ int seqOffset = -1;
+ int gapCount = 0;
- /**
- * Sets the activation status of the MAP annotation.
- *
- * @param status
- */
- public void setMAPStatus(boolean status)
- {
- fileProperties.put(MAP, status ? YES : NO);
- }
- /**
- * Sets the activation status of the reference annotation.
- *
- * @param status
- */
- public void setReferenceAnnotationStatus(boolean status)
- {
- fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
- }
+ /*
+ * leading gaps: this fills indices 0..start-1 but leaves seqOffset at
+ * start-1, so the first residue written below overwrites the last of
+ * these gaps; the net effect is (start - 1) leading gap characters
+ *
+ * NOTE(review): if start were 0 this loop would not run, seqOffset would
+ * stay at -1 and the first write below would be out of bounds - confirm
+ * that mapped residue numbers are always >= 1
+ */
+ for (int seqN = 0; seqN < start; seqN++)
+ {
+ sequence[seqN] = GAP_DASH;
+ seqOffset++;
+ }
+
+ for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
+ {
+ HMMNode node = nodes.get(nodeNo);
+ final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
- /**
- * Sets the activation status of the mask value annotation.
- *
- * @param status
- */
- public void setMaskedValueStatus(boolean status)
- {
- fileProperties.put(MASKED_VALUE, status ? YES : NO);
- }
+ /*
+ * insert gaps if map numbering is not continuous
+ * (gapCount tallies these internal gaps only, not the leading ones)
+ */
+ while (resNo > lastResNo + 1)
+ {
+ sequence[seqOffset++] = GAP_DASH;
+ lastResNo++;
+ gapCount++;
+ }
+ char consensusResidue = node.getConsensusResidue();
+ if (GAP_DASH == consensusResidue)
+ {
+ /*
+ * no residue annotation in HMM - scan for the symbol
+ * with the highest match emission probability
+ */
+ int symbolIndex = node.getMaxMatchEmissionIndex();
+ consensusResidue = alphabet.charAt(symbolIndex);
+ if (node.getMatchEmission(symbolIndex) < 0.5D)
+ {
+ // follow convention of lower case if match emission prob < 0.5
+ consensusResidue = Character.toLowerCase(consensusResidue);
+ }
+ }
+ sequence[seqOffset++] = consensusResidue;
+ lastResNo = resNo;
+ }
- /**
- * Sets the activation status of the consensus residue annotation.
- *
- * @param status
- */
- public void setConsensusResidueStatus(boolean status)
- {
- fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
- }
+ // the end position discounts the internal gaps, so that start..end
+ // spans exactly the number of residues written (one per node)
+ Sequence seq = new Sequence(getName(), sequence, start,
+ lastResNo - gapCount);
+ seq.createDatasetSequence();
+ seq.setHMM(this);
+ this.hmmSeq = seq;
- /**
- * Sets the activation status of the consensus structure annotation.
- *
- * @param status
- */
- public void setConsensusStructureStatus(boolean status)
- {
- fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
+ /*
+ * construct and store Mapping of nodes to residues
+ * note as constructed this is just an identity mapping,
+ * but it allows for greater flexibility in future
+ */
+ List<int[]> fromNodes = new ArrayList<>();
+ fromNodes.add(new int[] { 1, getLength() });
+ toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
+ MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
+ mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
}
+
/**
- * Finds the index of the node in a hidden Markov model based on the column in
- * the alignment
+ * Answers the aligned consensus sequence for the profile. Note this will
+ * return null if called before <code>setNodes</code> has been called.
*
- * @param alignmentColumn
- * The index of the column in the alignment, with the indexes
- * starting from 0.
+ * @return
*/
-
- public Integer findNodeIndex(int alignmentColumn)
+ public SequenceI getConsensusSequence()
{
- Integer index;
- index = nodeLookup.get(alignmentColumn);
- return index;
+ return hmmSeq;
}
/**
- * Finds the String values of a boolean. "yes" for true and "no" for false.
+ * Answers the index position (0...) of the given symbol, or -1 if not a valid
+ * symbol for this HMM
*
- * @param value
+ * @param symbol
* @return
*/
- public static String findStringFromBoolean(boolean value)
+ private int getSymbolIndex(char symbol)
{
- if (value)
- {
- return YES;
- }
- else
+ /*
+ * symbolIndexLookup holds the index for 'A' to 'Z'
+ */
+ char c = Character.toUpperCase(symbol);
+ if ('A' <= c && c <= 'Z')
{
- return NO;
+ return symbolIndexLookup[c - 'A'];
}
+ return -1;
}
-
-
/**
- * Returns the consensus sequence based on the most probable symbol at each
- * position. The sequence is adjusted to match the length of the existing
- * sequence alignment. Gap characters are used as padding.
+ * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
+ * and a mapping between node numbers and sequence positions
*
- * @param length
- * The length of the longest sequence in the existing alignment.
- * @return
+ * @param nodeList
*/
- public Sequence getConsensusSequence()
+ public void setNodes(List<HMMNode> nodeList)
{
- int start;
- int end;
- int modelLength;
- start = getNodeAlignmentColumn(1);
- modelLength = getLength();
- end = getNodeAlignmentColumn(modelLength);
- char[] sequence = new char[end + 1];
- for (int index = 0; index < end + 1; index++)
+ nodes = nodeList;
+ if (nodes.size() > 1)
{
- Character character;
-
- character = getConsensusAtAlignColumn(index);
-
- if (character == null || character == '-')
- {
- sequence[index] = '-';
- }
- else
- {
- sequence[index] = Character.toUpperCase(character);
- }
- }
-
-
- Sequence seq = new Sequence(getName() + "_HMM", sequence, start,
- end);
- return seq;
+ buildConsensusSequence();
+ }
}
-
/**
- * Maps the nodes of the hidden Markov model to the reference annotation and
- * then deletes this annotation.
+ * Sets the aligned consensus sequence this HMM is the model for
+ *
+ * @param hmmSeq
*/
- public void mapToReferenceAnnotation(AlignFrame af, SequenceI seq)
+ public void setHmmSeq(SequenceI hmmSeq)
{
- AlignmentAnnotation annotArray[] = af.getViewport().getAlignment()
- .getAlignmentAnnotation();
-
- AlignmentAnnotation reference = null;
- for (AlignmentAnnotation annot : annotArray)
- {
- if (annot.label.contains("Reference"))
- {
- reference = annot;
- }
- }
-
- if (reference == null)
- {
- return;
- }
-
- mapToReferenceAnnotation(reference, seq);
- af.getViewport().getAlignment().deleteAnnotation(reference);
+ this.hmmSeq = hmmSeq;
}
- public void mapToReferenceAnnotation(AlignmentAnnotation reference,
- SequenceI seq)
+ public void setBackgroundFrequencies(Map<Character, Float> bkgdFreqs)
{
- HiddenMarkovModel hmm = seq.getHMM();
- Annotation[] annots = reference.annotations;
- {
- int nodeIndex = 0;
- for (int col = 0; col < annots.length; col++)
- {
- String character = annots[col].displayCharacter;
- if ("x".equals(character) || "X".equals(character))
- {
- nodeIndex++;
- if (nodeIndex < hmm.getNodes().size())
- {
- HMMNode node = hmm.getNode(nodeIndex);
- int alignPos = getNodeAlignmentColumn(nodeIndex);
- char seqCharacter = seq.getCharAt(alignPos);
- if (alignPos >= seq.getLength() || col >= seq.getLength())
- {
- seq.insertCharAt(seq.getLength(),
- (alignPos + 1) - seq.getLength(),
- '-');
- }
- seq.getSequence()[alignPos] = '-';
- seq.getSequence()[col] = seqCharacter;
- node.setAlignmentColumn(col);
- hmm.nodeLookup.put(col, nodeIndex);
- }
- else
- {
- System.out.println(
- "The reference annotation contains more consensus columns than the hidden Markov model");
- break;
- }
- }
- else
- {
- hmm.nodeLookup.remove(col);
- }
- }
-
- }
-
+ backgroundFrequencies = bkgdFreqs;
}
- public void mapToReferenceAnnotation(AlignmentAnnotation reference)
+ /**
+ * Replaces the background frequencies of this model with values computed
+ * from the given residue counts: for each symbol, its count divided by the
+ * total residue count.
+ *
+ * @param bkgdFreqs
+ * the residue counts from which to derive the frequencies
+ */
+ public void setBackgroundFrequencies(ResidueCount bkgdFreqs)
{
- Annotation[] annots = reference.annotations;
- {
- int nodeIndex = 0;
- for (int col = 0; col < annots.length; col++)
- {
- String character = annots[col].displayCharacter;
- if ("x".equals(character) || "X".equals(character))
- {
- nodeIndex++;
- if (nodeIndex < nodes.size())
- {
- HMMNode node = nodes.get(nodeIndex);
- node.setAlignmentColumn(col + 1);
- nodeLookup.put(col, nodeIndex);
- }
- else
- {
- System.out.println(
- "The reference annotation contains more consensus columns than the hidden Markov model");
- break;
- }
- }
- else
- {
- nodeLookup.remove(col);
- }
- }
+ backgroundFrequencies = new HashMap<>();
+
+ int total = bkgdFreqs.getTotalResidueCount();
+ for (char c : bkgdFreqs.getSymbolCounts().symbols)
+ {
+ // multiplying by 1f forces floating point (not integer) division
+ // NOTE(review): a zero total would store NaN or Infinity here rather
+ // than throw - confirm callers never pass empty counts
+ backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total);
}
}
- public SequenceI initHMMSequence()
+ /**
+ * Returns the background frequencies held by this model. The map returned
+ * is the model's own instance (not a copy), and is null if no background
+ * frequencies have been set.
+ *
+ * @return
+ */
+ public Map<Character, Float> getBackgroundFrequencies()
{
- Sequence consensus = getConsensusSequence();
- consensus.setIsHMMConsensusSequence(true);
- consensus.setHMM(this);
- return consensus;
+ return backgroundFrequencies;
}
-
-
}