import java.util.Scanner;
/**
- * Data structure which stores a hidden Markov model. Currently contains file properties as well, not sure whether these should be transferred to the HMMFile class
+ * Data structure which stores a hidden Markov model. It currently holds the
+ * file properties as well; it is not yet decided whether these should be
+ * moved to the HMMFile class.
*
* @author TZVanaalten
*
// of the HMM, use getModelLength() to return an int value
Map<String, String> fileProperties = new HashMap<>();
- //contains all of the symbols used in this model. The index of each symbol represents its lookup value
+ // contains all of the symbols used in this model. The index of each symbol
+ // represents its lookup value
List<Character> symbols = new ArrayList<>();
// contains information for each node in the model. The begin node is at index
// contains the HMM node for each alignment column
Map<Integer, Integer> nodeLookup = new HashMap<>();
- //contains the symbol index for each symbol
+ // contains the symbol index for each symbol
Map<Character, Integer> symbolIndexLookup = new HashMap<>();
- Map<Character, Double> backgroundFrequencies = new HashMap();
-
- ProfilesI profiles;
-
-
final static String YES = "yes";
final static String NO = "no";
int numberOfSymbols;
- //keys for file properties hashmap
+ // keys for file properties hashmap
private final String NAME = "NAME";
private final String ACCESSION_NUMBER = "ACC";
private final String MASKED_VALUE = "MM";
- final static String[] TRANSITION_TYPES = new String[] { "m->m", "m->i",
- "m->d", "i->m", "i->i", "d->m", "d->d" };
+ public static final int MATCHTOMATCH = 0;
- public String getTransitionType(int index)
- {
- return TRANSITION_TYPES[index];
- }
+ public static final int MATCHTOINSERT = 1;
- public Map<Integer, Integer> getNodeLookup()
- {
- return nodeLookup;
- }
+ public static final int MATCHTODELETE = 2;
- public void setNodeLookup(Map<Integer, Integer> nodeLookup)
- {
- this.nodeLookup = nodeLookup;
- }
+ public static final int INSERTTOMATCH = 3;
+
+ public static final int INSERTTOINSERT = 4;
+
+ public static final int DELETETOMATCH = 5;
- public String[] getTransitionTypes()
+ public static final int DELETETODELETE = 6;
+
+ /**
+ * Returns the map containing the matches between nodes and alignment column
+ * indexes.
+ *
+ * @return
+ *
+ */
+ public Map<Integer, Integer> getNodeLookup()
{
- return TRANSITION_TYPES;
+ return nodeLookup;
}
+ /**
+ * Returns the list of symbols used in this hidden Markov model.
+ *
+ * @return
+ */
public List<Character> getSymbols()
{
return symbols;
}
-
+
+ /**
+ * Returns the file properties.
+ *
+ * @return
+ */
public Map<String, String> getFileProperties()
{
return fileProperties;
}
+ /**
+ * Gets the node in the hidden Markov model at the specified position.
+ *
+ * @param nodeIndex
+ * The index of the node requested. Node 0 optionally contains the
+ * average match emission probabilities across the entire model, and
+ * always contains the insert emission probabilities and state
+ * transition probabilities for the begin node. Node 1 contains the
+ * first node in the HMM that can correspond to a column in the
+ * alignment.
+ * @return
+ */
public HMMNode getNode(int nodeIndex)
{
return getNodes().get(nodeIndex);
}
+ /**
+ * Sets the list of symbols used in the hidden Markov model to the list
+ * specified.
+ *
+ * @param symbolsL
+ * The list of symbols to which the current list is to be changed.
+ *
+ */
public void setSymbols(List<Character> symbolsL)
{
this.symbols = symbolsL;
}
+ /**
+ * Returns the name of the sequence alignment on which the HMM is based.
+ *
+ * @return
+ */
public String getName()
{
return fileProperties.get(NAME);
}
+
+ /**
+ * Returns the accession number.
+ * @return
+ */
public String getAccessionNumber()
{
return fileProperties.get(ACCESSION_NUMBER);
}
- public void setAccessionNumber(String value)
- {
- fileProperties.put(ACCESSION_NUMBER, value);
- }
-
+ /**
+ * Returns a description of the sequence alignment on which the hidden Markov
+ * model is based.
+ *
+ * @return
+ */
public String getDescription()
{
return fileProperties.get(DESCRIPTION);
}
- public void setDescription(String value)
- {
- fileProperties.put(DESCRIPTION, value);
- }
-
+ /**
+ * Returns the length of the hidden Markov model.
+ *
+ * @return
+ */
public Integer getLength()
{
if (fileProperties.get(LENGTH) == null)
return Integer.parseInt(fileProperties.get(LENGTH));
}
- public void setLength(int value)
- {
- fileProperties.put(LENGTH, String.valueOf(value));
- }
-
+ /**
+ * Returns the max instance length within the hidden Markov model.
+ *
+ * @return
+ */
public Integer getMaxInstanceLength()
{
if (fileProperties.get(MAX_LENGTH) == null)
return Integer.parseInt(fileProperties.get(MAX_LENGTH));
}
- public void setMaxInstanceLength(int value)
- {
- fileProperties.put(MAX_LENGTH, String.valueOf(value));
- }
-
- // gets type of symbol alphabet - "amino", "DNA", "RNA"
+ /**
+ * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the
+ * options. Other alphabets may be added.
+ *
+ * @return
+ */
public String getAlphabetType()
{
return fileProperties.get(ALPHABET);
}
- public void setAlphabetType(String value)
- {
- fileProperties.put(ALPHABET, value);
- }
-
- // not sure whether to implement this with Date object
+ /**
+ * Returns the date as a String.
+ *
+ * @return
+ */
public String getDate()
{
return fileProperties.get(DATE);
}
- public void setDate(String value)
- {
- fileProperties.put(DATE, value);
- }
-
- // not sure whether to implement this
+ /**
+ * Returns the command line log.
+ *
+ * @return
+ */
public String getCommandLineLog()
{
return fileProperties.get(COMMAND_LOG);
}
- public void setCommandLineLog(String value)
- {
- fileProperties.put(COMMAND_LOG, value);
- }
-
- // gets the number of sequences that the HMM was trained on
+ /**
+ * Returns the number of sequences on which the HMM was trained.
+ *
+ * @return
+ */
public Integer getNumberOfSequences()
{
if (fileProperties.get(NUMBER_OF_SEQUENCES) == null)
return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES));
}
- public void setNumberOfSequences(int value)
- {
- fileProperties.put(NUMBER_OF_SEQUENCES, String.valueOf(value));
- }
-
- // gets the effective number determined during sequence weighting
+ /**
+ * Returns the effective number of sequences on which the HMM was based, read
+ * from the EFF_NUMBER_OF_SEQUENCES file property and parsed as a double.
+ *
+ * @return
+ */
public Double getEffectiveNumberOfSequences()
{
// NOTE(review): this guard tests the LENGTH property, while the value parsed
// below is EFF_NUMBER_OF_SEQUENCES. getLength() and getNumberOfSequences()
// test the same key they parse - this looks like a copy-paste slip; confirm.
if (fileProperties.get(LENGTH) == null)
return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES));
}
- public void setEffectiveNumberOfSequences(double value)
- {
- fileProperties.put(EFF_NUMBER_OF_SEQUENCES, String.valueOf(value));
- }
-
+ /**
+ * Returns the checksum, read from the CHECK_SUM file property and parsed as a
+ * long.
+ *
+ * @return
+ */
public Long getCheckSum()
{
// NOTE(review): this guard tests the LENGTH property, while the value parsed
// below is CHECK_SUM. getLength() and getNumberOfSequences() test the same
// key they parse - this looks like a copy-paste slip; confirm.
if (fileProperties.get(LENGTH) == null)
return Long.parseLong(fileProperties.get(CHECK_SUM));
}
- public void setCheckSum(long value)
- {
- fileProperties.put(CHECK_SUM, String.valueOf(value));
- }
-
+ /**
+ * Returns the list of nodes in this HMM.
+ *
+ * @return
+ */
public List<HMMNode> getNodes()
{
return nodes;
}
+ /**
+ * Sets the list of nodes in this HMM to the given list.
+ *
+ * @param nodes
+ * The list of nodes to which the current list of nodes is being
+ * changed.
+ */
public void setNodes(List<HMMNode> nodes)
{
this.nodes = nodes;
}
/**
- * get match emission probability for a given symbol at a column in the
- * alignment
+ * Gets the match emission probability for a given symbol at a column in the
+ * alignment.
*
* @param alignColumn
+ * The index of the alignment column, starting at index 0. Index 0
+ * usually corresponds to index 1 in the HMM.
* @param symbol
+ * The symbol for which the desired probability is being requested.
* @return
*
*/
}
/**
- * get insert emission probability for a given symbol at a column in the
- * alignment
+ * Gets the insert emission probability for a given symbol at a column in the
+ * alignment.
*
* @param alignColumn
+ * The index of the alignment column, starting at index 0. Index 0
+ * usually corresponds to index 1 in the HMM.
* @param symbol
+ * The symbol for which the desired probability is being requested.
* @return
+ *
*/
public Double getInsertEmissionProbability(int alignColumn, char symbol)
{
}
/**
- * get state transition probability for a given transition type at a column in
- * the alignment
+ * Gets the state transition probability for a given transition type at a
+ * column in the alignment.
*
* @param alignColumn
- * @param transition
+ * The index of the alignment column, starting at index 0. Index 0
+ * usually corresponds to index 1 in the HMM.
+ * @param transition
+ * The index of the transition type, e.g. MATCHTOMATCH, for which the
+ * desired probability is being requested.
* @return
+ *
*/
public Double getStateTransitionProbability(int alignColumn,
- String transition)
+ int transition)
{
int transitionIndex;
int nodeIndex;
Double probability;
- transitionIndex = getTransitionType(transition);
if (nodeLookup.containsKey(alignColumn + 1))
{
nodeIndex = nodeLookup.get(alignColumn + 1);
probability = getNode(nodeIndex).getStateTransitions()
- .get(transitionIndex);
+ .get(transition);
return probability;
}
else
}
+ /**
+ * Returns the alignment column linked to the node at the given index.
+ *
+ * @param nodeIndex
+ * The index of the node, starting from index 1. Index 0 is the begin
+ * node, which does not correspond to a column in the alignment.
+ * @return
+ */
public Integer getNodeAlignmentColumn(int nodeIndex)
{
Integer value = nodes.get(nodeIndex).getAlignmentColumn();
return value - 1;
}
+ /**
+ * Returns the consensus residue at the specified node.
+ *
+ * @param nodeIndex
+ * The index of the specified node.
+ * @return
+ */
public char getConsensusResidue(int nodeIndex)
{
char value = nodes.get(nodeIndex).getConsensusResidue();
return value;
}
+ /**
+ * Returns the consensus at a given alignment column.
+ *
+ * @param columnIndex
+ * The index of the column in the alignment for which the consensus
+ * is desired. The list of columns starts at index 0.
+ * @return
+ */
public char getConsensusAtAlignColumn(int columnIndex)
{
char value;
- Integer index = findNodeIndex(columnIndex + 1);
+ Integer index = findNodeIndex(columnIndex);
if (index == null)
{
return '-';
return value;
}
+ /**
+ * Returns the reference annotation at the specified node.
+ *
+ * @param nodeIndex
+ * The index of the specified node.
+ * @return
+ */
public char getReferenceAnnotation(int nodeIndex)
{
char value = nodes.get(nodeIndex).getReferenceAnnotation();
return value;
}
+ /**
+ * Returns the mask value at the specified node.
+ *
+ * @param nodeIndex
+ * The index of the specified node.
+ * @return
+ */
public char getMaskedValue(int nodeIndex)
{
char value = nodes.get(nodeIndex).getMaskValue();
return value;
}
+ /**
+ * Returns the consensus structure at the specified node.
+ *
+ * @param nodeIndex
+ * The index of the specified node.
+ * @return
+ */
public char getConsensusStructure(int nodeIndex)
{
char value = nodes.get(nodeIndex).getConsensusStructure();
}
/**
- * returns the average match emission for a given symbol
+ * Returns the average match emission probability for a given symbol.
+ *
* @param symbolIndex
- * index of symbol
+ * The index of the symbol.
* @return
- * average negative log propbability of a match emission of the given symbol
+ *
*/
public double getAverageMatchEmission(int symbolIndex)
{
return value;
}
+ /**
+ * Returns the number of symbols in the alphabet used in this HMM.
+ *
+ * @return
+ */
public int getNumberOfSymbols()
{
return numberOfSymbols;
}
- public void setNumberOfSymbols(int numberOfSymbols)
- {
- this.numberOfSymbols = numberOfSymbols;
- }
-
-
-
/**
- * fills symbol array and also finds numberOfSymbols
+ * Fills symbol array and whilst doing so, updates the value of the number of
+ * symbols.
*
* @param parser
- * scanner scanning symbol line in file
+ * The scanner scanning the symbol line in the file.
*/
public void fillSymbols(Scanner parser)
{
}
/**
- * adds file property
+ * Adds a file property.
*
* @param key
* @param value
fileProperties.put(key, value);
}
+ /**
+ * Returns a boolean indicating whether the reference annotation is active.
+ *
+ * @return
+ */
public boolean referenceAnnotationIsActive()
{
String status;
}
+ /**
+ * Returns a boolean indicating whether the mask value annotation is active.
+ *
+ * @return
+ */
public boolean maskValueIsActive()
{
String status;
}
+ /**
+ * Returns a boolean indicating whether the consensus residue annotation is
+ * active.
+ *
+ * @return
+ */
public boolean consensusResidueIsActive()
{
String status;
}
+ /**
+ * Returns a boolean indicating whether the consensus structure annotation is
+ * active.
+ *
+ * @return
+ */
public boolean consensusStructureIsActive()
{
String status;
}
+ /**
+ * Returns a boolean indicating whether the MAP annotation is active.
+ *
+ * @return
+ */
public boolean mapIsActive()
{
String status;
}
+ /**
+ * Sets the alignment column of the specified node.
+ *
+ * @param nodeIndex
+ *
+ * @param column
+ *
+ */
public void setAlignmentColumn(int nodeIndex, int column)
{
nodes.get(nodeIndex).setAlignmentColumn(column);
}
+ /**
+ * Sets the reference annotation at a given node.
+ *
+ * @param nodeIndex
+ * @param value
+ */
public void setReferenceAnnotation(int nodeIndex, char value)
{
nodes.get(nodeIndex).setReferenceAnnotation(value);
}
+ /**
+ * Sets the consensus residue at a given node.
+ *
+ * @param nodeIndex
+ * @param value
+ */
public void setConsensusResidue(int nodeIndex, char value)
{
nodes.get(nodeIndex).setConsensusResidue(value);
}
+ /**
+ * Sets the consensus structure at a given node.
+ *
+ * @param nodeIndex
+ * @param value
+ */
public void setConsensusStructure(int nodeIndex, char value)
{
nodes.get(nodeIndex).setConsensusStructure(value);
}
+ /**
+ * Sets the mask value at a given node.
+ *
+ * @param nodeIndex
+ * @param value
+ */
public void setMaskValue(int nodeIndex, char value)
{
nodes.get(nodeIndex).setMaskValue(value);
}
+ /**
+ * Temporary implementation, should not be used.
+ *
+ * @return
+ */
public String getGatheringThreshold()
{
String value;
return value;
}
+ /**
+ * Temporary implementation, should not be used.
+ *
+ * @return
+ */
public String getNoiseCutoff()
{
String value;
return value;
}
+ /**
+ * Temporary implementation, should not be used.
+ *
+ * @return
+ */
public String getTrustedCutoff()
{
String value;
return value;
}
+ /**
+ * Temporary implementation, should not be used.
+ *
+ * @return
+ */
public String getViterbi()
{
String value;
return value;
}
+ /**
+ * Temporary implementation, should not be used.
+ *
+ * @return
+ */
public String getMSV()
{
String value;
return value;
}
+ /**
+ * Temporary implementation, should not be used.
+ *
+ * @return
+ */
public String getForward()
{
String value;
return value;
}
+ /**
+ * Sets the activation status of the MAP annotation.
+ *
+ * @param status
+ */
public void setMAPStatus(boolean status)
{
- if (status == true)
- {
- fileProperties.put(MAP, YES);
- }
- else
- {
- fileProperties.put(MAP, NO);
- }
+ fileProperties.put(MAP, status ? YES : NO);
}
+ /**
+ * Sets the activation status of the reference annotation.
+ *
+ * @param status
+ */
public void setReferenceAnnotationStatus(boolean status)
{
- if (status == true)
- {
- fileProperties.put(REFERENCE_ANNOTATION, YES);
- }
- else
- {
- fileProperties.put(REFERENCE_ANNOTATION, NO);
- }
+ fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO);
}
+ /**
+ * Sets the activation status of the mask value annotation.
+ *
+ * @param status
+ */
public void setMaskedValueStatus(boolean status)
{
- if (status == true)
- {
- fileProperties.put(MASKED_VALUE, YES);
- }
- else
- {
- fileProperties.put(MASKED_VALUE, NO);
- }
+ fileProperties.put(MASKED_VALUE, status ? YES : NO);
}
+ /**
+ * Sets the activation status of the consensus residue annotation.
+ *
+ * @param status
+ */
public void setConsensusResidueStatus(boolean status)
{
- if (status == true)
- {
- fileProperties.put(CONSENSUS_RESIDUE, YES);
- }
- else
- {
- fileProperties.put(CONSENSUS_RESIDUE, NO);
- }
- }
-
- public void setConsensusStructureStatus(boolean status)
- {
- if (status == true)
- {
- fileProperties.put(CONSENSUS_STRUCTURE, YES);
- }
- else
- {
- fileProperties.put(CONSENSUS_STRUCTURE, NO);
- }
+ fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO);
}
/**
+ * Sets the activation status of the consensus structure annotation.
*
- * @param transition
- * type of transition occuring
- * @return index value representing position along stateTransition array.
+ * @param status
*/
- public Integer getTransitionType(String transition)
+ public void setConsensusStructureStatus(boolean status)
{
- Integer index;
- switch (transition)
- {
- case "mm":
- index = 0;
- break;
- case "mi":
- index = 1;
- break;
- case "md":
- index = 2;
- break;
- case "im":
- index = 3;
- break;
- case "ii":
- index = 4;
- break;
- case "dm":
- index = 5;
- break;
- case "dd":
- index = 6;
- break;
- default:
- index = null;
- }
- return index;
+ fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO);
}
/**
- * find the index of the node in a hidden Markov model based on the column in
+ * Finds the index of the node in a hidden Markov model based on the column in
* the alignment.
*
* @param alignmentColumn
+ * The index of the column in the alignment, with the indexes
+ * starting from 0.
*/
public Integer findNodeIndex(int alignmentColumn)
{
Integer index;
- index = nodeLookup.get(alignmentColumn);
+ index = nodeLookup.get(alignmentColumn + 1);
return index;
}
+ /**
+ * Returns the String value of a boolean: "yes" for true and "no" for false.
+ *
+ * @param value
+ * @return
+ */
public static String findStringFromBoolean(boolean value)
{
if (value)
}
/**
- * creates the HMM annotation
+ * Creates the HMM Logo alignment annotation, and populates it with
+ * information content data.
*
- * @return
+ * @return The alignment annotation.
*/
public AlignmentAnnotation createAnnotation(int length)
{
return annotation;
}
+ /**
+ * Returns the information content at a specified column.
+ *
+ * @param column
+ * Index of the column, starting from 0.
+ * @return
+ */
public float getInformationContent(int column)
{
float informationContent = 0f;
for (char symbol : symbols)
{
float freq = 0f;
- if (symbols.size() == 20)
+ if ("amino".equals(getAlphabetType()))
{
freq = ResidueProperties.aminoBackgroundFrequencies.get(symbol);
}
- if (symbols.size() == 4)
+ if ("DNA".equals(getAlphabetType()))
{
- freq = ResidueProperties.nucleotideBackgroundFrequencies
+ freq = ResidueProperties.dnaBackgroundFrequencies.get(symbol);
+ }
+ if ("RNA".equals(getAlphabetType()))
+ {
+ freq = ResidueProperties.rnaBackgroundFrequencies
.get(symbol);
}
Double hmmProb = getMatchEmissionProbability(column, symbol);
return informationContent;
}
+ /**
+ * Returns the consensus sequence based on the most probable symbol at each
+ * position. The sequence is adjusted to match the length of the existing
+ * sequence alignment. Gap characters are used as padding.
+ *
+ * @param length
+ * The length of the longest sequence in the existing alignment.
+ * @return
+ */
+ public Sequence getConsensusSequence(int length)
+ {
+ int start;
+ int end;
+ int modelLength;
+ start = getNodeAlignmentColumn(1);
+ modelLength = getLength();
+ end = getNodeAlignmentColumn(modelLength);
+ char[] sequence = new char[length];
+ for (int index = 0; index < length; index++)
+ {
+ Character character;
+ if (consensusResidueIsActive())
+ {
+ character = getConsensusAtAlignColumn(index);
+ }
+ else
+ {
+ character = findConsensusCharacter(index);
+ }
+ if (character == null || character == '-')
+ {
+ sequence[index] = '-';
+ }
+ else
+ {
+ sequence[index] = Character.toUpperCase(character);
+ }
+ }
+
+
+ Sequence seq = new Sequence("HMM CONSENSUS", sequence, start, end);
+ return seq;
+ }
+
+ /**
+ * Finds the most probable character at a column in an alignment based on the
+ * HMM.
+ *
+ * @param column
+ * The index of the column in the alignment.
+ * @return
+ */
+ Character findConsensusCharacter(int column)
+ {
+ Character mostLikely = null;
+ double highestProb = 0;
+ for (char character : symbols)
+ {
+ Double prob = getMatchEmissionProbability(column, character);
+ if (prob > highestProb)
+ {
+ highestProb = prob;
+ mostLikely = character;
+ }
+ }
+ return mostLikely;
+ }
+
+ /**
+ * Maps the nodes of the hidden Markov model to the reference annotation.
+ */
+ public void mapToReferenceAnnotation(AlignmentAnnotation alAnnotation)
+ {
+ Annotation[] annots = alAnnotation.annotations;
+ {
+ int nodeIndex = 0;
+ for (int col = 0; col < annots.length; col++)
+ {
+ String character = annots[col].displayCharacter;
+ if ("x".equals(character) || "X".equals(character))
+ {
+ nodeIndex++;
+ if (nodeIndex < nodes.size())
+ {
+ nodes.get(nodeIndex).setAlignmentColumn(col + 1);
+ nodeLookup.put(col + 1, nodeIndex);
+ }
+ else
+ {
+ System.out.println(
+ "The reference annotation contains more consensus columns than the hidden Markov model");
+ break;
+ }
+ }
+ else
+ {
+ nodeLookup.remove(col + 1);
+ }
+ }
+
+ }
+ }
}