From dad3f91c2f9a38ce8c64a688b6f1ba4f539af9fc Mon Sep 17 00:00:00 2001 From: gmungoc Date: Fri, 23 Feb 2018 15:52:36 +0000 Subject: [PATCH] JAL-2599 parser and datamodel use primitive double[] not List --- src/jalview/analysis/AAFrequency.java | 11 +- src/jalview/datamodel/HMMNode.java | 56 +- src/jalview/datamodel/HiddenMarkovModel.java | 621 ++++---------------- src/jalview/io/HMMFile.java | 592 ++++++++----------- test/jalview/datamodel/HiddenMarkovModelTest.java | 4 +- test/jalview/hmmer/HMMERTest.java | 4 +- test/jalview/io/HMMFileTest.java | 336 +++++------ .../HMMProbabilityDistributionAnalyserTest.java | 2 +- 8 files changed, 577 insertions(+), 1049 deletions(-) diff --git a/src/jalview/analysis/AAFrequency.java b/src/jalview/analysis/AAFrequency.java index f77517c..10ae253 100755 --- a/src/jalview/analysis/AAFrequency.java +++ b/src/jalview/analysis/AAFrequency.java @@ -205,11 +205,12 @@ public class AAFrequency boolean removeBelowBackground, boolean infoLetterHeight) { ProfileI[] result = new ProfileI[width]; - int symbolCount = hmm.getNumberOfSymbols(); + char[] symbols = hmm.getSymbols().toCharArray(); + int symbolCount = symbols.length; for (int column = start; column < end; column++) { ResidueCount counts = new ResidueCount(); - for (char symbol : hmm.getSymbols()) + for (char symbol : symbols) { int value = getAnalogueCount(hmm, column, symbol, removeBelowBackground, infoLetterHeight); @@ -867,15 +868,15 @@ public class AAFrequency { return null; } - int size = hmm.getNumberOfSymbols(); + String alphabet = hmm.getSymbols(); + int size = alphabet.length(); char symbols[] = new char[size]; int values[] = new int[size]; - List charList = hmm.getSymbols(); int totalCount = 0; for (int i = 0; i < size; i++) { - char symbol = charList.get(i); + char symbol = alphabet.charAt(i); symbols[i] = symbol; int value = getAnalogueCount(hmm, column, symbol, removeBelowBackground, infoHeight); diff --git a/src/jalview/datamodel/HMMNode.java 
b/src/jalview/datamodel/HMMNode.java index 04e335f..93fa49e 100644 --- a/src/jalview/datamodel/HMMNode.java +++ b/src/jalview/datamodel/HMMNode.java @@ -1,8 +1,5 @@ package jalview.datamodel; -import java.util.ArrayList; -import java.util.List; - /** * stores data for each node in the hmm model * @author TZVanaalten @@ -11,12 +8,14 @@ import java.util.List; public class HMMNode { //contains the match emissions for each symbol - List matchEmissions = new ArrayList<>(); + double[] matchEmissions; + //contains the insert emissions for each symbol - List insertEmissions = new ArrayList<>(); - //contains the state transitions for each possible transition. These are bm, bi, bd, im, ii, dm and dd in order (0th position in - // the array indicates the probability of a bm transition) - List stateTransitions = new ArrayList<>(); + double[] insertEmissions; + + // contains the state transitions for each possible transition. These are mm, + // mi, md, im, ii, dm and dd in order + double[] stateTransitions; //annotations Integer alignmentColumn = null; @@ -25,47 +24,54 @@ public class HMMNode char maskValue; char consensusStructure; + /** + * Constructor + */ public HMMNode() { } - public HMMNode(HMMNode node) + public double[] getMatchEmissions() { - matchEmissions = new ArrayList<>(node.getMatchEmissions()); - insertEmissions = new ArrayList<>(node.getInsertEmissions()); - stateTransitions = new ArrayList<>(node.getStateTransitions()); - alignmentColumn = new Integer(node.getAlignmentColumn()); - consensusResidue = node.getConsensusResidue(); - referenceAnnotation = node.getReferenceAnnotation(); - maskValue = node.getMaskValue(); - consensusStructure = node.getConsensusStructure(); + return matchEmissions; } - public List getMatchEmissions() + public double getMatchEmission(int symbolIndex) { - return matchEmissions; + return matchEmissions[symbolIndex]; } - public void setMatchEmissions(List matchEmissionsL) + public void setMatchEmissions(double[] matches) { - 
this.matchEmissions = matchEmissionsL; + this.matchEmissions = matches; } - public List getInsertEmissions() + + public double[] getInsertEmissions() { return insertEmissions; } - public void setInsertEmissions(List insertEmissionsL) + public double getInsertEmission(int symbolIndex) + { + return insertEmissions[symbolIndex]; + } + + public void setInsertEmissions(double[] insertEmissionsL) { this.insertEmissions = insertEmissionsL; } - public List getStateTransitions() + public double[] getStateTransitions() { return stateTransitions; } - public void setStateTransitions(List stateTransitionsM) + public double getStateTransition(int transition) + { + return stateTransitions[transition]; + } + + public void setStateTransitions(double[] stateTransitionsM) { this.stateTransitions = stateTransitionsM; } diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index 506d73a..36d32ed 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -1,104 +1,27 @@ package jalview.datamodel; +import jalview.io.HMMFile; import jalview.schemes.ResidueProperties; import jalview.util.Comparison; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; /** - * Data structure which stores a hidden Markov model. Currently contains file - * properties as well, not sure whether these should be transferred to the - * HMMFile class + * Data structure which stores a hidden Markov model * * @author TZVanaalten * */ public class HiddenMarkovModel { - private static final double LOG2 = Math.log(2); - - // Stores file properties. Do not directly access this field as it contains - // only string value - use the getter methods. For example, to find the length - // of theHMM, use getModelLength()to return an int value - Map fileProperties = new HashMap<>(); - - // contains all of the symbols used in this model. 
The index of each symbol - // represents its lookup value - List symbols = new ArrayList<>(); - - // contains information for each node in the model. The begin node is at index - // 0. Node 0 contains average emission probabilities for each symbol - List nodes = new ArrayList<>(); - - // contains the HMM node for each alignment column, alignment columns start at - // index 0; - Map nodeLookup = new HashMap<>(); - - // contains the symbol index for each symbol - Map symbolIndexLookup = new HashMap<>(); - - final static String YES = "yes"; - - final static String NO = "no"; - - // keys for file properties hashmap - private static final String NAME = "NAME"; - - private static final String ACCESSION_NUMBER = "ACC"; - - private static final String DESCRIPTION = "DESC"; - - private static final String LENGTH = "LENG"; - - private static final String MAX_LENGTH = "MAXL"; - - private static final String ALPHABET = "ALPH"; - - private static final String DATE = "DATE"; - - private static final String COMMAND_LOG = "COM"; + public final static String YES = "yes"; - private static final String NUMBER_OF_SEQUENCES = "NSEQ"; + public final static String NO = "no"; - private static final String EFF_NUMBER_OF_SEQUENCES = "EFFN"; - - private static final String CHECK_SUM = "CKSUM"; - - private static final String GATHERING_THRESHOLDS = "GA"; - - private static final String TRUSTED_CUTOFFS = "TC"; - - private static final String NOISE_CUTOFFS = "NC"; - - private static final String STATISTICS = "STATS"; - - private static final String COMPO = "COMPO"; - - private static final String GATHERING_THRESHOLD = "GA"; - - private static final String TRUSTED_CUTOFF = "TC"; - - private final String NOISE_CUTOFF = "NC"; - - private static final String VITERBI = "VITERBI"; - - private static final String MSV = "MSV"; - - private static final String FORWARD = "FORWARD"; - - private static final String MAP = "MAP"; - - private static final String REFERENCE_ANNOTATION = "RF"; - - private static 
final String CONSENSUS_RESIDUE = "CONS"; - - private static final String CONSENSUS_STRUCTURE = "CS"; - - private static final String MASKED_VALUE = "MM"; - public static final int MATCHTOMATCH = 0; public static final int MATCHTOINSERT = 1; @@ -113,7 +36,35 @@ public class HiddenMarkovModel public static final int DELETETODELETE = 6; + private static final double LOG2 = Math.log(2); + + /* + * properties read from HMM file header lines + */ + Map fileProperties = new HashMap<>(); + String fileHeader; + + /* + * the symbols used in this model e.g. "ACGT" + */ + String alphabet; + + /* + * symbol lookup index into the alphabet for 'A' to 'Z' + */ + int[] symbolIndexLookup = new int['Z' - 'A' + 1]; + + /* + * Nodes in the model. The begin node is at index 0, and contains + * average emission probabilities for each symbol. + */ + List nodes = new ArrayList<>(); + + /* + * lookup of the HMM node for each alignment column (from 0) + */ + Map nodeLookup = new HashMap<>(); /** * Constructor @@ -126,11 +77,10 @@ public class HiddenMarkovModel { super(); this.fileProperties = new HashMap<>(hmm.fileProperties); - this.symbols = new ArrayList<>(hmm.symbols); + this.alphabet = hmm.alphabet; this.nodes = new ArrayList<>(hmm.nodes); this.nodeLookup = new HashMap<>(hmm.nodeLookup); - this.symbolIndexLookup = new HashMap<>( - hmm.symbolIndexLookup); + this.symbolIndexLookup = hmm.symbolIndexLookup; this.fileHeader = new String(hmm.fileHeader); } @@ -150,7 +100,7 @@ public class HiddenMarkovModel { float informationContent = 0f; - for (char symbol : getSymbols()) + for (char symbol : getSymbols().toCharArray()) { float freq = ResidueProperties.backgroundFrequencies .get(getAlphabetType()).get(symbol); @@ -184,38 +134,16 @@ public class HiddenMarkovModel } /** - * Returns the map containing the matches between nodes and alignment column - * indexes. 
+ * Returns the symbols used in this hidden Markov model * * @return - * */ - public Map getNodeLookup() + public String getSymbols() { - return nodeLookup; - } - - /** - * Returns the list of symbols used in this hidden Markov model. - * - * @return - */ - public List getSymbols() - { - return symbols; + return alphabet; } /** - * Returns the file properties. - * - * @return - */ - public Map getFileProperties() - { - return fileProperties; - } - - /** * Gets the node in the hidden Markov model at the specified position. * * @param nodeIndex @@ -229,20 +157,7 @@ public class HiddenMarkovModel */ public HMMNode getNode(int nodeIndex) { - return getNodes().get(nodeIndex); - } - - /** - * Sets the list of symbols used in the hidden Markov model to the list - * specified. - * - * @param symbolsL - * The list of symbols to which the current list is to be changed. - * - */ - public void setSymbols(List symbolsL) - { - this.symbols = symbolsL; + return nodes.get(nodeIndex); } /** @@ -252,27 +167,31 @@ public class HiddenMarkovModel */ public String getName() { - return fileProperties.get(NAME); + return fileProperties.get(HMMFile.NAME); } /** - * Returns the accession number. + * Answers the string value of the property (parsed from an HMM file) for the + * given key, or null if the property is not present + * + * @param key * @return */ - public String getAccessionNumber() + public String getProperty(String key) { - return fileProperties.get(ACCESSION_NUMBER); + return fileProperties.get(key); } /** - * Returns a description of the sequence alignment on which the hidden Markov - * model is based. 
+ * Answers true if the property with the given key is present with a value of + * "yes" (not case-sensitive), else false * + * @param key * @return */ - public String getDescription() + public boolean getBooleanProperty(String key) { - return fileProperties.get(DESCRIPTION); + return YES.equalsIgnoreCase(fileProperties.get(key)); } /** @@ -282,108 +201,60 @@ public class HiddenMarkovModel */ public Integer getLength() { - if (fileProperties.get(LENGTH) == null) + if (fileProperties.get(HMMFile.LENGTH) == null) { return null; } - return Integer.parseInt(fileProperties.get(LENGTH)); + return Integer.parseInt(fileProperties.get(HMMFile.LENGTH)); } /** - * Returns the max instance length within the hidden Markov model. - * - * @return - */ - public Integer getMaxInstanceLength() - { - if (fileProperties.get(MAX_LENGTH) == null) - { - return null; - } - return Integer.parseInt(fileProperties.get(MAX_LENGTH)); - } - - /** - * Returns the type of symbol alphabet - "amino", "DNA", "RNA" are the - * options. Other alphabets may be added. + * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are + * the options. Other alphabets may be added. * * @return */ public String getAlphabetType() { - return fileProperties.get(ALPHABET); + return fileProperties.get(HMMFile.ALPHABET); } /** - * Returns the date as a String. + * Sets the model alphabet to the symbols in the given string (ignoring any + * whitespace), and returns the number of symbols * - * @return + * @param symbols */ - public String getDate() + public int setAlphabet(String symbols) { - return fileProperties.get(DATE); - } + String trimmed = symbols.toUpperCase().replaceAll("\\s", ""); + int count = trimmed.length(); + alphabet = trimmed; + symbolIndexLookup = new int['Z' - 'A' + 1]; + Arrays.fill(symbolIndexLookup, -1); + int ignored = 0; - /** - * Returns the command line log. 
- * - * @return - */ - public String getCommandLineLog() - { - return fileProperties.get(COMMAND_LOG); - } - - /** - * Returns the number of sequences on which the HMM was trained. - * - * @return - */ - public Integer getNumberOfSequences() - { - if (fileProperties.get(NUMBER_OF_SEQUENCES) == null) - { - return null; - } - return Integer.parseInt(fileProperties.get(NUMBER_OF_SEQUENCES)); - } - - /** - * Returns the effective number of sequences on which the HMM was based. - * - * @param value - */ - public Double getEffectiveNumberOfSequences() - { - if (fileProperties.get(LENGTH) == null) + /* + * save the symbols in order, and a quick lookup of symbol position + */ + for (short i = 0; i < count; i++) { - return null; - } - return Double.parseDouble(fileProperties.get(EFF_NUMBER_OF_SEQUENCES)); - } - - /** - * Returns the checksum. - * - * @return - */ - public Long getCheckSum() - { - if (fileProperties.get(LENGTH) == null) - { - return null; + char symbol = trimmed.charAt(i); + if (symbol >= 'A' && symbol <= 'Z' + && symbolIndexLookup[symbol - 'A'] == -1) + { + symbolIndexLookup[symbol - 'A'] = i; + } + else + { + System.err + .println( + "Unexpected or duplicated character in HMM ALPHabet: " + + symbol); + ignored++; + } } - return Long.parseLong(fileProperties.get(CHECK_SUM)); - } - - /** - * Returns the list of nodes in this HMM. 
- * - * @return - */ - public List getNodes() - { - return nodes; + return count - ignored; } /** @@ -412,16 +283,12 @@ public class HiddenMarkovModel */ public double getMatchEmissionProbability(int alignColumn, char symbol) { - if (!symbolIndexLookup.containsKey(symbol)) - { - return 0d; - } - int symbolIndex = symbolIndexLookup.get(symbol); + int symbolIndex = getSymbolIndex(symbol); double probability = 0d; - if (nodeLookup.containsKey(alignColumn)) + if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn)) { HMMNode node = nodeLookup.get(alignColumn); - probability = node.getMatchEmissions().get(symbolIndex); + probability = node.getMatchEmission(symbolIndex); } return probability; } @@ -440,16 +307,12 @@ public class HiddenMarkovModel */ public double getInsertEmissionProbability(int alignColumn, char symbol) { - if (!symbolIndexLookup.containsKey(symbol)) - { - return 0d; - } - int symbolIndex = symbolIndexLookup.get(symbol); + int symbolIndex = getSymbolIndex(symbol); double probability = 0d; - if (nodeLookup.containsKey(alignColumn)) + if (symbolIndex != -1 && nodeLookup.containsKey(alignColumn)) { HMMNode node = nodeLookup.get(alignColumn); - probability = node.getInsertEmissions().get(symbolIndex); + probability = node.getInsertEmission(symbolIndex); } return probability; } @@ -473,7 +336,7 @@ public class HiddenMarkovModel if (nodeLookup.containsKey(alignColumn)) { HMMNode node = nodeLookup.get(alignColumn); - probability = node.getStateTransitions().get(transition); + probability = node.getStateTransition(transition); } return probability; } @@ -517,7 +380,7 @@ public class HiddenMarkovModel public char getConsensusAtAlignColumn(int columnIndex) { char mostLikely = '-'; - if (consensusResidueIsActive()) + if (getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)) { HMMNode node = nodeLookup.get(columnIndex); if (node == null) @@ -530,7 +393,7 @@ public class HiddenMarkovModel else { double highestProb = 0; - for (char character : symbols) + for (char 
character : alphabet.toCharArray()) { double prob = getMatchEmissionProbability(columnIndex, character); if (prob > highestProb) @@ -588,168 +451,27 @@ public class HiddenMarkovModel } /** - * Returns the average match emission probability for a given symbol - * - * @param symbolIndex - * The index of the symbol. - * @return - * - */ - public double getAverageMatchEmission(int symbolIndex) - { - double value = nodes.get(0).getMatchEmissions().get(symbolIndex); - return value; - } - - /** * Returns the number of symbols in the alphabet used in this HMM. * * @return */ public int getNumberOfSymbols() { - return symbols.size(); + return alphabet.length(); } /** - * Adds a file property. + * Sets a property read from an HMM file * * @param key * @param value */ - public void addFileProperty(String key, String value) + public void setProperty(String key, String value) { fileProperties.put(key, value); } /** - * Returns a boolean indicating whether the reference annotation is active. - * - * @return - */ - public boolean referenceAnnotationIsActive() - { - String status; - status = fileProperties.get(REFERENCE_ANNOTATION); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the mask value annotation is active. - * - * @return - */ - public boolean maskValueIsActive() - { - String status; - status = fileProperties.get(MASKED_VALUE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the consensus residue annotation is - * active. 
- * - * @return - */ - public boolean consensusResidueIsActive() - { - String status; - status = fileProperties.get(CONSENSUS_RESIDUE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the consensus structure annotation is - * active. - * - * @return - */ - public boolean consensusStructureIsActive() - { - String status; - status = fileProperties.get(CONSENSUS_STRUCTURE); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** - * Returns a boolean indicating whether the MAP annotation is active. - * - * @return - */ - public boolean mapIsActive() - { - String status; - status = fileProperties.get(MAP); - if (status == null) - { - return false; - } - switch (status) - { - case YES: - return true; - case NO: - return false; - default: - return false; - } - - } - - /** * Sets the alignment column of the specified node * * @param nodeIndex @@ -797,7 +519,7 @@ public class HiddenMarkovModel } /** - * Sets the reference annotation at a given node. + * Sets the reference annotation at a given node * * @param nodeIndex * @param value @@ -808,7 +530,7 @@ public class HiddenMarkovModel } /** - * Sets the consensus residue at a given node. + * Sets the consensus residue at a given node * * @param nodeIndex * @param value @@ -819,7 +541,7 @@ public class HiddenMarkovModel } /** - * Sets the consensus structure at a given node. + * Sets the consensus structure at a given node * * @param nodeIndex * @param value @@ -830,7 +552,7 @@ public class HiddenMarkovModel } /** - * Sets the mask value at a given node. 
+ * Sets the mask value at a given node * * @param nodeIndex * @param value @@ -845,46 +567,10 @@ public class HiddenMarkovModel * * @return */ - public String getGatheringThreshold() - { - String value; - value = fileProperties.get("GA"); - return value; - } - - /** - * Temporary implementation, should not be used. - * - * @return - */ - public String getNoiseCutoff() - { - String value; - value = fileProperties.get("NC"); - return value; - } - - /** - * Temporary implementation, should not be used. - * - * @return - */ - public String getTrustedCutoff() - { - String value; - value = fileProperties.get("TC"); - return value; - } - - /** - * Temporary implementation, should not be used. - * - * @return - */ public String getViterbi() { String value; - value = fileProperties.get(VITERBI); + value = fileProperties.get(HMMFile.VITERBI); return value; } @@ -896,7 +582,7 @@ public class HiddenMarkovModel public String getMSV() { String value; - value = fileProperties.get(MSV); + value = fileProperties.get(HMMFile.MSV); return value; } @@ -908,61 +594,11 @@ public class HiddenMarkovModel public String getForward() { String value; - value = fileProperties.get(FORWARD); + value = fileProperties.get(HMMFile.FORWARD); return value; } /** - * Sets the activation status of the MAP annotation. - * - * @param status - */ - public void setMAPStatus(boolean status) - { - fileProperties.put(MAP, status ? YES : NO); - } - - /** - * Sets the activation status of the reference annotation. - * - * @param status - */ - public void setReferenceAnnotationStatus(boolean status) - { - fileProperties.put(REFERENCE_ANNOTATION, status ? YES : NO); - } - - /** - * Sets the activation status of the mask value annotation. - * - * @param status - */ - public void setMaskedValueStatus(boolean status) - { - fileProperties.put(MASKED_VALUE, status ? YES : NO); - } - - /** - * Sets the activation status of the consensus residue annotation. 
- * - * @param status - */ - public void setConsensusResidueStatus(boolean status) - { - fileProperties.put(CONSENSUS_RESIDUE, status ? YES : NO); - } - - /** - * Sets the activation status of the consensus structure annotation. - * - * @param status - */ - public void setConsensusStructureStatus(boolean status) - { - fileProperties.put(CONSENSUS_STRUCTURE, status ? YES : NO); - } - - /** * Answers the HMMNode mapped to the given alignment column (base 0), or null * if none is mapped * @@ -974,32 +610,10 @@ public class HiddenMarkovModel } /** - * Finds the String values of a boolean. "yes" for true and "no" for false. - * - * @param value - * @return - */ - public static String findStringFromBoolean(boolean value) - { - if (value) - { - return YES; - } - else - { - return NO; - } - } - - - - /** * Returns the consensus sequence based on the most probable symbol at each * position. The sequence is adjusted to match the length of the existing * sequence alignment. Gap characters are used as padding. * - * @param length - * The length of the longest sequence in the existing alignment. * @return */ public Sequence getConsensusSequence() @@ -1047,16 +661,29 @@ public class HiddenMarkovModel return consensus; } - public int getSymbolIndex(char c) + /** + * Answers the index position (0...) 
of the given symbol, or -1 if not a valid + * symbol for this HMM + * + * @param symbol + * @return + */ + public int getSymbolIndex(char symbol) { - return symbolIndexLookup.get(c); + /* + * symbolIndexLookup holds the index for 'A' to 'Z' + */ + char c = Character.toUpperCase(symbol); + if ('A' <= c && c <= 'Z') + { + return symbolIndexLookup[c - 'A']; + } + return -1; } - public void setSymbolIndex(Character c, Integer i) + public void addNode(HMMNode node) { - symbolIndexLookup.put(c, i); + nodes.add(node); } - - } diff --git a/src/jalview/io/HMMFile.java b/src/jalview/io/HMMFile.java index ca5cf1c..2733fb4 100644 --- a/src/jalview/io/HMMFile.java +++ b/src/jalview/io/HMMFile.java @@ -23,28 +23,89 @@ import java.util.Scanner; public class HMMFile extends AlignFile implements AlignmentFileReaderI, AlignmentFileWriterI { - private static final int NUMBER_OF_TRANSITIONS = 7; + /* + * keys to data in HMM file, used to store as properties of the HiddenMarkovModel + */ + private static final String HMM = "HMM"; - private static final String SPACE = " "; + public static final String NAME = "NAME"; + + public static final String ACCESSION_NUMBER = "ACC"; + + public static final String DESCRIPTION = "DESC"; + + public static final String LENGTH = "LENG"; + + public static final String MAX_LENGTH = "MAXL"; + + public static final String ALPHABET = "ALPH"; + + private static final String ALPH_AMINO = "amino"; + + private static final String ALPH_DNA = "DNA"; + + private static final String ALPH_RNA = "RNA"; + + private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY"; + + private static final String ALPHABET_DNA = "ACGT"; + + private static final String ALPHABET_RNA = "ACGU"; + + public static final String DATE = "DATE"; + + public static final String COMMAND_LOG = "COM"; + + public static final String NUMBER_OF_SEQUENCES = "NSEQ"; + + public static final String EFF_NUMBER_OF_SEQUENCES = "EFFN"; + + public static final String CHECK_SUM = "CKSUM"; - private 
static final String COMPO = "COMPO"; + public static final String STATISTICS = "STATS"; - private static final String EMPTY = ""; + public static final String COMPO = "COMPO"; + + public static final String GATHERING_THRESHOLD = "GA"; + + public static final String TRUSTED_CUTOFF = "TC"; + + public static final String NOISE_CUTOFF = "NC"; + + public static final String VITERBI = "VITERBI"; + + public static final String MSV = "MSV"; + + public static final String FORWARD = "FORWARD"; + + public static final String MAP = "MAP"; + + public static final String REFERENCE_ANNOTATION = "RF"; + + public static final String CONSENSUS_RESIDUE = "CONS"; + + public static final String CONSENSUS_STRUCTURE = "CS"; + + public static final String MASKED_VALUE = "MM"; + + private static final int NUMBER_OF_TRANSITIONS = 7; + + private static final String SPACE = " "; /* - * guide line added to an output HMMER file, purely for readability + * optional guide line added to an output HMMER file, purely for readability */ private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d"; - private static String NL = "\n"; + private static String NL = System.lineSeparator(); private HiddenMarkovModel hmm; // number of symbols in the alphabet used in the hidden Markov model - int numberOfSymbols; + private int numberOfSymbols; /** - * Parses immediately. + * Constructor that parses immediately * * @param inFile * @param type @@ -56,7 +117,7 @@ public class HMMFile extends AlignFile } /** - * Parses immediately. + * Constructor that parses immediately * * @param source * @throws IOException @@ -67,15 +128,14 @@ public class HMMFile extends AlignFile } /** - * Default constructor, do not use! + * Default constructor */ public HMMFile() { - } /** - * Constructor for HMMFile used for exporting. 
+ * Constructor for HMMFile used for exporting * * @param hmm * @param exportImmediately @@ -86,17 +146,7 @@ public class HMMFile extends AlignFile } /** - * For testing, do not use. - * - * @param br - */ - HMMFile(BufferedReader br) - { - dataIn = br; - } - - /** - * Returns the HMM produced by reading in a HMMER3 file. + * Returns the HMM produced by parsing a HMMER3 file * * @return */ @@ -106,17 +156,7 @@ public class HMMFile extends AlignFile } /** - * Sets the HMM used in this file. - * - * @param model - */ - public void setHMM(HiddenMarkovModel model) - { - this.hmm = model; - } - - /** - * Gets the name of the hidden Markov model. + * Gets the name of the hidden Markov model * * @return */ @@ -126,17 +166,15 @@ public class HMMFile extends AlignFile } /** - * Reads the data from HMM file into the HMM field on this object. - * - * @throws IOException + * Reads the data from HMM file into the HMM model */ @Override - public void parse() throws IOException + public void parse() { try { hmm = new HiddenMarkovModel(); - parseFileProperties(dataIn); + parseHeaderLines(dataIn); parseModel(dataIn); } catch (Exception e) { @@ -145,100 +183,88 @@ public class HMMFile extends AlignFile } /** - * Reads the data from HMM file into the HMM field on this object. - * - * @throws IOException - */ - - public void parse(BufferedReader br) throws IOException - { - hmm = new HiddenMarkovModel(); - parseFileProperties(br); - parseModel(br); - } - - - - /** - * Imports the file properties from a HMMER3 file. + * Reads the header properties from a HMMER3 file and saves them in the + * HiddenMarkovModel. This method exits after reading the next line after the + * HMM line. * * @param input - * The buffered reader used to read in the file. 
* @throws IOException */ - void parseFileProperties(BufferedReader input) throws IOException + void parseHeaderLines(BufferedReader input) throws IOException { - boolean readingFile = true; + boolean readingHeaders = true; hmm.setFileHeader(input.readLine()); String line = input.readLine(); - while (readingFile) + while (readingHeaders && line != null) { - if (line != null) + Scanner parser = new Scanner(line); + String next = parser.next(); + if (ALPHABET.equals(next)) { - Scanner parser = new Scanner(line); - String next = parser.next(); - if ("HMM".equals(next)) // indicates start of HMM data (end of file - // properties) - { - readingFile = false; - fillSymbols(parser); - numberOfSymbols = hmm.getNumberOfSymbols(); - } - else if ("STATS".equals(next)) - { - parser.next(); - String key; - String value; - key = parser.next(); - value = parser.next() + SPACE + SPACE + parser.next(); - hmm.addFileProperty(key, value); - } - else + String alphabetType = parser.next(); + hmm.setProperty(ALPHABET, alphabetType); + String alphabet = ALPH_DNA.equalsIgnoreCase(alphabetType) + ? ALPHABET_DNA + : (ALPH_RNA.equalsIgnoreCase(alphabetType) ? 
ALPHABET_RNA + : ALPHABET_AMINO); + numberOfSymbols = hmm.setAlphabet(alphabet); + } + else if (HMM.equals(next)) + { + readingHeaders = false; + String symbols = line.substring(line.indexOf(HMM) + HMM.length()); + numberOfSymbols = hmm.setAlphabet(symbols); + } + else if (STATISTICS.equals(next)) + { + parser.next(); + String key; + String value; + key = parser.next(); + value = parser.next() + SPACE + SPACE + parser.next(); + hmm.setProperty(key, value); + } + else + { + String key = next; + String value = parser.next(); + while (parser.hasNext()) { - String key = next; - String value = parser.next(); - while (parser.hasNext()) - { - value = value + SPACE + parser.next(); - } - hmm.addFileProperty(key, value); + value = value + SPACE + parser.next(); } - parser.close(); + hmm.setProperty(key, value); } + parser.close(); line = input.readLine(); - if (line == null) - { - readingFile = false; - } } - } /** * Parses the model data from the HMMER3 file * * @param input - * The buffered reader used to read the file. 
* @throws IOException */ void parseModel(BufferedReader input) throws IOException { boolean first = true; + // specification says there must always be an HMM header + // and one more header which is skipped here String line = input.readLine(); while (!"//".equals(line)) { HMMNode node = new HMMNode(); - hmm.getNodes().add(node); + hmm.addNode(node); Scanner matchReader = new Scanner(line); String next = matchReader.next(); if (next.equals(COMPO) || !first) { // stores match emission line in list - List matches = new ArrayList<>(); - matches = fillList(matchReader, numberOfSymbols); + double[] matches = parseDoubles(matchReader, numberOfSymbols); node.setMatchEmissions(matches); if (!first) { + // TODO handle files with no column map (make our own) int column = parseAnnotations(matchReader, node); hmm.setAlignmentColumn(node, column - 1); } @@ -247,16 +273,15 @@ public class HMMFile extends AlignFile // stores insert emission line in list line = input.readLine(); Scanner insertReader = new Scanner(line); - List inserts = new ArrayList<>(); - inserts = fillList(insertReader, numberOfSymbols); + double[] inserts = parseDoubles(insertReader, numberOfSymbols); node.setInsertEmissions(inserts); insertReader.close(); // stores state transition line in list line = input.readLine(); Scanner transitionReader = new Scanner(line); - List transitions = new ArrayList<>(); - transitions = fillList(transitionReader, NUMBER_OF_TRANSITIONS); + double[] transitions = parseDoubles(transitionReader, + NUMBER_OF_TRANSITIONS); node.setStateTransitions(transitions); transitionReader.close(); line = input.readLine(); @@ -281,7 +306,7 @@ public class HMMFile extends AlignFile * HMM counts columns from 1, convert to base 0 for Jalview */ int column = 0; - if (hmm.mapIsActive() && scanner.hasNext()) + if (hmm.getBooleanProperty(MAP) && scanner.hasNext()) { column = scanner.nextInt(); node.setAlignmentColumn(column - 1); @@ -335,43 +360,36 @@ public class HMMFile extends AlignFile } /** - * 
Fills a list of doubles from an input line + * Fills an array of doubles parsed from an input line * * @param input - * The scanner for the line containing the data to be transferred to - * the list. * @param numberOfElements - * The number of elements in the list to be filled. - * @return filled list Returns the list of doubles. + * @return * @throws IOException */ - static List fillList(Scanner input, + static double[] parseDoubles(Scanner input, int numberOfElements) throws IOException { - List list = new ArrayList<>(); + double[] values = new double[numberOfElements]; for (int i = 0; i < numberOfElements; i++) { - + if (!input.hasNext()) + { + throw new IOException("Incomplete data"); + } String next = input.next(); - if (next.contains("*")) // state transitions to or from delete states - // occasionally have values of -infinity. These - // values are represented by an * in the .hmm - // file. + if (next.contains("*")) { - list.add(Double.NEGATIVE_INFINITY); + values[i] = Double.NEGATIVE_INFINITY; } else { double prob = Double.valueOf(next); prob = Math.pow(Math.E, -prob); - list.add(prob); + values[i] = prob; } } - if (list.size() < numberOfElements) - { - throw new IOException("Incomplete data"); - } - return list; + return values; } /** @@ -384,25 +402,19 @@ public class HMMFile extends AlignFile * @param columnSeparation * The separation between subsequent data entries. * @param data - * The list fo data to be added to the String. + * The list of data to be added to the String. * @return */ String addData(int initialColumnSeparation, int columnSeparation, List data) { - String line = EMPTY; - int index = 0; + String line = ""; + boolean first = true; for (String value : data) { - if (index == 0) - { - line += String.format("%" + initialColumnSeparation + "s", value); - } - else - { - line += String.format("%" + columnSeparation + "s", value); - } - index++; + int sep = first ? 
initialColumnSeparation : columnSeparation; + line += String.format("%" + sep + "s", value); + first = false; } return line; } @@ -425,23 +437,22 @@ public class HMMFile extends AlignFile } /** - * Converts a list of doubles into a list of Strings, rounded to the nearest - * 5th decimal place. + * Converts an array of doubles into a list of Strings, rounded to the nearest + * 5th decimal place * - * @param list + * @param doubles * @param noOfDecimals * @return */ - List doubleListToStringList(List list) + List doublesToStringList(double[] doubles) { List strList = new ArrayList<>(); - for (double value : list) + for (double value : doubles) { String strValue; if (value > 0) { strValue = String.format("%.5f", value); - } else if (value == -0.00000d) { @@ -451,226 +462,147 @@ public class HMMFile extends AlignFile { strValue = "*"; } - strList.add(strValue); } return strList; } /** - * Converts a primitive array of Strings to a list of Strings. + * Appends model data in string format to the string builder * - * @param array - * @return + * @param output */ - List stringArrayToStringList(String[] array) + void appendModelAsString(StringBuilder output) { - List list = new ArrayList<>(); - for (String value : array) + output.append(HMM).append(" "); + String charSymbols = hmm.getSymbols(); + for (char c : charSymbols.toCharArray()) { - list.add(value); + output.append(String.format("%9s", c)); } - - return list; - } - - /** - * Returns a string containing the model data. 
- */ - String getModelAsString() - { - StringBuilder output = new StringBuilder(); - String symbolLine = "HMM"; - List charSymbols = hmm.getSymbols(); - List strSymbols; - strSymbols = charListToStringList(charSymbols); - symbolLine += addData(11, 9, strSymbols); - output.append(symbolLine); output.append(NL).append(TRANSITIONTYPELINE); int length = hmm.getLength(); - for (int node = 0; node <= length; node++) + for (int nodeNo = 0; nodeNo <= length; nodeNo++) { - String matchLine; - if (node == 0) - { - matchLine = String.format("%7s", "COMPO"); - } - else - { - matchLine = String.format("%7s", node); - } + String matchLine = String.format("%7s", + nodeNo == 0 ? "COMPO" : Integer.toString(nodeNo)); - List strMatches; - List doubleMatches; - doubleMatches = convertListToLogSpace( - hmm.getNode(node).getMatchEmissions()); - strMatches = doubleListToStringList(doubleMatches); + double[] doubleMatches = convertToLogSpace( + hmm.getNode(nodeNo).getMatchEmissions()); + List strMatches = doublesToStringList(doubleMatches); matchLine += addData(10, 9, strMatches); - - if (node != 0) + if (nodeNo != 0) { - matchLine += SPACE + (hmm.getNodeAlignmentColumn(node) + 1); - matchLine += SPACE + hmm.getConsensusResidue(node); - matchLine += SPACE + hmm.getReferenceAnnotation(node); + matchLine += SPACE + (hmm.getNodeAlignmentColumn(nodeNo) + 1); + matchLine += SPACE + hmm.getConsensusResidue(nodeNo); + matchLine += SPACE + hmm.getReferenceAnnotation(nodeNo); if (hmm.getFileHeader().contains("HMMER3/f")) { - matchLine += SPACE + hmm.getMaskedValue(node); - matchLine += SPACE + hmm.getConsensusStructure(node); + matchLine += SPACE + hmm.getMaskedValue(nodeNo); + matchLine += SPACE + hmm.getConsensusStructure(nodeNo); } - } output.append(NL).append(matchLine); - String insertLine = EMPTY; - List strInserts; - List doubleInserts; - doubleInserts = convertListToLogSpace( - hmm.getNode(node).getInsertEmissions()); - strInserts = doubleListToStringList(doubleInserts); + String 
insertLine = ""; + + double[] doubleInserts = convertToLogSpace( + hmm.getNode(nodeNo).getInsertEmissions()); + List strInserts = doublesToStringList(doubleInserts); insertLine += addData(17, 9, strInserts); output.append(NL).append(insertLine); - String transitionLine = EMPTY; - List strTransitions; - List doubleTransitions; - doubleTransitions = convertListToLogSpace( - hmm.getNode(node).getStateTransitions()); - strTransitions = doubleListToStringList(doubleTransitions); + String transitionLine = ""; + double[] doubleTransitions = convertToLogSpace( + hmm.getNode(nodeNo).getStateTransitions()); + List strTransitions = doublesToStringList( + doubleTransitions); transitionLine += addData(17, 9, strTransitions); output.append(NL).append(transitionLine); } - return output.toString(); } /** - * Returns a String containing the HMM file properties + * Appends formatted HMM file properties to the string builder + * + * @param output */ - String getFilePropertiesAsString() + void appendProperties(StringBuilder output) { - StringBuffer output = new StringBuffer(); - String line; - output.append(hmm.getFileHeader()); - - line = String.format("%-5s %1s", "NAME", hmm.getName()); - output.append(NL + line); - if (hmm.getAccessionNumber() != null) - { - line = String.format("%-5s %1s", "ACC", hmm.getAccessionNumber()); - output.append(NL + line); - } + String format = "%n%-5s %1s"; + appendProperty(output, format, NAME); + appendProperty(output, format, ACCESSION_NUMBER); + appendProperty(output, format, DESCRIPTION); + appendProperty(output, format, LENGTH); + appendProperty(output, format, MAX_LENGTH); + appendProperty(output, format, ALPHABET); + appendBooleanProperty(output, format, REFERENCE_ANNOTATION); + appendBooleanProperty(output, format, MASKED_VALUE); + appendBooleanProperty(output, format, CONSENSUS_RESIDUE); + appendBooleanProperty(output, format, CONSENSUS_STRUCTURE); + appendBooleanProperty(output, format, MAP); + appendProperty(output, format, DATE); + 
appendProperty(output, format, NUMBER_OF_SEQUENCES); + appendProperty(output, format, EFF_NUMBER_OF_SEQUENCES); + appendProperty(output, format, CHECK_SUM); + appendProperty(output, format, GATHERING_THRESHOLD); + appendProperty(output, format, TRUSTED_CUTOFF); + appendProperty(output, format, NOISE_CUTOFF); - if (hmm.getDescription() != null) + if (hmm.getMSV() != null) { - line = String.format("%-5s %1s", "DESC", hmm.getDescription()); - output.append(NL + line); - } - line = String.format("%-5s %1s", "LENG", hmm.getLength()); - output.append(NL + line); + output.append(String.format("%n%-19s %18s", "STATS LOCAL MSV", + hmm.getMSV())); - if (hmm.getMaxInstanceLength() != null) - { - line = String.format("%-5s %1s", "MAXL", hmm.getMaxInstanceLength()); - output.append(NL + line); - } - line = String.format("%-5s %1s", "ALPH", hmm.getAlphabetType()); - output.append(NL + line); - - boolean status; - String statusStr; - - status = hmm.referenceAnnotationIsActive(); - statusStr = HiddenMarkovModel.findStringFromBoolean(status); - line = String.format("%-5s %1s", "RF", - statusStr); - output.append(NL + line); - - status = hmm.maskValueIsActive(); - statusStr = HiddenMarkovModel.findStringFromBoolean(status); - line = String.format("%-5s %1s", "MM", - statusStr); - output.append(NL + line); - - status = hmm.consensusResidueIsActive(); - statusStr = HiddenMarkovModel.findStringFromBoolean(status); - line = String.format("%-5s %1s", "CONS", - statusStr); - output.append(NL + line); - - status = hmm.consensusStructureIsActive(); - statusStr = HiddenMarkovModel.findStringFromBoolean(status); - line = String.format("%-5s %1s", "CS", - statusStr); - output.append(NL + line); - - status = hmm.mapIsActive(); - statusStr = HiddenMarkovModel.findStringFromBoolean(status); - line = String.format("%-5s %1s", "MAP", - statusStr); - output.append(NL + line); - - - if (hmm.getDate() != null) - { - line = String.format("%-5s %1s", "DATE", hmm.getDate()); - output.append(NL + line); - 
} - if (hmm.getNumberOfSequences() != null) - { - line = String.format("%-5s %1s", "NSEQ", hmm.getNumberOfSequences()); - output.append(NL + line); - } - if (hmm.getEffectiveNumberOfSequences() != null) - { - line = String.format("%-5s %1s", "EFFN", - hmm.getEffectiveNumberOfSequences()); - output.append(NL + line); - } - if (hmm.getCheckSum() != null) - { - line = String.format("%-5s %1s", "CKSUM", hmm.getCheckSum()); - output.append(NL + line); - } - if (hmm.getGatheringThreshold() != null) - { - line = String.format("%-5s %1s", "GA", hmm.getGatheringThreshold()); - output.append(NL + line); - } + output.append(String.format("%n%-19s %18s", "STATS LOCAL VITERBI", + hmm.getViterbi())); - if (hmm.getTrustedCutoff() != null) - { - line = String.format("%-5s %1s", "TC", hmm.getTrustedCutoff()); - output.append(NL + line); + output.append(String.format("%n%-19s %18s", "STATS LOCAL FORWARD", + hmm.getForward())); } - if (hmm.getNoiseCutoff() != null) - { - line = String.format("%-5s %1s", "NC", hmm.getNoiseCutoff()); - output.append(NL + line); - } - if (hmm.getMSV() != null) + } + + /** + * Appends 'yes' or 'no' for the given property, according to whether or not + * it is set in the HMM + * + * @param output + * @param format + * @param propertyName + */ + private void appendBooleanProperty(StringBuilder output, String format, + String propertyName) + { + boolean set = hmm.getBooleanProperty(propertyName); + output.append(String.format(format, propertyName, + set ? 
HiddenMarkovModel.YES : HiddenMarkovModel.NO)); + } + + /** + * Appends the value of the given property to the output, if not null + * + * @param output + * @param format + * @param propertyName + */ + private void appendProperty(StringBuilder output, String format, + String propertyName) + { + String value = hmm.getProperty(propertyName); + if (value != null) { - line = String.format("%-19s %18s", "STATS LOCAL MSV", hmm.getMSV()); - output.append(NL + line); - - line = String.format("%-19s %18s", "STATS LOCAL VITERBI", - hmm.getViterbi()); - output.append(NL + line); - - line = String.format("%-19s %18s", "STATS LOCAL FORWARD", - hmm.getForward()); - output.append(NL + line); + output.append(String.format(format, propertyName, value)); } - return output.toString(); } - /** * Returns the char value of a single lettered String. * @@ -682,7 +614,6 @@ public class HMMFile extends AlignFile char character; character = string.charAt(0); return character; - } @Override @@ -702,33 +633,30 @@ public class HMMFile extends AlignFile */ public String print() { - StringBuffer output = new StringBuffer(); - output.append(getFilePropertiesAsString()); + StringBuilder output = new StringBuilder(); + appendProperties(output); output.append(NL); - output.append(getModelAsString()); + appendModelAsString(output); output.append(NL + "//"); return output.toString(); } /** - * Converts the probabilities contained in a list into log space. 
+ * Converts the probabilities contained in an array into log space * - * @param list + * @param ds */ - List convertListToLogSpace(List list) + double[] convertToLogSpace(double[] ds) { - - List convertedList = new ArrayList<>(); - for (int i = 0; i < list.size(); i++) + double[] converted = new double[ds.length]; + for (int i = 0; i < ds.length; i++) { - double prob = list.get(i); + double prob = ds[i]; double logProb = -1 * Math.log(prob); - convertedList.add(logProb); + converted[i] = logProb; } - return convertedList; - - + return converted; } /** @@ -741,26 +669,6 @@ public class HMMFile extends AlignFile SequenceI[] seq = new SequenceI[1]; seq[0] = hmmSeq; return seq; - - } - - /** - * Fills symbol array and adds each symbol to an index lookup - * - * @param parser - * The scanner scanning the symbol line in the file. - */ - public void fillSymbols(Scanner parser) - { - int i = 0; - while (parser.hasNext()) - { - String strSymbol = parser.next(); - char[] symbol = strSymbol.toCharArray(); - hmm.getSymbols().add(symbol[0]); - hmm.setSymbolIndex(symbol[0], i); - i++; - } } @Override diff --git a/test/jalview/datamodel/HiddenMarkovModelTest.java b/test/jalview/datamodel/HiddenMarkovModelTest.java index b283809..d13b0bc 100644 --- a/test/jalview/datamodel/HiddenMarkovModelTest.java +++ b/test/jalview/datamodel/HiddenMarkovModelTest.java @@ -110,7 +110,7 @@ public class HiddenMarkovModelTest { { assertEquals(hmm.getConsensusAtAlignColumn(10), 's'); assertEquals(hmm.getConsensusAtAlignColumn(50), 'k'); - hmm.setConsensusResidueStatus(false); + hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "no"); assertEquals(hmm.getConsensusAtAlignColumn(100), 'l'); assertEquals(hmm.getConsensusAtAlignColumn(400), 'k'); } @@ -140,7 +140,7 @@ public class HiddenMarkovModelTest { .get("amino"); int col = 4; float expected = 0f; - for (char aa : hmm.getSymbols()) + for (char aa : hmm.getSymbols().toCharArray()) { double mep = hmm.getMatchEmissionProbability(col, aa); float background = 
uniprotFreqs.get(aa); diff --git a/test/jalview/hmmer/HMMERTest.java b/test/jalview/hmmer/HMMERTest.java index 112ce21..d084d77 100644 --- a/test/jalview/hmmer/HMMERTest.java +++ b/test/jalview/hmmer/HMMERTest.java @@ -11,6 +11,7 @@ import jalview.datamodel.HiddenMarkovModel; import jalview.datamodel.SequenceI; import jalview.gui.AlignFrame; import jalview.gui.Desktop; +import jalview.io.HMMFile; import jalview.ws.params.ArgumentI; import java.io.IOException; @@ -84,7 +85,8 @@ public class HMMERTest { assertEquals(hmm.getLength().intValue(), 148); assertEquals(hmm.getAlphabetType(), "amino"); assertEquals(hmm.getName(), "Alignment"); - assertEquals(hmm.getEffectiveNumberOfSequences(), 0.648193, 0.0001); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), + "0.648193"); assertEquals(hmm.getConsensusAtAlignColumn(15), 's'); } diff --git a/test/jalview/io/HMMFileTest.java b/test/jalview/io/HMMFileTest.java index dadc1c7..387a915 100644 --- a/test/jalview/io/HMMFileTest.java +++ b/test/jalview/io/HMMFileTest.java @@ -1,7 +1,10 @@ package jalview.io; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; import jalview.datamodel.HMMNode; import jalview.datamodel.HiddenMarkovModel; @@ -13,13 +16,13 @@ import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; import java.util.Scanner; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import junit.extensions.PA; + public class HMMFileTest { HMMFile fn3; @@ -29,49 +32,46 @@ public class HMMFileTest { HMMFile made1; @BeforeClass(alwaysRun = true) - public void setUp() throws FileNotFoundException + public void setUp() throws IOException { - fn3 = new HMMFile(new BufferedReader( - new 
FileReader(("test/jalview/io/test_fn3_hmm.txt")))); + fn3 = new HMMFile("test/jalview/io/test_fn3_hmm.txt", + DataSourceType.FILE); - pKinase = new HMMFile(new BufferedReader( - new FileReader(("test/jalview/io/test_PKinase_hmm.txt")))); + pKinase = new HMMFile("test/jalview/io/test_PKinase_hmm.txt", + DataSourceType.FILE); - made1 = new HMMFile(new BufferedReader( - new FileReader(("test/jalview/io/test_MADE1_hmm.txt")))); + made1 = new HMMFile("test/jalview/io/test_MADE1_hmm.txt", + DataSourceType.FILE); } @Test(groups = "Functional") public void testParse() throws IOException { - pKinase.parse(); HiddenMarkovModel hmm = pKinase.getHMM(); assertEquals(hmm.getName(), "Pkinase"); - assertEquals(hmm.getAccessionNumber(), "PF00069.17"); - assertEquals(hmm.getDescription(), "Protein kinase domain"); + assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00069.17"); + assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), + "Protein kinase domain"); assertEquals(hmm.getLength().intValue(), 260); - assertNull(hmm.getMaxInstanceLength()); + assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); assertEquals(hmm.getAlphabetType(), "amino"); - assertEquals(hmm.referenceAnnotationIsActive(), false); - assertEquals(hmm.maskValueIsActive(), false); - assertEquals(hmm.consensusResidueIsActive(), true); - assertEquals(hmm.consensusStructureIsActive(), - true); - assertEquals(hmm.mapIsActive(), true); - assertEquals(hmm.getDate(), "Thu Jun 16 11:44:06 2011"); - assertNull(hmm.getCommandLineLog()); - assertEquals(hmm.getNumberOfSequences().intValue(), 54); - assertEquals(hmm.getEffectiveNumberOfSequences(), 3.358521, 4d); - assertEquals(hmm.getCheckSum().longValue(), 3106786190l); - assertEquals(hmm.getGatheringThreshold(), "70.30 70.30"); - assertEquals(hmm.getTrustedCutoff(), "70.30 70.30"); - assertEquals(hmm.getNoiseCutoff(), "70.20 70.20"); - - List symbols = Arrays - .asList(new Character[] - { 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', - 'P', 'Q', 'R', 'S', 
'T', 'V', 'W', 'Y' }); - assertEquals(hmm.getSymbols(), symbols); + assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); + assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); + assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); + assertEquals(hmm.getProperty(HMMFile.DATE), "Thu Jun 16 11:44:06 2011"); + assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); + assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "54"); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), + "3.358521"); + assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3106786190"); + assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), + "70.30 70.30"); + assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "70.30 70.30"); + assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "70.20 70.20"); + + assertEquals(hmm.getSymbols(), "ACDEFGHIKLMNPQRSTVWY"); assertEquals(hmm.getMatchEmissionProbability(0, 'Y'), 0.16102, 0.001d); assertEquals(hmm.getMatchEmissionProbability(11, 'P'), 0.0130, 0.001d); @@ -113,85 +113,89 @@ public class HMMFileTest { assertEquals(hmm.getConsensusResidue(145), 'a'); assertEquals(hmm.getMaskedValue(183), '-'); assertEquals(hmm.getConsensusStructure(240), 'H'); - } - @Test(priority = 0) - public void testParseFileProperties() throws IOException + @Test(groups = "Functional") + public void testParseHeaderLines_amino() throws IOException { FileReader fr = new FileReader( new File("test/jalview/io/test_fn3_hmm.txt")); BufferedReader br = new BufferedReader(fr); - fn3.setHMM(new HiddenMarkovModel()); - fn3.parseFileProperties(br); - fn3.parseModel(br); // this is for a later test - HiddenMarkovModel testHMM = new HiddenMarkovModel(); - testHMM = fn3.getHMM(); + HiddenMarkovModel hmm = new HiddenMarkovModel(); + HMMFile testee = new HMMFile(); + PA.setValue(testee, "hmm", hmm); + testee.parseHeaderLines(br); 
br.close(); fr.close(); - assertEquals(testHMM.getName(), "fn3"); - assertEquals(testHMM.getAccessionNumber(), "PF00041.13"); - assertEquals(testHMM.getDescription(), + assertEquals(hmm.getName(), "fn3"); + assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), "PF00041.13"); + assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), "Fibronectin type III domain"); - assertEquals(testHMM.getLength().intValue(), 86); - assertNull(testHMM.getMaxInstanceLength()); - assertEquals(testHMM.getAlphabetType(), "amino"); - assertEquals(testHMM.referenceAnnotationIsActive(), false); - assertEquals(testHMM.maskValueIsActive(), false); - assertEquals(testHMM.consensusResidueIsActive(), true); - assertEquals(testHMM.consensusStructureIsActive(), true); - assertEquals(testHMM.mapIsActive(), true); - assertEquals(testHMM.getDate(), "Fri Jun 20 08:22:31 2014"); - assertNull(testHMM.getCommandLineLog()); - assertEquals(testHMM.getNumberOfSequences().intValue(), 106); - assertEquals(testHMM.getEffectiveNumberOfSequences(), 11.415833, 4d); - assertEquals(testHMM.getCheckSum().longValue(), 3564431818l); - assertEquals(testHMM.getGatheringThreshold(), "8.00 7.20"); - assertEquals(testHMM.getTrustedCutoff(), "8.00 7.20"); - assertEquals(testHMM.getNoiseCutoff(), "7.90 7.90"); - assertEquals(testHMM.getViterbi(), "-9.7737 0.71847"); - assertEquals(testHMM.getMSV(), "-9.4043 0.71847"); - assertEquals(testHMM.getForward(), "-3.8341 0.71847"); - - - FileReader fr3 = new FileReader( + assertEquals(hmm.getProperty(HMMFile.LENGTH), "86"); + assertNull(hmm.getProperty(HMMFile.MAX_LENGTH)); + assertEquals(hmm.getAlphabetType(), "amino"); + assertFalse(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); + assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); + + assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); + assertEquals(hmm.getProperty(HMMFile.DATE), "Fri Jun 
20 08:22:31 2014"); + assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); + assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "106"); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), + "11.415833"); + assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3564431818"); + assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), "8.00 7.20"); + assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "8.00 7.20"); + assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "7.90 7.90"); + assertEquals(hmm.getViterbi(), "-9.7737 0.71847"); + assertEquals(hmm.getMSV(), "-9.4043 0.71847"); + assertEquals(hmm.getForward(), "-3.8341 0.71847"); + } + + @Test(groups = "Functional") + public void testParseHeaderLines_dna() throws IOException + { + FileReader fr = new FileReader( new File("test/jalview/io/test_MADE1_hmm.txt")); - BufferedReader br3 = new BufferedReader(fr3); - made1.setHMM(new HiddenMarkovModel()); - made1.parseFileProperties(br3); - testHMM = made1.getHMM(); - br3.close(); - fr3.close(); + BufferedReader br = new BufferedReader(fr); + HiddenMarkovModel hmm = new HiddenMarkovModel(); + HMMFile testee = new HMMFile(); + PA.setValue(testee, "hmm", hmm); + testee.parseHeaderLines(br); + br.close(); + fr.close(); - assertEquals(testHMM.getName(), "MADE1"); - assertEquals(testHMM.getAccessionNumber(), "DF0000629.2"); - assertEquals(testHMM.getDescription(), + assertEquals(hmm.getName(), "MADE1"); + assertEquals(hmm.getProperty(HMMFile.ACCESSION_NUMBER), + "DF0000629.2"); + assertEquals(hmm.getProperty(HMMFile.DESCRIPTION), "MADE1 (MAriner Derived Element 1), a TcMar-Mariner DNA transposon"); - assertEquals(testHMM.getLength().intValue(), 80); - assertEquals(testHMM.getMaxInstanceLength().intValue(), 426); - assertEquals(testHMM.getAlphabetType(), "DNA"); - assertEquals(testHMM.referenceAnnotationIsActive(), true); - assertEquals(testHMM.maskValueIsActive(), false); - assertEquals(testHMM.consensusResidueIsActive(), true); - 
assertEquals(testHMM.consensusStructureIsActive(), false); - assertEquals(testHMM.mapIsActive(), true); - assertEquals(testHMM.getDate(), "Tue Feb 19 20:33:41 2013"); - assertNull(testHMM.getCommandLineLog()); - assertEquals(testHMM.getNumberOfSequences().intValue(), 1997); - assertEquals(testHMM.getEffectiveNumberOfSequences(), 3.911818, 4d); - assertEquals(testHMM.getCheckSum().longValue(), 3015610723l); - assertEquals(testHMM.getGatheringThreshold(), "2.324 4.234"); - assertEquals(testHMM.getTrustedCutoff(), "2.343 1.212"); - assertEquals(testHMM.getNoiseCutoff(), "2.354 5.456"); - assertEquals(testHMM.getViterbi(), "-9.3632 0.71858"); - assertEquals(testHMM.getMSV(), "-8.5786 0.71858"); - assertEquals(testHMM.getForward(), "-3.4823 0.71858"); - - + assertEquals(hmm.getProperty(HMMFile.LENGTH), "80"); + assertEquals(hmm.getProperty(HMMFile.MAX_LENGTH), "426"); + assertEquals(hmm.getAlphabetType(), "DNA"); + assertTrue(hmm.getBooleanProperty(HMMFile.REFERENCE_ANNOTATION)); + assertFalse(hmm.getBooleanProperty(HMMFile.MASKED_VALUE)); + assertTrue(hmm.getBooleanProperty(HMMFile.CONSENSUS_RESIDUE)); + assertFalse(hmm.getBooleanProperty(HMMFile.CONSENSUS_STRUCTURE)); + assertTrue(hmm.getBooleanProperty(HMMFile.MAP)); + assertEquals(hmm.getProperty(HMMFile.DATE), "Tue Feb 19 20:33:41 2013"); + assertNull(hmm.getProperty(HMMFile.COMMAND_LOG)); + assertEquals(hmm.getProperty(HMMFile.NUMBER_OF_SEQUENCES), "1997"); + assertEquals(hmm.getProperty(HMMFile.EFF_NUMBER_OF_SEQUENCES), "3.911818"); + assertEquals(hmm.getProperty(HMMFile.CHECK_SUM), "3015610723"); + assertEquals(hmm.getProperty(HMMFile.GATHERING_THRESHOLD), + "2.324 4.234"); + assertEquals(hmm.getProperty(HMMFile.TRUSTED_CUTOFF), "2.343 1.212"); + assertEquals(hmm.getProperty(HMMFile.NOISE_CUTOFF), "2.354 5.456"); + assertEquals(hmm.getViterbi(), "-9.3632 0.71858"); + assertEquals(hmm.getMSV(), "-8.5786 0.71858"); + assertEquals(hmm.getForward(), "-3.4823 0.71858"); } - @Test + @Test(groups = "Functional") public 
void testFillList() throws IOException { Scanner scanner1 = new Scanner("1.3 2.4 5.3 3.9 9.8 4.7 4.3 2.3 6.9"); @@ -207,12 +211,11 @@ public class HMMFileTest { filledArray.add(0.10026); filledArray.add(0.001); - List testList = HMMFile.fillList(scanner1, 9); + double[] testList = HMMFile.parseDoubles(scanner1, 9); for (int i = 0; i < 9; i++) { - assertEquals(testList.get(i), filledArray.get(i), 0.001d); - + assertEquals(testList[i], filledArray.get(i), 0.001d); } filledArray.clear(); @@ -226,16 +229,15 @@ public class HMMFileTest { filledArray.add(0.00355); filledArray.add(0.2466); - testList = HMMFile.fillList(scanner2, 5); + testList = HMMFile.parseDoubles(scanner2, 5); for (int i = 0; i < 5; i++) { - assertEquals(testList.get(i), filledArray.get(i), 0.001d); + assertEquals(testList[i], filledArray.get(i), 0.001d); } - } - @Test + @Test(groups = "Functional") public void testParseModel() throws IOException { FileReader fr = new FileReader( @@ -283,35 +285,46 @@ public class HMMFileTest { 0.001d); assertEquals(testHMM.getStateTransitionProbability(1111, 6), Double.NEGATIVE_INFINITY); - } - @Test + /** + * Test that if no mapping of nodes to aligned columns is provided by the HMM + * file, we construct one + * + * @throws IOException + */ + @Test(groups = "Functional") + public void testParseModel_noMap() throws IOException + { + fail("test to be written"); + } + + @Test(groups = "Functional") public void testParseAnnotations() { HMMFile testFile = new HMMFile(); HiddenMarkovModel hmm = new HiddenMarkovModel(); - testFile.setHMM(hmm); - hmm.getNodes().add(new HMMNode()); - - hmm.setConsensusResidueStatus(true); - hmm.setMAPStatus(true); - hmm.setReferenceAnnotationStatus(true); - hmm.setConsensusStructureStatus(true); - hmm.setMaskedValueStatus(true); + PA.setValue(testFile, "hmm", hmm); + hmm.addNode(new HMMNode()); + + hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "yes"); + hmm.setProperty(HMMFile.MAP, "yes"); + hmm.setProperty(HMMFile.REFERENCE_ANNOTATION, "yes"); 
+ hmm.setProperty(HMMFile.CONSENSUS_STRUCTURE, "yes"); + hmm.setProperty(HMMFile.MASKED_VALUE, "yes"); Scanner scanner = new Scanner("1345 t t t t"); HMMNode node = new HMMNode(); - hmm.getNodes().add(node); + hmm.addNode(node); testFile.parseAnnotations(scanner, node); - hmm.setConsensusResidueStatus(true); - hmm.setMAPStatus(false); - hmm.setReferenceAnnotationStatus(true); - hmm.setConsensusStructureStatus(false); - hmm.setMaskedValueStatus(false); + hmm.setProperty(HMMFile.CONSENSUS_RESIDUE, "yes"); + hmm.setProperty(HMMFile.MAP, "no"); + hmm.setProperty(HMMFile.REFERENCE_ANNOTATION, "yes"); + hmm.setProperty(HMMFile.CONSENSUS_STRUCTURE, "no"); + hmm.setProperty(HMMFile.MASKED_VALUE, "no"); Scanner scanner2 = new Scanner("- y x - -"); node = new HMMNode(); - hmm.getNodes().add(node); + hmm.addNode(node); testFile.parseAnnotations(scanner2, node); assertEquals(hmm.getNodeAlignmentColumn(1).intValue(), 1344); @@ -328,9 +341,7 @@ public class HMMFileTest { * * @throws IOException */ - - - @Test(priority = 3) + @Test(groups = "Functional") public void testPrint() throws IOException { PrintWriter writer = new PrintWriter( @@ -348,27 +359,23 @@ public class HMMFileTest { for (int i = 0; i < pKinaseHMM.getLength(); i++) { - List list1; - List list2; - boolean result; + double[] list1; + double[] list2; list1 = pKinaseHMM.getNode(i).getMatchEmissions(); list2 = pKinaseCloneHMM.getNode(i).getMatchEmissions(); - result = checkIfListsAreIdentical(list1, list2); - assertEquals(result, true); + assertEquals(list1, list2); list1 = pKinaseHMM.getNode(i).getInsertEmissions(); list2 = pKinaseCloneHMM.getNode(i).getInsertEmissions(); - result = checkIfListsAreIdentical(list1, list2); - assertEquals(result, true); + assertEquals(list1, list2); list1 = pKinaseHMM.getNode(i).getStateTransitions(); list2 = pKinaseCloneHMM.getNode(i).getStateTransitions(); - result = checkIfListsAreIdentical(list1, list2); - assertEquals(result, true); + assertEquals(list1, list2); if (i > 0) { @@ 
-393,17 +400,16 @@ public class HMMFileTest { assertEquals(annotation1, annotation2); } - } - } - @Test(priority = 1) - public void testGetFilePropertiesAsString() throws FileNotFoundException + @Test(groups = "Functional") + public void testAppendProperties() throws FileNotFoundException { - String string = fn3.getFilePropertiesAsString(); + StringBuilder sb = new StringBuilder(); + fn3.appendProperties(sb); - Scanner testScanner = new Scanner(string); + Scanner testScanner = new Scanner(sb.toString()); String[] expected = new String[] { "HMMER3/f [3.1b1 | May 2013]", "NAME fn3", "ACC PF00041.13", @@ -423,10 +429,12 @@ public class HMMFileTest { testScanner.close(); } - @Test(priority = 2) - public void testGetModelAsString() throws FileNotFoundException + @Test(groups = "Functional") + public void testAppendModelAsString() throws FileNotFoundException { - String string = fn3.getModelAsString(); + StringBuilder sb = new StringBuilder(); + fn3.appendModelAsString(sb); + String string = sb.toString(); assertEquals(findValue(2, 2, 2, string), "4.42225"); assertEquals(findValue(12, 14, 1, string), "2.79307"); @@ -438,10 +446,10 @@ public class HMMFileTest { assertEquals(findValue(16, 65, 1, string), "2.81003"); assertEquals(findValue(14, 3, 1, string), "2.69012"); assertEquals(findValue(11, 32, 1, string), "4.34805"); - } /** + * A helper method to find a token in the model string * * @param symbolIndex * index of symbol being searched. First symbol has index 1. 
@@ -454,26 +462,23 @@ public class HMMFileTest { * string model being searched * @return value at specified position */ - - public String findValue(int symbolIndex, int nodeIndex, int line, + private String findValue(int symbolIndex, int nodeIndex, int line, String model) { - String value = ""; - String current; Scanner scanner = new Scanner(model); - current = scanner.nextLine(); - current = scanner.nextLine(); + scanner.nextLine(); + scanner.nextLine(); for (int lineIndex = 0; lineIndex < line - 1; lineIndex++) { - current = scanner.nextLine(); + scanner.nextLine(); } for (int node = 0; node < nodeIndex; node++) { - current = scanner.nextLine(); - current = scanner.nextLine(); - current = scanner.nextLine(); + scanner.nextLine(); + scanner.nextLine(); + scanner.nextLine(); } for (int symbol = 0; symbol < symbolIndex; symbol++) @@ -481,36 +486,15 @@ public class HMMFileTest { value = scanner.next(); if ("COMPO".equals(value)) { - current = scanner.next(); + scanner.next(); } else if (value.length() < 7) { - current = scanner.next(); + scanner.next(); } - } scanner.close(); return value; - } - - public boolean checkIfListsAreIdentical(List list1, - List list2) - { - boolean isDifferent = false; - for (int i = 0; i < list1.size(); i++) - { - Double entry1; - Double entry2; - entry1 = list1.get(i); - entry2 = list2.get(i); - if (!(entry1 == entry2)) - { - isDifferent = true; - } - } - return isDifferent; - } - } diff --git a/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java b/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java index 9489efb..a6b93f5 100644 --- a/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java +++ b/test/jalview/util/HMMProbabilityDistributionAnalyserTest.java @@ -44,7 +44,7 @@ public class HMMProbabilityDistributionAnalyserTest { { analyser.sequences = new Vector<>(); analyser.hmm = new HiddenMarkovModel(); - analyser.hmm.addFileProperty("LENG", "8"); + analyser.hmm.setProperty("LENG", "8"); List nodes = new 
ArrayList<>(); nodes.add(new HMMNode()); -- 1.7.10.2