From f9fb3eb5e863ed038d8d1c23d2ba2250bc8f9340 Mon Sep 17 00:00:00 2001 From: TZVanaalten Date: Thu, 15 Jun 2017 12:38:48 +0100 Subject: [PATCH] add HMMFile class to read and write HMM files --- src/jalview/datamodel/HiddenMarkovModel.java | 492 ++++++++++++++++++++++++-- src/jalview/io/HMMFile.java | 428 ++++++++++++++++++++++ 2 files changed, 887 insertions(+), 33 deletions(-) create mode 100644 src/jalview/io/HMMFile.java diff --git a/src/jalview/datamodel/HiddenMarkovModel.java b/src/jalview/datamodel/HiddenMarkovModel.java index b422ef1..b863264 100644 --- a/src/jalview/datamodel/HiddenMarkovModel.java +++ b/src/jalview/datamodel/HiddenMarkovModel.java @@ -1,143 +1,569 @@ package jalview.datamodel; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import java.util.Scanner; /** - * Data structure to hold a HMM file - */ -/** + * Data structure which stores a hidden Markov model. Currently contains file properties as well, not sure whether these should be transferred to the HMMFile class + * * @author TZVanaalten + * */ public class HiddenMarkovModel { + // Stores file properties. Do not directly access this field as it contains + // only string values - use the getter methods. For example, to find the length + // of the HMM, use getLength() to return an Integer value + Map fileProperties = new HashMap<>(); + + // contains the average emission probabilities for each symbol + List averageMatchStateEmissionProbabilities = new ArrayList<>(); + + // contains the probabilities of insert 0 emissions for each symbol + List insertZeroEmissions = new ArrayList<>(); + + // contains the probabilities of transitions from the begin state and insert + // state 0. 
These are bm, bi, bd, im, ii, dm and dd in order (0th position in + // the array indicates the probability of a bm transition) + + List beginStateTransitions = new ArrayList<>(); + + // contains the alignment column index for each node + List alignmentColumnIndexes = new ArrayList<>(); + + // contains all other annotations for each node. These can be the + // consensus(CONS), reference annotation(RF), mask value(MM) or consensus + // structure(CS) + List> annotations = new ArrayList<>(); + + // contains the match emission for each symbol at each node + List> matchEmissions = new ArrayList<>(); + + // contains the insert emission for each symbol at each node + List> insertEmissions = new ArrayList<>(); + + // contains the state transition for each state transition. See + // beginStateTransitions field for transition possibilities. + List> stateTransitions = new ArrayList<>(); + + // contains cutoffs and thresholds from PFAM + Map pfamData = new HashMap<>(); + + // contains e-value statistic objects which contain the alignment mode + // configuration, and the slope and location of each distribution + Map eValueStatistics = new HashMap<>(); + + final String yes = "yes"; + + final String no = "no"; + + List symbols = new ArrayList<>(); + + public List getBeginStateTransitions() + { + return beginStateTransitions; + } + + public void setBeginStateTransitions(List beginStateTransitionsL) + { + this.beginStateTransitions = beginStateTransitionsL; + } + + public List> getStateTransitions() + { + return stateTransitions; + } + + public void setStateTransitions(List> stateTransitionsL) + { + this.stateTransitions = stateTransitionsL; + } + + public List getSymbols() + { + return symbols; + } + + public void setSymbols(List symbolsL) + { + this.symbols = symbolsL; + } + + public List getAverageMatchStateEmissionProbabilities() + { + return averageMatchStateEmissionProbabilities; + } + + public void setAverageMatchStateEmissionProbabilities( + List 
averageMatchStateEmissionProbabilitiesL) + { + this.averageMatchStateEmissionProbabilities = averageMatchStateEmissionProbabilitiesL; + } + + + public List getInsertZeroEmissions() + { + return insertZeroEmissions; + } + + public void setInsertZeroEmissions(List insertZeroEmissionsL) + { + this.insertZeroEmissions = insertZeroEmissionsL; + } + + public List> getMatchEmissions() + { + return matchEmissions; + } + + public void setMatchEmissions(List> matchEmissionsL) + { + this.matchEmissions = matchEmissionsL; + } + + public List> getInsertEmissions() + { + return insertEmissions; + } - // Stores file properties - private Map fileProperties = new HashMap<>(); + public void setInsertEmissions(List> insertEmissionsL) + { + this.insertEmissions = insertEmissionsL; + } + public void fillSymbols(String line) + { + Scanner scanner = new Scanner(line); + scanner.next(); + while (scanner.hasNext()) + { + symbols.add(scanner.next().charAt(0)); + } + scanner.close(); + } + public String getName() + { + return fileProperties.get("NAME"); + } public String getAccessionNumber() { return fileProperties.get("ACC"); } + public void setAccessionNumber(String value) + { + fileProperties.put("ACC", value); + } + public String getDescription() { return fileProperties.get("DESC"); } - public int getModelLength() + public void setDescription(String value) { + fileProperties.put("DESC", value); + } + + public Integer getLength() + { + if (fileProperties.get("LENG") == null) + { + return null; + } return Integer.parseInt(fileProperties.get("LENG")); } - public int getMaxInstanceLength() + public void setLength(int value) + { + fileProperties.put("LENG", String.valueOf(value)); + } + + public Integer getMaxInstanceLength() { + if (fileProperties.get("MAXL") == null) + { + return null; + } return Integer.parseInt(fileProperties.get("MAXL")); } + public void setMaxInstanceLength(int value) + { + fileProperties.put("MAXL", String.valueOf(value)); + } + // gets type of symbol alphabet - 
"amino", "DNA", "RNA" public String getAlphabetType() { return fileProperties.get("ALPH"); } + public void setAlphabetType(String value) + { + fileProperties.put("ALPH", value); + } + // returns boolean indicating whether the reference annotation character field // for each match state is valid or ignored public boolean getReferenceAnnotationFlag() { - if (fileProperties.get("RF") == "yes") + if (fileProperties.get("RF") != null) { - return true; + if (fileProperties.get("RF").equals(yes)) + { + return true; + } } return false; } + public void setReferenceAnnotationFlag(boolean value) + { + if (value) + { + fileProperties.put("RF", yes); + } + else + { + fileProperties.put("RF", no); + } + + } + // returns boolean indicating whether the model mask annotation character // field // for each match state is valid or ignored public boolean getModelMaskedFlag() { - if (fileProperties.get("MM") == "yes") + if (fileProperties.get("MM") != null) { - return true; + if (fileProperties.get("MM").equals(yes)) + { + return true; + } } return false; } + public void setModelMaskedFlag(boolean value) + { + if (value) + { + fileProperties.put("MM", yes); + } + else + { + fileProperties.put("MM", no); + } + } + // returns boolean indicating whether the consensus residue field // for each match state is valid or ignored public boolean getConsensusResidueAnnotationFlag() { - if (fileProperties.get("CONS") == "yes") + if (fileProperties.get("CONS") != null) { - return true; + if (fileProperties.get("CONS").equals(yes)) + { + return true; + } } return false; } + public void setConsensusResidueeAnnotationFlag(boolean value) + { + if (value) + { + fileProperties.put("CONS", yes); + } + else + { + fileProperties.put("CONS", no); + } + } + // returns boolean indicating whether the consensus structure character field // for each match state is valid or ignored public boolean getConsensusStructureAnnotationFlag() { - if (fileProperties.get("CS") == "yes") + if (fileProperties.get("CS") != 
null) { - return true; + if (fileProperties.get("CS").equals(yes)) + { + return true; + } } return false; } + public void setConsensusStructureAnnotationFlag(boolean value) + { + if (value) + { + fileProperties.put("CS", yes); + } + else + { + fileProperties.put("CS", no); + } + } + // returns boolean indicating whether the model mask annotation character // field // for each match state is valid or ignored public boolean getMapAnnotationFlag() { - if (fileProperties.get("MAP") == "yes") + if (fileProperties.get("MAP") != null) { - return true; + if (fileProperties.get("MAP").equals(yes)) + { + return true; + } } return false; } - // not sure whether to implement this - // public Date getDate() - // { + public void setMapAnnotationFlag(boolean value) + { + if (value) + { + fileProperties.put("MAP", yes); + } + else + { + fileProperties.put("MAP", no); + } + } - // } + // not sure whether to implement this with Date object + public String getDate() + { + return fileProperties.get("DATE"); + } + + public void setDate(String value) + { + fileProperties.put("DATE", value); + } // not sure whether to implement this - // public String getCommandLineLog() - // { + public String getCommandLineLog() + { + return fileProperties.get("COM"); + } - // } + public void setCommandLineLog(String value) + { + fileProperties.put("COM", value); + } // gets the number of sequences that the HMM was trained on - public int getSequenceNumber() + public Integer getSequenceNumber() { + if (fileProperties.get("NSEQ") == null) + { + return null; + } return Integer.parseInt(fileProperties.get("NSEQ")); } + public void setSequenceNumber(int value) + { + fileProperties.put("NSEQ", String.valueOf(value)); + } + // gets the effective number determined during sequence weighting - public int getEffectiveSequenceNumber() + public Double getEffectiveSequenceNumber() + { + if (fileProperties.get("LENG") == null) + { + return null; + } + return Double.parseDouble(fileProperties.get("EFFN")); + } + + 
public void setEffectiveSequenceNumber(double value) + { + fileProperties.put("EFFN", String.valueOf(value)); + } + + public Long getCheckSum() + { + if (fileProperties.get("LENG") == null) + { + return null; + } + return Long.parseLong(fileProperties.get("CKSUM")); + } + + public void setCheckSum(long value) + { + fileProperties.put("CKSUM", String.valueOf(value)); + } + + public Double getGatheringThreshold1() + { + try + { + return pfamData.get("GA")[0]; + } catch (NullPointerException e) + { + return null; + } + } + + public void setPFAMData(String key, Double[] data) + { + pfamData.put(key, data); + } + + public Double getGatheringThreshold2() + { + try + { + return pfamData.get("GA")[1]; + } catch (NullPointerException e) + { + return null; + } + + } + + public Double getTrustedCutoff1() + { + try + { + return pfamData.get("TC")[0]; + } catch (NullPointerException e) + { + return null; + } + + } + + public Double getTrustedCutoff2() { - return Integer.parseInt(fileProperties.get("EFFN")); + try + { + return pfamData.get("TC")[1]; + } catch (NullPointerException e) + { + return null; + } + } - public int getCheckSum() + public Double getNoiseCutoff1() { - return Integer.parseInt(fileProperties.get("CKSUM")); + try + { + return pfamData.get("NC")[0]; + } catch (NullPointerException e) + { + return null; + } + } - // need to ask if BigDecimal is best decimal type for this purpose - // and how to limit number of decimals - public double getGatheringThresholdGA1() + public Double getNoiseCutoff2() { - return Double.parseDouble((fileProperties.get("GA1"))); + try + { + return pfamData.get("NC")[1]; + } catch (NullPointerException e) + { + return null; + } + + } + + public String getAlignmentModeConfiguration(String key) + { + return eValueStatistics.get(key).alignmentModeConfiguration; + } + + public Double getSlopeOfDistribution(String scoreDistribution) + { + try + { + return eValueStatistics.get(scoreDistribution).slopeOfDistribution; + } catch 
(NullPointerException e) + { + return null; + } + } + + public Double getLocationOfDistribution(String scoreDistribution) + { + try + { + return eValueStatistics.get(scoreDistribution).locationOfDistribution; + } catch (NullPointerException e) + { + return null; + } } + public void addStatistic(String name, EValueStatistic stats) + { + eValueStatistics.put(name, stats); + } + + /** + * public double getBeginStateTransitions(Character symbol) { return + * beginStateTransitions.get(symbol); } + **/ + public void put(String key, String value) { fileProperties.put(key, value); } + public Map getEValueStatistics() + { + return eValueStatistics; + } + + public void setEValueStatistics( + Map eValueStatisticsM) + { + this.eValueStatistics = eValueStatisticsM; + } + + public List getAlignmentColumnIndexes() + { + return alignmentColumnIndexes; + } + + public void setAlignmentColumnIndexes( + List alignmentColumnIndexesL) + { + this.alignmentColumnIndexes = alignmentColumnIndexesL; + } + + public List> getAnnotations() + { + return annotations; + } + + public void setAnnotations(List> annotationsL) + { + this.annotations = annotationsL; + } + + public Map getFileProperties() + { + return fileProperties; + } + + public void setFileProperties(Map fileProperties) + { + this.fileProperties = fileProperties; + } } + diff --git a/src/jalview/io/HMMFile.java b/src/jalview/io/HMMFile.java new file mode 100644 index 0000000..764db7f --- /dev/null +++ b/src/jalview/io/HMMFile.java @@ -0,0 +1,428 @@ +package jalview.io; + +import jalview.datamodel.EValueStatistic; +import jalview.datamodel.HiddenMarkovModel; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Scanner; + +/** + * reads in and 
writes out a HMMER standard file + * + * + * @author TZVanaalten + * + */ +public class HMMFile extends FileParse +{ + // HMM to store file data + HiddenMarkovModel hmm = new HiddenMarkovModel(); + + // Source of file + String dataObject; + + // number of symbols + int numberOfSymbols; + + // number of possible transitions + final int NUMBER_OF_TRANSITIONS = 7; + + // file header + String fileHeader; + + /** + * Constructor which contains model to be filled or exported + * + * @param dataSource + * Filename, URL or Pasted String to read from + */ + public HMMFile(String dataSource) + { + dataObject = dataSource; + } + + /** + * reads data from HMM file + * + * @throws IOException + */ + public void parse() throws IOException + { + File file = new File(dataObject); + FileReader fr = new FileReader(file); + BufferedReader br = new BufferedReader(fr); + parseFileProperties(br); + parseModel(br); + + } + + /** + * imports file properties from hmm file + * + * @param input + * buffered reader used to read in file + * @throws IOException + */ + public void parseFileProperties(BufferedReader input) throws IOException + { + boolean readingFile = true; + fileHeader = input.readLine(); + String line = input.readLine(); + while (readingFile) + { + if (line != null) + { + Scanner parser = new Scanner(line); + String next = parser.next(); + if ("HMM".equals(next)) // indicates start of HMM data (end of file + // properties) + { + readingFile = false; + hmm.fillSymbols(line); + numberOfSymbols = hmm.getSymbols().size(); + } + else if ("STATS".equals(next)) // reads e-value stats into separate + // field + // on HMM object + { + readStats(parser); + } + else if ("GA".equals(next) || "TC".equals(next) + || "NC".equals(next)) // reads + // pfam + // data + // into + // separate + // field + // on + // HMM + // object + { + Double[] data = new Double[2]; + data[0] = parser.nextDouble(); + data[1] = parser.nextDouble(); + hmm.setPFAMData(next, data); + } + else + { + String key = 
next; + String value = parser.next(); + while (parser.hasNext()) + { + value = value + " " + parser.next(); + } + hmm.put(key, value); + } + parser.close(); + } + line = input.readLine(); + if (line == null) + { + readingFile = false; + } + } + + } + + /** + * creates a new EValueStatistic object to store stats + * + * @param parser + * Scanner which contains data for STATS line + * + */ + public void readStats(Scanner parser) + { + if (parser.hasNext()) + { + String name; + double slope; + double location; + String configuration; + + configuration = parser.next(); + name = parser.next(); + slope = parser.nextDouble(); + location = parser.nextDouble(); + hmm.addStatistic(name, + new EValueStatistic(configuration, slope, location)); + } + } + + /** + * parses the model data from the hmm file + * + * @param input + * buffered reader used to read file + * @throws IOException + */ + public void parseModel(BufferedReader input) throws IOException + { + + String line = input.readLine(); + Scanner scanner = new Scanner(line); + String next = scanner.next(); + if ("COMPO".equals(next)) // checks to and stores COMPO data if present + { + for (int i = 0; i < numberOfSymbols; i++) + + { + hmm.getAverageMatchStateEmissionProbabilities() + .add(scanner.nextDouble()); + } + } + scanner.close(); + parseBeginNodeData(input); + for (int i = 0; i < hmm.getLength(); i++) + { + Scanner matchReader = new Scanner(input.readLine()); + matchReader.nextInt(); // skips number indicating position in HMM + hmm.getMatchEmissions() + .add(fillList(matchReader, numberOfSymbols)); + parseAnnotations(matchReader, i); + matchReader.close(); + Scanner insertReader = new Scanner(input.readLine()); + hmm.getInsertEmissions().add(fillList(insertReader, numberOfSymbols)); + insertReader.close(); + Scanner transitionReader = new Scanner(input.readLine()); + hmm.getStateTransitions() + .add(fillList(transitionReader, NUMBER_OF_TRANSITIONS)); + transitionReader.close(); + } + + } + + /** + * parses the 
begin state transitions and insert 0 emissions + * + * @param input + * buffered reader used to read model + * @param currentline + * string containing all data on current line of buffered reader + * @throws IOException + */ + + public void parseBeginNodeData(BufferedReader input) + throws IOException + { + Scanner scanner = new Scanner(input.readLine()); + hmm.setInsertZeroEmissions(fillList(scanner, hmm.getSymbols().size())); + scanner.close(); + Scanner scannerTransitions = new Scanner(input.readLine()); + hmm.setBeginStateTransitions( + fillList(scannerTransitions, NUMBER_OF_TRANSITIONS)); + scannerTransitions.close(); + } + + /** + * parses annotations on match emission line + * + * @param scanner + * scanner which is processing match emission line + * @param index + * index of node which is being scanned + */ + public void parseAnnotations(Scanner scanner, int index) + { + if (hmm.getMapAnnotationFlag()) + { + hmm.getAlignmentColumnIndexes().add(scanner.nextInt()); + } + else + { + scanner.next(); + } + hmm.getAnnotations().add(new HashMap()); + hmm.getAnnotations().get(index).put("CONS", scanner.next().charAt(0)); + hmm.getAnnotations().get(index).put("RF", scanner.next().charAt(0)); + hmm.getAnnotations().get(index).put("MM", scanner.next().charAt(0)); + hmm.getAnnotations().get(index).put("CS", scanner.next().charAt(0)); + } + /** + * + * @param transition + * type of transition occurring + * @return index value representing position along stateTransition array. 
+ */ + public Integer getTransitionType(String transition) + { + Integer index; + switch (transition) + { + case "mm": + index = 0; + break; + case "mi": + index = 1; + break; + case "md": + index = 2; + break; + case "im": + index = 3; + break; + case "ii": + index = 4; + break; + case "dm": + index = 5; + break; + case "dd": + index = 6; + break; + default: + index = null; + } + return index; + } + + /** + * + * @param input + * scanner for line containing data to be transferred to list + * @param numberOfElements + * number of elements in the list to be filled + * @return filled list + */ + public static List fillList(Scanner input, + int numberOfElements) + { + List list = new ArrayList<>(); + String next; + for (int i = 0; i < numberOfElements; i++) + { + next = input.next(); + if (next.contains("*")) // state transitions to or from delete states + // occasionally have values of -infinity. These + // values are represented by an * in the .hmm + // file, and by a null value in the + // HiddenMarkovModel class + { + list.add(null); + } + else + { + list.add(Double.valueOf(next)); + } + } + return list; + } + + /** + * writes a HiddenMarkovModel to a file. 
Needs more work to make file more + * readable for humans (align columns) + * + * @param exportLocation + * Filename, URL or Pasted String to write to + * @throws FileNotFoundException + * @throws UnsupportedEncodingException + */ + public void exportFile(String exportLocation) + throws FileNotFoundException, UnsupportedEncodingException + { + PrintWriter writer = new PrintWriter(exportLocation, "UTF-8"); + writer.println(fileHeader); + for (Map.Entry entry : hmm.getFileProperties() + .entrySet()) + { + writer.println(entry.getKey() + " " + entry.getValue()); + } + writer.println( + "HMM" + " " + convertCharListToString(hmm.getSymbols())); + writer.println("m->m m->i m->d i->m i->i d->m d->d"); + if (false == hmm.getAverageMatchStateEmissionProbabilities().isEmpty()) + { + writer.println("COMPO" + " " + convertDoubleListToString( + hmm.getAverageMatchStateEmissionProbabilities())); + } + writer.println(convertDoubleListToString(hmm.getInsertZeroEmissions())); + writer.println( + convertDoubleListToString(hmm.getBeginStateTransitions())); + + for (Integer i = 0; i < hmm.getLength(); i++) + { + String matchEmissionLine = i.toString() + " "; // adds node index + matchEmissionLine += convertDoubleListToString( + hmm.getMatchEmissions().get(i)); // adds match emissions + matchEmissionLine += " " + + hmm.getAlignmentColumnIndexes().get(i).toString(); // adds MAP + // annotation + matchEmissionLine += " " + + hmm.getAnnotations().get(i).get("CONS").toString(); // adds CONS + // annotation + matchEmissionLine += " " + + hmm.getAnnotations().get(i).get("RF").toString(); // adds RF + // annotation + matchEmissionLine += " " + + hmm.getAnnotations().get(i).get("MM").toString(); // adds MM + // annotation + matchEmissionLine += " " + + hmm.getAnnotations().get(i).get("CS").toString(); // adds CS + // annotation + writer.println(matchEmissionLine); + + writer.println( + convertDoubleListToString(hmm.getInsertEmissions().get(i))); + writer.println( + 
convertDoubleListToString(hmm.getStateTransitions().get(i))); + } + writer.println("//"); + + writer.close(); + } + + /** + * converts a list of characters to a string with items separated by spaces + * + * @param list + * character list to be converted + * @return string value of char list + */ + public String convertCharListToString(List list) + { + String string = ""; + for (Character item : list) + { + string = string + item.toString() + " "; + } + + return string; + } + + /** + * converts a list of doubles to a string with items separated by spaces + * + * @param list + * double list to be converted + * @return string value of double list + */ + public String convertDoubleListToString(List list) + { + String string = ""; + for (Double item : list) + { + if (item != null) + { + string = string + item.toString() + " "; + } + else + { + string = string + "*" + " "; + } + + } + + return string; + } +} + -- 1.7.10.2