3 import jalview.api.AlignExportSettingI;
4 import jalview.api.AlignmentViewPanel;
5 import jalview.datamodel.HMMNode;
6 import jalview.datamodel.HiddenMarkovModel;
7 import jalview.datamodel.SequenceI;
9 import java.io.BufferedReader;
10 import java.io.IOException;
11 import java.util.ArrayList;
12 import java.util.List;
13 import java.util.Scanner;
17 * Adds capability to read in and write out HMMER3 files.
23 public class HMMFile extends AlignFile
24 implements AlignmentFileReaderI, AlignmentFileWriterI
27 * keys to data in an HMM file, also used as property names when storing values in the HiddenMarkovModel
29 private static final String HMM = "HMM";
31 public static final String NAME = "NAME";
33 public static final String ACCESSION_NUMBER = "ACC";
35 public static final String DESCRIPTION = "DESC";
37 public static final String LENGTH = "LENG";
39 public static final String MAX_LENGTH = "MAXL";
41 public static final String ALPHABET = "ALPH";
43 private static final String ALPH_AMINO = "amino";
45 private static final String ALPH_DNA = "DNA";
47 private static final String ALPH_RNA = "RNA";
49 private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY";
51 private static final String ALPHABET_DNA = "ACGT";
53 private static final String ALPHABET_RNA = "ACGU";
55 public static final String DATE = "DATE";
57 public static final String COMMAND_LOG = "COM";
59 public static final String NUMBER_OF_SEQUENCES = "NSEQ";
61 public static final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
63 public static final String CHECK_SUM = "CKSUM";
65 public static final String STATISTICS = "STATS";
67 public static final String COMPO = "COMPO";
69 public static final String GATHERING_THRESHOLD = "GA";
71 public static final String TRUSTED_CUTOFF = "TC";
73 public static final String NOISE_CUTOFF = "NC";
75 public static final String VITERBI = "VITERBI";
77 public static final String MSV = "MSV";
79 public static final String FORWARD = "FORWARD";
81 public static final String MAP = "MAP";
83 public static final String REFERENCE_ANNOTATION = "RF";
85 public static final String CONSENSUS_RESIDUE = "CONS";
87 public static final String CONSENSUS_STRUCTURE = "CS";
89 public static final String MASKED_VALUE = "MM";
91 private static final int NUMBER_OF_TRANSITIONS = 7;
93 private static final String SPACE = " ";
96 * optional guide line added to an output HMMER file, purely for readability
98 private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d";
100 private static String NL = System.lineSeparator();
102 private HiddenMarkovModel hmm;
104 // number of symbols in the alphabet used in the hidden Markov model
105 private int numberOfSymbols;
108 * Constructor that parses immediately
112 * @throws IOException
114 public HMMFile(String inFile, DataSourceType type) throws IOException
120 * Constructor that parses immediately
123 * @throws IOException
125 public HMMFile(FileParse source) throws IOException
131 * Default constructor
138 * Constructor for HMMFile used for exporting
141 * @param markov the hidden Markov model to be exported
143 public HMMFile(HiddenMarkovModel markov)
149 * Returns the HMM produced by parsing a HMMER3 file
153 public HiddenMarkovModel getHMM()
159 * Gets the name of the hidden Markov model
163 public String getName()
165 return hmm.getName();
169 * Reads the data from HMM file into the HMM model
176 hmm = new HiddenMarkovModel();
177 parseHeaderLines(dataIn);
179 } catch (Exception e)
186 * Reads the header properties from a HMMER3 file and saves them in the
187 * HiddenMarkovModel. This method exits after reading the next line after the
191 * @throws IOException
193 void parseHeaderLines(BufferedReader input) throws IOException
195 boolean readingHeaders = true;
196 hmm.setFileHeader(input.readLine());
197 String line = input.readLine();
198 while (readingHeaders && line != null)
200 Scanner parser = new Scanner(line);
201 String next = parser.next();
202 if (ALPHABET.equals(next))
204 String alphabetType = parser.next();
205 hmm.setProperty(ALPHABET, alphabetType);
206 String alphabet = ALPH_DNA.equalsIgnoreCase(alphabetType)
208 : (ALPH_RNA.equalsIgnoreCase(alphabetType) ? ALPHABET_RNA
210 numberOfSymbols = hmm.setAlphabet(alphabet);
212 else if (HMM.equals(next))
214 readingHeaders = false;
215 String symbols = line.substring(line.indexOf(HMM) + HMM.length());
216 numberOfSymbols = hmm.setAlphabet(symbols);
218 else if (STATISTICS.equals(next))
224 value = parser.next() + SPACE + SPACE + parser.next();
225 hmm.setProperty(key, value);
230 String value = parser.next();
231 while (parser.hasNext())
233 value = value + SPACE + parser.next();
235 hmm.setProperty(key, value);
238 line = input.readLine();
243 * Parses the model data from the HMMER3 file
246 * @throws IOException
248 void parseModel(BufferedReader input) throws IOException
250 boolean first = true;
251 // specification says there must always be an HMM header
252 // and one more header which is skipped here
253 String line = input.readLine();
254 while (!"//".equals(line))
256 HMMNode node = new HMMNode();
258 Scanner matchReader = new Scanner(line);
259 String next = matchReader.next();
260 if (next.equals(COMPO) || !first)
262 // stores match emission line in list
263 double[] matches = parseDoubles(matchReader, numberOfSymbols);
264 node.setMatchEmissions(matches);
267 // TODO handle files with no column map (make our own)
268 int column = parseAnnotations(matchReader, node);
269 hmm.setAlignmentColumn(node, column - 1);
273 // stores insert emission line in list
274 line = input.readLine();
275 Scanner insertReader = new Scanner(line);
276 double[] inserts = parseDoubles(insertReader, numberOfSymbols);
277 node.setInsertEmissions(inserts);
278 insertReader.close();
280 // stores state transition line in list
281 line = input.readLine();
282 Scanner transitionReader = new Scanner(line);
283 double[] transitions = parseDoubles(transitionReader,
284 NUMBER_OF_TRANSITIONS);
285 node.setStateTransitions(transitions);
286 transitionReader.close();
287 line = input.readLine();
294 * Parses the annotations on the match emission line and add them to the node.
295 * (See p109 of the HMMER User Guide (V3.1b2) for the specification.) Returns
296 * the alignment column number (base 1) that the node maps to, if provided,
302 int parseAnnotations(Scanner scanner, HMMNode node)
305 * map from hmm node to alignment column index, if provided
306 * HMM counts columns from 1, convert to base 0 for Jalview
309 if (hmm.getBooleanProperty(MAP) && scanner.hasNext())
311 column = scanner.nextInt();
312 node.setAlignmentColumn(column - 1);
320 * hmm consensus residue if provided, else -
322 if (scanner.hasNext())
325 consensusR = charValue(scanner.next());
326 node.setConsensusResidue(consensusR);
330 * RF reference annotation, if provided, else -
332 if (scanner.hasNext())
335 reference = charValue(scanner.next());
336 node.setReferenceAnnotation(reference);
340 * 'm' for masked position, if provided, else -
342 if (scanner.hasNext())
345 value = charValue(scanner.next());
346 node.setMaskValue(value);
350 * structure consensus symbol, if provided, else -
352 if (scanner.hasNext())
355 consensusS = charValue(scanner.next());
356 node.setConsensusStructure(consensusS);
363 * Fills an array of doubles parsed from an input line
366 * @param numberOfElements
368 * @throws IOException
370 static double[] parseDoubles(Scanner input,
371 int numberOfElements) throws IOException
373 double[] values = new double[numberOfElements];
374 for (int i = 0; i < numberOfElements; i++)
376 if (!input.hasNext())
378 throw new IOException("Incomplete data");
380 String next = input.next();
381 if (next.contains("*"))
383 values[i] = Double.NEGATIVE_INFINITY;
387 double prob = Double.valueOf(next);
388 prob = Math.pow(Math.E, -prob);
396 * Returns a string to be added to the StringBuilder containing the entire
399 * @param initialColumnSeparation
400 * The initial whitespace separation between the left side of the
401 * file and first character.
402 * @param columnSeparation
403 * The separation between subsequent data entries.
405 * The list of data to be added to the String.
408 String addData(int initialColumnSeparation,
409 int columnSeparation, List<String> data)
412 boolean first = true;
413 for (String value : data)
415 int sep = first ? initialColumnSeparation : columnSeparation;
416 line += String.format("%" + sep + "s", value);
423 * Converts list of characters into a list of Strings.
426 * @return Returns the list of Strings.
428 List<String> charListToStringList(List<Character> list)
430 List<String> strList = new ArrayList<>();
431 for (char value : list)
433 String strValue = Character.toString(value);
434 strList.add(strValue);
440 * Converts an array of doubles into a list of Strings, rounded to the nearest
444 * @param noOfDecimals
447 List<String> doublesToStringList(double[] doubles)
449 List<String> strList = new ArrayList<>();
450 for (double value : doubles)
455 strValue = String.format("%.5f", value);
457 else if (value == -0.00000d)
459 strValue = "0.00000";
465 strList.add(strValue);
471 * Appends model data in string format to the string builder
475 void appendModelAsString(StringBuilder output)
477 output.append(HMM).append(" ");
478 String charSymbols = hmm.getSymbols();
479 for (char c : charSymbols.toCharArray())
481 output.append(String.format("%9s", c));
483 output.append(NL).append(TRANSITIONTYPELINE);
485 int length = hmm.getLength();
487 for (int nodeNo = 0; nodeNo <= length; nodeNo++)
489 String matchLine = String.format("%7s",
490 nodeNo == 0 ? "COMPO" : Integer.toString(nodeNo));
492 double[] doubleMatches = convertToLogSpace(
493 hmm.getNode(nodeNo).getMatchEmissions());
494 List<String> strMatches = doublesToStringList(doubleMatches);
495 matchLine += addData(10, 9, strMatches);
499 matchLine += SPACE + (hmm.getNodeAlignmentColumn(nodeNo) + 1);
500 matchLine += SPACE + hmm.getConsensusResidue(nodeNo);
501 matchLine += SPACE + hmm.getReferenceAnnotation(nodeNo);
502 if (hmm.getFileHeader().contains("HMMER3/f"))
504 matchLine += SPACE + hmm.getMaskedValue(nodeNo);
505 matchLine += SPACE + hmm.getConsensusStructure(nodeNo);
509 output.append(NL).append(matchLine);
511 String insertLine = "";
513 double[] doubleInserts = convertToLogSpace(
514 hmm.getNode(nodeNo).getInsertEmissions());
515 List<String> strInserts = doublesToStringList(doubleInserts);
516 insertLine += addData(17, 9, strInserts);
518 output.append(NL).append(insertLine);
520 String transitionLine = "";
521 double[] doubleTransitions = convertToLogSpace(
522 hmm.getNode(nodeNo).getStateTransitions());
523 List<String> strTransitions = doublesToStringList(
525 transitionLine += addData(17, 9, strTransitions);
527 output.append(NL).append(transitionLine);
532 * Appends formatted HMM file properties to the string builder
536 void appendProperties(StringBuilder output)
538 output.append(hmm.getFileHeader());
540 String format = "%n%-5s %1s";
541 appendProperty(output, format, NAME);
542 appendProperty(output, format, ACCESSION_NUMBER);
543 appendProperty(output, format, DESCRIPTION);
544 appendProperty(output, format, LENGTH);
545 appendProperty(output, format, MAX_LENGTH);
546 appendProperty(output, format, ALPHABET);
547 appendBooleanProperty(output, format, REFERENCE_ANNOTATION);
548 appendBooleanProperty(output, format, MASKED_VALUE);
549 appendBooleanProperty(output, format, CONSENSUS_RESIDUE);
550 appendBooleanProperty(output, format, CONSENSUS_STRUCTURE);
551 appendBooleanProperty(output, format, MAP);
552 appendProperty(output, format, DATE);
553 appendProperty(output, format, NUMBER_OF_SEQUENCES);
554 appendProperty(output, format, EFF_NUMBER_OF_SEQUENCES);
555 appendProperty(output, format, CHECK_SUM);
556 appendProperty(output, format, GATHERING_THRESHOLD);
557 appendProperty(output, format, TRUSTED_CUTOFF);
558 appendProperty(output, format, NOISE_CUTOFF);
560 if (hmm.getMSV() != null)
562 output.append(String.format("%n%-19s %18s", "STATS LOCAL MSV",
565 output.append(String.format("%n%-19s %18s", "STATS LOCAL VITERBI",
568 output.append(String.format("%n%-19s %18s", "STATS LOCAL FORWARD",
574 * Appends 'yes' or 'no' for the given property, according to whether or not
575 * it is set in the HMM
579 * @param propertyName
581 private void appendBooleanProperty(StringBuilder output, String format,
584 boolean set = hmm.getBooleanProperty(propertyName);
585 output.append(String.format(format, propertyName,
586 set ? HiddenMarkovModel.YES : HiddenMarkovModel.NO));
590 * Appends the value of the given property to the output, if not null
594 * @param propertyName
596 private void appendProperty(StringBuilder output, String format,
599 String value = hmm.getProperty(propertyName);
602 output.append(String.format(format, propertyName, value));
607 * Returns the char value of a single lettered String.
612 char charValue(String string)
615 character = string.charAt(0);
620 public String print(SequenceI[] seqs, boolean jvsuffix)
622 if (seqs[0].getHMM() != null)
624 hmm = seqs[0].getHMM();
630 * Prints the .hmm file to a String.
634 public String print()
636 StringBuilder output = new StringBuilder();
637 appendProperties(output);
639 appendModelAsString(output);
640 output.append(NL + "//");
641 return output.toString();
645 * Converts the probabilities contained in an array into log space
649 double[] convertToLogSpace(double[] ds)
651 double[] converted = new double[ds.length];
652 for (int i = 0; i < ds.length; i++)
655 double logProb = -1 * Math.log(prob);
657 converted[i] = logProb;
663 * Returns the HMM sequence produced by reading a .hmm file.
666 public SequenceI[] getSeqsAsArray()
668 SequenceI hmmSeq = hmm.initHMMSequence();
669 SequenceI[] seq = new SequenceI[1];
675 public void setNewlineString(String newLine)
681 public void setExportSettings(AlignExportSettingI exportSettings)
687 public void configureForView(AlignmentViewPanel viewpanel)
693 public boolean hasWarningMessage()
699 public String getWarningMessage()
701 return "warning message";