3 import jalview.api.AlignExportSettingI;
4 import jalview.api.AlignmentViewPanel;
5 import jalview.datamodel.HMMNode;
6 import jalview.datamodel.HiddenMarkovModel;
7 import jalview.datamodel.SequenceI;
9 import java.io.BufferedReader;
10 import java.io.IOException;
11 import java.util.ArrayList;
12 import java.util.List;
13 import java.util.Scanner;
17 * Adds capability to read in and write out HMMER3 files.
23 public class HMMFile extends AlignFile
24 implements AlignmentFileReaderI, AlignmentFileWriterI
26 private static final String TERMINATOR = "//";
29 * keys to data in HMM file, used to store as properties of the HiddenMarkovModel
31 public static final String HMM = "HMM";
33 public static final String NAME = "NAME";
35 public static final String ACCESSION_NUMBER = "ACC";
37 public static final String DESCRIPTION = "DESC";
39 public static final String LENGTH = "LENG";
41 public static final String MAX_LENGTH = "MAXL";
43 public static final String ALPHABET = "ALPH";
45 public static final String DATE = "DATE";
47 public static final String COMMAND_LOG = "COM";
49 public static final String NUMBER_OF_SEQUENCES = "NSEQ";
51 public static final String EFF_NUMBER_OF_SEQUENCES = "EFFN";
53 public static final String CHECK_SUM = "CKSUM";
55 public static final String STATISTICS = "STATS";
57 public static final String COMPO = "COMPO";
59 public static final String GATHERING_THRESHOLD = "GA";
61 public static final String TRUSTED_CUTOFF = "TC";
63 public static final String NOISE_CUTOFF = "NC";
65 public static final String VITERBI = "VITERBI";
67 public static final String MSV = "MSV";
69 public static final String FORWARD = "FORWARD";
71 public static final String MAP = "MAP";
73 public static final String REFERENCE_ANNOTATION = "RF";
75 public static final String CONSENSUS_RESIDUE = "CONS";
77 public static final String CONSENSUS_STRUCTURE = "CS";
79 public static final String MASKED_VALUE = "MM";
81 private static final String ALPH_AMINO = "amino";
83 private static final String ALPH_DNA = "DNA";
85 private static final String ALPH_RNA = "RNA";
87 private static final String ALPHABET_AMINO = "ACDEFGHIKLMNPQRSTVWY";
89 private static final String ALPHABET_DNA = "ACGT";
91 private static final String ALPHABET_RNA = "ACGU";
93 private static final int NUMBER_OF_TRANSITIONS = 7;
95 private static final String SPACE = " ";
98 * optional guide line added to an output HMMER file, purely for readability
100 private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d";
102 private static String NL = System.lineSeparator();
104 private HiddenMarkovModel hmm;
106 // number of symbols in the alphabet used in the hidden Markov model
107 private int numberOfSymbols;
110 * Constructor that parses immediately
114 * @throws IOException
116 public HMMFile(String inFile, DataSourceType type) throws IOException
122 * Constructor that parses immediately
125 * @throws IOException
127 public HMMFile(FileParse source) throws IOException
133 * Default constructor
140 * Constructor for HMMFile used for exporting
143 * @param exportImmediately
145 public HMMFile(HiddenMarkovModel markov)
151 * Returns the HMM produced by parsing a HMMER3 file
155 public HiddenMarkovModel getHMM()
161 * Gets the name of the hidden Markov model
// Delegates to the wrapped model's getName().
// NOTE(review): hmm is dereferenced without a null check and the field has no
// initialiser; confirm a model is always assigned before this is called.
165 public String getName()
167 return hmm.getName();
171 * Reads the data from HMM file into the HMM model
178 hmm = new HiddenMarkovModel();
179 parseHeaderLines(dataIn);
181 } catch (Exception e)
188 * Reads the header properties from a HMMER3 file and saves them in the
189 * HiddenMarkovModel. This method exits after reading the next line after the
193 * @throws IOException
// Reads header lines until a line whose first token is "HMM" is seen, or the
// input ends. The first line of input is stored verbatim as the file header;
// every later line is tokenised on whitespace and its first token selects one
// of the branches below.
// NOTE(review): parser.next() throws NoSuchElementException on a blank or
// whitespace-only line and no guard is visible here; confirm such lines
// cannot occur in the header section.
195 void parseHeaderLines(BufferedReader input) throws IOException
197 boolean readingHeaders = true;
198 hmm.setFileHeader(input.readLine());
199 String line = input.readLine();
200 while (readingHeaders && line != null)
// one Scanner per header line; no close() call is visible in this method
202 Scanner parser = new Scanner(line);
203 String next = parser.next();
204 if (ALPHABET.equals(next))
// ALPH line: record the declared alphabet type as a property, then choose a
// symbol string by case-insensitive comparison with the known type names
206 String alphabetType = parser.next();
207 hmm.setProperty(ALPHABET, alphabetType);
208 String alphabet = ALPH_DNA.equalsIgnoreCase(alphabetType)
210 : (ALPH_RNA.equalsIgnoreCase(alphabetType) ? ALPHABET_RNA
212 numberOfSymbols = hmm.setAlphabet(alphabet);
214 else if (HMM.equals(next))
// HMM line ends the header section; the symbols are whatever follows the
// first occurrence of "HMM" on the line. setAlphabet is called again here and
// numberOfSymbols is reassigned from its result.
216 readingHeaders = false;
217 String symbols = line.substring(line.indexOf(HMM) + HMM.length());
218 numberOfSymbols = hmm.setAlphabet(symbols);
220 else if (STATISTICS.equals(next))
// STATS line: the stored value is the next two tokens joined by two spaces
// (the key is built on lines not shown here)
226 value = parser.next() + SPACE + SPACE + parser.next();
227 hmm.setProperty(key, value);
// remaining case: all remaining tokens are re-joined with single spaces, so
// runs of whitespace in the original value are collapsed
232 String value = parser.next();
233 while (parser.hasNext())
235 value = value + SPACE + parser.next();
237 hmm.setProperty(key, value);
// advance to the next input line; presumably this also runs after the HMM
// line has been handled, leaving the reader one line past it - TODO confirm
240 line = input.readLine();
245 * Parses the model data from the HMMER3 file. The input buffer should be
246 * positioned at the (optional) COMPO line if there is one, else at the insert
247 * emissions line for the BEGIN node of the model.
250 * @throws IOException
// Reads node records until a line exactly equal to "//" or end of input. Each
// pass creates one HMMNode and fills it from a match-emission line (when
// applicable), an insert-emission line and a state-transition line.
// NOTE(review): the terminator test is an exact equals(), so a "//" line with
// trailing whitespace would not stop the loop; confirm that is acceptable.
252 void parseModel(BufferedReader input) throws IOException
255 * specification says there must always be an HMM header (already read)
256 * and one more header (guide headings) which is skipped here
259 String line = input.readLine();
260 while (line != null && !TERMINATOR.equals(line))
262 HMMNode node = new HMMNode();
264 Scanner scanner = new Scanner(line);
265 String next = scanner.next();
268 * expect COMPO (optional) for average match emissions
269 * or a node number followed by node's match emissions
// nodeNo is declared and updated on lines not shown here
271 if (COMPO.equals(next) || nodeNo > 0)
274 * parse match emissions
276 double[] matches = parseDoubles(scanner, numberOfSymbols);
277 node.setMatchEmissions(matches);
278 if (!COMPO.equals(next))
// numbered node: the rest of the match line carries the annotations
280 int column = parseAnnotations(scanner, node);
284 * no MAP annotation provided, just number off from 0 (begin node)
288 hmm.setAlignmentColumn(node, column - 1); // node 1 <==> column 0
// NOTE(review): the line read here is passed to new Scanner(...) below with no
// visible null check; a truncated file would raise NullPointerException
// rather than IOException - confirm against the missing lines
290 line = input.readLine();
295 * parse insert emissions
297 scanner = new Scanner(line);
298 double[] inserts = parseDoubles(scanner, numberOfSymbols);
299 node.setInsertEmissions(inserts);
303 * parse state transitions
// same unguarded readLine-then-Scanner pattern as for the insert line
305 line = input.readLine();
306 scanner = new Scanner(line);
307 double[] transitions = parseDoubles(scanner,
308 NUMBER_OF_TRANSITIONS);
309 node.setStateTransitions(transitions);
// move on to the next node record (or the terminator)
311 line = input.readLine();
318 * Parses the annotations on the match emission line and adds them to the node.
319 * (See p109 of the HMMER User Guide (V3.1b2) for the specification.) Returns
320 * the alignment column number (base 1) that the node maps to, if provided,
// Consumes up to five further whitespace-separated tokens from the scanner,
// in this fixed order: MAP column, consensus residue, reference annotation,
// mask value, consensus structure. Each is read only if a token remains, and
// for the four character-valued fields only the token's first character is
// stored on the node.
326 int parseAnnotations(Scanner scanner, HMMNode node)
329 * map from hmm node to alignment column index, if provided
330 * HMM counts columns from 1, convert to base 0 for Jalview
334 if (scanner.hasNext())
336 value = scanner.next();
// "-" is treated as "no column supplied" and is not parsed as a number
337 if (!"-".equals(value))
// the parsed value is kept base 1 in 'column'; the node receives column - 1
341 column = Integer.parseInt(value);
342 node.setAlignmentColumn(column - 1);
// a non-numeric token is caught here; the handler body is not shown
343 } catch (NumberFormatException e)
351 * hmm consensus residue if provided, else '-'
353 if (scanner.hasNext())
355 node.setConsensusResidue(scanner.next().charAt(0));
359 * RF reference annotation, if provided, else '-'
361 if (scanner.hasNext())
363 node.setReferenceAnnotation(scanner.next().charAt(0));
367 * 'm' for masked position, if provided, else '-'
369 if (scanner.hasNext())
371 node.setMaskValue(scanner.next().charAt(0));
375 * structure consensus symbol, if provided, else '-'
377 if (scanner.hasNext())
379 node.setConsensusStructure(scanner.next().charAt(0));
386 * Fills an array of doubles parsed from an input line
389 * @param numberOfElements
391 * @throws IOException
// Reads exactly numberOfElements tokens from the scanner into a new array.
// Throws IOException("Incomplete data") as soon as a token is missing.
// NOTE(review): Double.valueOf throws the unchecked NumberFormatException for
// a malformed token, so bad numeric data does not surface as IOException.
393 static double[] parseDoubles(Scanner input,
394 int numberOfElements) throws IOException
396 double[] values = new double[numberOfElements];
397 for (int i = 0; i < numberOfElements; i++)
399 if (!input.hasNext())
401 throw new IOException("Incomplete data");
403 String next = input.next();
// any token containing '*' is stored as negative infinity
// NOTE(review): the other branch computes e^-x (a probability), so this
// sentinel is on a different scale - confirm downstream code expects it
404 if (next.contains("*"))
406 values[i] = Double.NEGATIVE_INFINITY;
// file value x is converted with e^(-x); the store into values[i] and the
// return statement are on lines not shown here
410 double prob = Double.valueOf(next);
411 prob = Math.pow(Math.E, -prob);
419 * Returns a string to be added to the StringBuilder containing the entire
422 * @param initialColumnSeparation
423 * The initial whitespace separation between the left side of the
424 * file and first character.
425 * @param columnSeparation
426 * The separation between subsequent data entries.
428 * The list of data to be added to the String.
// Builds one output line by right-aligning each value in a fixed-width field:
// the first value uses initialColumnSeparation as its width, every later
// value uses columnSeparation ("%<width>s" pads on the left). A value longer
// than its field is not truncated by this format.
// NOTE(review): the widths go straight into the format string, so zero or
// negative values would produce a different or invalid specifier; the visible
// callers pass 10 or 17 and 9.
431 String addData(int initialColumnSeparation,
432 int columnSeparation, List<String> data)
435 boolean first = true;
436 for (String value : data)
438 int sep = first ? initialColumnSeparation : columnSeparation;
// string concatenation inside the loop; 'line' is declared on a line not shown
439 line += String.format("%" + sep + "s", value);
446 * Converts list of characters into a list of Strings.
449 * @return Returns the list of Strings.
// Produces a new ArrayList holding the one-character String for each element
// of the input, in iteration order.
// NOTE(review): iterating as 'char' unboxes each Character, so a null element
// in the list would raise NullPointerException.
451 List<String> charListToStringList(List<Character> list)
453 List<String> strList = new ArrayList<>();
454 for (char value : list)
456 String strValue = Character.toString(value);
457 strList.add(strValue);
463 * Converts an array of doubles into a list of Strings, rounded to the nearest
467 * @param noOfDecimals
// Converts each double to text and collects the results in a new ArrayList,
// in array order. Only two of the branches are visible here; the conditions
// and any further branch are on lines not shown.
470 List<String> doublesToStringList(double[] doubles)
472 List<String> strList = new ArrayList<>();
473 for (double value : doubles)
// five decimal places
// NOTE(review): String.format without an explicit Locale uses the default
// locale, which may emit ',' as the decimal separator - confirm whether the
// output must be locale-independent
478 strValue = String.format("%.5f", value);
// '==' compares numerically, so this matches both 0.0 and -0.0
480 else if (value == -0.00000d)
482 strValue = "0.00000";
488 strList.add(strValue);
494 * Appends model data in string format to the string builder
// Writes the model section: the "HMM" line listing each alphabet symbol
// right-aligned in a 9-wide field, the transition guide line, and then, for
// node numbers 0..length inclusive, three lines per node (match emissions,
// insert emissions, state transitions). All values pass through
// convertToLogSpace and doublesToStringList before being laid out by addData.
498 void appendModelAsString(StringBuilder output)
500 output.append(HMM).append(" ");
501 String charSymbols = hmm.getSymbols();
502 for (char c : charSymbols.toCharArray())
504 output.append(String.format("%9s", c));
506 output.append(NL).append(TRANSITIONTYPELINE);
508 int length = hmm.getLength();
510 for (int nodeNo = 0; nodeNo <= length; nodeNo++)
// match line label, right-aligned in 7 columns: "COMPO" for node 0, otherwise
// the node number
512 String matchLine = String.format("%7s",
513 nodeNo == 0 ? COMPO : Integer.toString(nodeNo));
515 double[] doubleMatches = convertToLogSpace(
516 hmm.getNode(nodeNo).getMatchEmissions());
517 List<String> strMatches = doublesToStringList(doubleMatches);
518 matchLine += addData(10, 9, strMatches);
// annotations follow the emissions; the column is written base 1 (+1). The
// condition guarding these lines is not shown here.
522 matchLine += SPACE + (hmm.getNodeAlignmentColumn(nodeNo) + 1);
523 matchLine += SPACE + hmm.getConsensusResidue(nodeNo);
524 matchLine += SPACE + hmm.getReferenceAnnotation(nodeNo);
// NOTE(review): getFileHeader() is dereferenced without a null check; confirm
// a header is always set, including for models not read from a file
525 if (hmm.getFileHeader().contains("HMMER3/f"))
527 matchLine += SPACE + hmm.getMaskedValue(nodeNo);
528 matchLine += SPACE + hmm.getConsensusStructure(nodeNo);
532 output.append(NL).append(matchLine);
// insert emissions: first field 17 wide, later fields 9 wide
534 String insertLine = "";
536 double[] doubleInserts = convertToLogSpace(
537 hmm.getNode(nodeNo).getInsertEmissions());
538 List<String> strInserts = doublesToStringList(doubleInserts);
539 insertLine += addData(17, 9, strInserts);
541 output.append(NL).append(insertLine);
// state transitions use the same layout as the insert line
543 String transitionLine = "";
544 double[] doubleTransitions = convertToLogSpace(
545 hmm.getNode(nodeNo).getStateTransitions());
546 List<String> strTransitions = doublesToStringList(
548 transitionLine += addData(17, 9, strTransitions);
550 output.append(NL).append(transitionLine);
555 * Appends formatted HMM file properties to the string builder
// Writes the stored file header followed by the header properties in the
// fixed order listed below. Each entry is produced with "%n%-5s %1s": a line
// break, the key left-justified in 5 columns, a space, then the value.
// NOTE(review): "%n" always emits the platform line separator, whereas the
// model section is written with the NL field; if NL can be changed elsewhere
// the two could disagree - confirm.
559 void appendProperties(StringBuilder output)
561 output.append(hmm.getFileHeader());
563 String format = "%n%-5s %1s";
564 appendProperty(output, format, NAME);
565 appendProperty(output, format, ACCESSION_NUMBER);
566 appendProperty(output, format, DESCRIPTION);
567 appendProperty(output, format, LENGTH);
568 appendProperty(output, format, MAX_LENGTH);
569 appendProperty(output, format, ALPHABET);
// flag properties are written through appendBooleanProperty (yes/no values)
570 appendBooleanProperty(output, format, REFERENCE_ANNOTATION);
571 appendBooleanProperty(output, format, MASKED_VALUE);
572 appendBooleanProperty(output, format, CONSENSUS_RESIDUE);
573 appendBooleanProperty(output, format, CONSENSUS_STRUCTURE);
574 appendBooleanProperty(output, format, MAP);
575 appendProperty(output, format, DATE);
576 appendProperty(output, format, NUMBER_OF_SEQUENCES);
577 appendProperty(output, format, EFF_NUMBER_OF_SEQUENCES);
578 appendProperty(output, format, CHECK_SUM);
579 appendProperty(output, format, GATHERING_THRESHOLD);
580 appendProperty(output, format, TRUSTED_CUTOFF);
581 appendProperty(output, format, NOISE_CUTOFF);
// the STATS lines use a wider layout (19-column label, 18-column value) and
// depend on the MSV value being present; whether VITERBI and FORWARD are
// checked separately cannot be seen from the lines shown
583 if (hmm.getMSV() != null)
585 format = "%n%-19s %18s";
586 output.append(String.format(format, "STATS LOCAL MSV", hmm.getMSV()));
588 output.append(String.format(format, "STATS LOCAL VITERBI",
591 output.append(String.format(format, "STATS LOCAL FORWARD",
597 * Appends 'yes' or 'no' for the given property, according to whether or not
598 * it is set in the HMM
602 * @param propertyName
// Formats the property name together with HiddenMarkovModel.YES or
// HiddenMarkovModel.NO, chosen by hmm.getBooleanProperty(propertyName), and
// appends the result to output. In the lines shown the entry is written
// whichever value is chosen.
604 private void appendBooleanProperty(StringBuilder output, String format,
607 boolean set = hmm.getBooleanProperty(propertyName);
608 output.append(String.format(format, propertyName,
609 set ? HiddenMarkovModel.YES : HiddenMarkovModel.NO));
613 * Appends the value of the given property to the output, if not null
617 * @param propertyName
// Looks up the property value on the model and appends it, formatted with the
// supplied format string, to output.
// NOTE(review): the accompanying doc says the value is written only when not
// null; that guard is on a line not shown here - confirm it is present, since
// String.format would otherwise print the text "null".
619 private void appendProperty(StringBuilder output, String format,
622 String value = hmm.getProperty(propertyName);
625 output.append(String.format(format, propertyName, value));
// If the first sequence carries an HMM, adopts it as this file's model. Only
// element 0 of the array is examined in the lines shown, and jvsuffix is not
// used in them; the return statement is on a line not shown.
// NOTE(review): sequences[0] is accessed without a null or length check, so a
// null or empty array raises an unchecked exception - confirm callers always
// pass at least one sequence.
630 public String print(SequenceI[] sequences, boolean jvsuffix)
632 if (sequences[0].getHMM() != null)
634 hmm = sequences[0].getHMM();
640 * Prints the .hmm file to a String.
// Assembles the complete file text in a StringBuilder: header properties
// first, then the model section, then the "//" terminator with the NL line
// separator before and after it.
644 public String print()
646 StringBuilder output = new StringBuilder();
647 appendProperties(output);
649 appendModelAsString(output);
650 output.append(NL).append(TERMINATOR).append(NL);
651 return output.toString();
655 * Converts the probabilities contained in an array into log space
// Returns a new array of the same length in which each entry is the negative
// natural logarithm of a value 'prob'; the input array is not modified in the
// lines shown. 'prob' is assigned on a line not shown - presumably ds[i].
// Per Math.log: a prob of 0 yields positive infinity here, and a negative
// prob (including negative infinity) yields NaN.
659 double[] convertToLogSpace(double[] ds)
661 double[] converted = new double[ds.length];
662 for (int i = 0; i < ds.length; i++)
665 double logProb = -1 * Math.log(prob);
667 converted[i] = logProb;
673 * Returns the HMM sequence produced by reading a .hmm file.
// Asks the model to build its HMM sequence via initHMMSequence() and
// allocates a one-element array; storing the sequence in the array and
// returning it happen on lines not shown here.
// NOTE(review): hmm is dereferenced without a null check - confirm a model
// has always been parsed or supplied before this is called.
676 public SequenceI[] getSeqsAsArray()
678 SequenceI hmmSeq = hmm.initHMMSequence();
679 SequenceI[] seq = new SequenceI[1];
685 public void setNewlineString(String newLine)
691 public void setExportSettings(AlignExportSettingI exportSettings)
697 public void configureForView(AlignmentViewPanel viewpanel)
703 public boolean hasWarningMessage()
// Always returns the fixed literal below.
// NOTE(review): this looks like a placeholder rather than a real diagnostic;
// confirm whether callers ever display it.
709 public String getWarningMessage()
711 return "warning message";