3 import jalview.datamodel.HMMNode;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
7 import java.io.BufferedReader;
8 import java.io.IOException;
9 import java.util.ArrayList;
10 import java.util.List;
11 import java.util.Scanner;
15 * Adds capability to read in and write out HMMER3 files.
21 public class HMMFile extends AlignFile
22 implements AlignmentFileReaderI, AlignmentFileWriterI
24 // HMM to store file data
25 private HiddenMarkovModel hmm;
27 // number of possible transitions
// Passed to fillList as the element count when a state transition line is parsed.
28 private static final int NUMBER_OF_TRANSITIONS = 7;
// Line separator prepended to each line appended while the file text is built.
30 private static final String NL = "\n";
32 //number of symbols in the alphabet used in the hidden Markov model
// NOTE(review): the field described by the comment above is not visible in this excerpt.
// Separator placed between tokens when property values and match-line annotations are assembled.
35 private final String SPACE = " ";
// Token compared against the first token of a match emission line in parseModel.
37 private final String COMPO = "COMPO";
// Starting value of the insert and transition lines built in getModelAsString.
39 private final String EMPTY = "";
41 // This line needs to be added to each HMMER3 file. It is purely for readability.
42 private static final String TRANSITIONTYPELINE = " m->m m->i m->d i->m i->i d->m d->d";
51 public HMMFile(String inFile, DataSourceType type) throws IOException
62 public HMMFile(FileParse source) throws IOException
68 * Default constructor, do not use!
76 * Constructor for HMMFile used for exporting.
79 * @param exportImmediately
81 public HMMFile(HiddenMarkovModel markov)
87 * For testing, do not use.
91 HMMFile(BufferedReader br)
97 * Returns the HMM produced by reading in a HMMER3 file.
101 public HiddenMarkovModel getHMM()
107 * Sets the HMM used in this file.
111 public void setHMM(HiddenMarkovModel model)
117 * Gets the name of the hidden Markov model.
// Delegates to the wrapped model: the result is whatever hmm.getName() returns.
121 public String getName()
123 return hmm.getName();
127 * Reads the data from HMM file into the HMM field on this object.
129 * @throws IOException
// Replaces the current model with a new HiddenMarkovModel and reads the file
// properties from dataIn.
// NOTE(review): dataIn is not declared in this excerpt - presumably inherited from AlignFile; confirm.
// NOTE(review): any statements after parseFileProperties are not visible here.
132 public void parse() throws IOException
134 hmm = new HiddenMarkovModel();
135 parseFileProperties(dataIn);
140 * Reads the data from HMM file into the HMM field on this object.
142 * @throws IOException
// Same as parse(), but reads from the supplied reader br instead of dataIn.
// Any model previously held in hmm is discarded.
// NOTE(review): any statements after parseFileProperties are not visible here.
145 public void parse(BufferedReader br) throws IOException
147 hmm = new HiddenMarkovModel();
148 parseFileProperties(br);
155 * Imports the file properties from a HMMER3 file.
158 * The buffered reader used to read in the file.
159 * @throws IOException
// Reads the header and property lines from input into the hmm field.
// NOTE(review): several lines of this method (loop header, braces, the assignment
// of key) are not visible in this excerpt; the comments below describe only the
// visible statements.
161 void parseFileProperties(BufferedReader input) throws IOException
163 boolean readingFile = true;
// The first line of the input is stored verbatim as the file header.
164 hmm.setFileHeader(input.readLine());
165 String line = input.readLine();
// The line is split into tokens by a Scanner; the first token selects the branch.
// NOTE(review): no null check on line is visible before the Scanner is built -
// confirm how end of input is handled.
170 Scanner parser = new Scanner(line);
171 String next = parser.next();
172 if ("HMM".equals(next)) // indicates start of HMM data (end of file
// Caches the symbol count from the model; parseModel passes it to fillList.
177 numberOfSymbols = hmm.getNumberOfSymbols();
// STATS lines: the value is the next two tokens joined by two spaces.
179 else if ("STATS".equals(next))
185 value = parser.next() + SPACE + SPACE + parser.next();
186 hmm.addFileProperty(key, value);
// Remaining visible branch: all remaining tokens are joined by single spaces.
191 String value = parser.next();
192 while (parser.hasNext())
194 value = value + SPACE + parser.next();
196 hmm.addFileProperty(key, value);
// Advance to the next input line.
200 line = input.readLine();
210 * Parses the model data from the HMMER3 file
213 * The buffered reader used to read the file.
214 * @throws IOException
// Reads node data from input until a line equal to "//" is reached. For each node
// it stores match emissions, insert emissions and state transitions on an HMMNode.
// NOTE(review): the declaration and increment of node, and the braces delimiting
// the if-block, are not visible in this excerpt.
216 void parseModel(BufferedReader input) throws IOException
218 String line = input.readLine();
// NOTE(review): if readLine returns null before "//" is seen, this condition stays
// true and new Scanner(line) below is given null - confirm whether a guard exists
// in the lines not shown.
220 while (!"//".equals(line))
// A new node is appended on every pass of the loop.
222 hmm.getNodes().add(new HMMNode());
224 Scanner matchReader = new Scanner(line);
225 next = matchReader.next();
// The match emission line is accepted when it starts with COMPO or when node > 0.
226 if (next.equals(COMPO) || node > 0)
228 // stores match emission line in list
// The list created on the next line is immediately replaced by fillList's result.
229 List<Double> matches = new ArrayList<>();
230 matches = fillList(matchReader, numberOfSymbols);
231 hmm.getNodes().get(node).setMatchEmissions(matches);
// The tokens left on the match line after the emissions are read as annotations.
234 parseAnnotations(matchReader, node);
238 // stores insert emission line in list
239 line = input.readLine();
240 Scanner insertReader = new Scanner(line);
241 List<Double> inserts = new ArrayList<>();
242 inserts = fillList(insertReader, numberOfSymbols);
243 hmm.getNodes().get(node).setInsertEmissions(inserts);
244 insertReader.close();
246 // stores state transition line in list
247 line = input.readLine();
248 Scanner transitionReader = new Scanner(line);
249 List<Double> transitions = new ArrayList<>();
// Transition lines always hold NUMBER_OF_TRANSITIONS values.
250 transitions = fillList(transitionReader, NUMBER_OF_TRANSITIONS);
251 hmm.getNodes().get(node).setStateTransitions(transitions);
252 transitionReader.close();
// Read the line that the loop condition tests next.
253 line = input.readLine();
260 * Parses the annotations on the match emission line.
263 * The scanner which is processing match emission line.
265 * The index of node which is being scanned.
// Consumes the remaining tokens of a match emission line in a fixed order and
// stores each on node index: alignment column, consensus residue, reference
// annotation, mask value, consensus structure. Each is read only if a token remains.
// NOTE(review): local declarations and any else-branches are not visible in this excerpt.
267 void parseAnnotations(Scanner scanner, int index)
// The column token is only read as an int when the model reports the map as active.
269 if (hmm.mapIsActive() && scanner.hasNext())
272 column = scanner.nextInt();
// The value stored is column - 1 - presumably converting a 1-based file column to
// a 0-based index; confirm. The same value is the key in the node lookup.
273 hmm.getNodes().get(index).setAlignmentColumn(column - 1);
274 hmm.getNodeLookup().put(column - 1, index);
281 if (scanner.hasNext())
// charValue keeps only the first character of the token.
284 consensusR = charValue(scanner.next());
285 hmm.getNodes().get(index).setConsensusResidue(consensusR);
288 if (scanner.hasNext())
291 reference = charValue(scanner.next());
292 hmm.getNodes().get(index).setReferenceAnnotation(reference);
295 if (scanner.hasNext())
298 value = charValue(scanner.next());
299 hmm.getNodes().get(index).setMaskValue(value);
301 if (scanner.hasNext())
304 consensusS = charValue(scanner.next());
305 hmm.getNodes().get(index).setConsensusStructure(consensusS);
312 * Fills a list of doubles based on an input line.
315 * The scanner for the line containing the data to be transferred to
317 * @param numberOfElements
318 * The number of elements in the list to be filled.
319 * @return filled list Returns the list of doubles.
320 * @throws IOException
// Reads numberOfElements tokens from input and converts each to a double.
// Throws IOException("Incomplete data") if the list ends up shorter than requested.
// NOTE(review): the statement adding prob to the list and the return statement are
// not visible in this excerpt.
322 static List<Double> fillList(Scanner input,
323 int numberOfElements) throws IOException
325 List<Double> list = new ArrayList<>();
326 for (int i = 0; i < numberOfElements; i++)
// NOTE(review): Scanner.next() throws NoSuchElementException when no token is left,
// so a short line may fail here before the size check below - confirm against the
// lines not shown.
329 String next = input.next();
330 if (next.contains("*")) // state transitions to or from delete states
331 // occasionally have values of -infinity. These
332 // values are represented by an * in the .hmm
335 list.add(Double.NEGATIVE_INFINITY);
// Any other token is parsed as a double x and converted to e^(-x).
339 double prob = Double.valueOf(next);
340 prob = Math.pow(Math.E, -prob);
344 if (list.size() < numberOfElements)
346 throw new IOException("Incomplete data");
352 * Returns a string to be added to the StringBuilder containing the entire
355 * @param initialColumnSeparation
356 * The initial whitespace separation between the left side of the
357 * file and first character.
358 * @param columnSeparation
359 * The separation between subsequent data entries.
361 * The list of data to be added to the String.
// Builds one line of text by appending every entry of data, each formatted with
// "%<width>s" (right-justified, space-padded to the given width).
// NOTE(review): the declaration of line, the condition choosing between the two
// widths, and the return statement are not visible in this excerpt.
364 String addData(int initialColumnSeparation,
365 int columnSeparation, List<String> data)
369 for (String value : data)
// Field width taken from initialColumnSeparation.
373 line += String.format("%" + initialColumnSeparation + "s", value);
// Field width taken from columnSeparation.
377 line += String.format("%" + columnSeparation + "s", value);
385 * Converts list of characters into a list of Strings.
388 * @return Returns the list of Strings.
// Produces a list holding, for each character of list in order, its one-character String.
// NOTE(review): the return statement is not visible in this excerpt.
390 List<String> charListToStringList(List<Character> list)
392 List<String> strList = new ArrayList<>();
// Iterating as char unboxes each element, so a null element would throw here.
393 for (char value : list)
395 String strValue = Character.toString(value);
396 strList.add(strValue);
402 * Converts a list of doubles into a list of Strings, rounded to the nearest
406 * @param noOfDecimals
// Converts each double in list to its text form and collects the results in order.
// The signature has no noOfDecimals parameter; the visible format is fixed at five
// decimal places.
// NOTE(review): the conditions selecting each branch, any further branches, and the
// return statement are not visible in this excerpt.
409 List<String> doubleListToStringList(List<Double> list)
411 List<String> strList = new ArrayList<>();
412 for (double value : list)
// Fixed-point text with five digits after the decimal point.
417 strValue = String.format("%.5f", value);
// A zero value is written as "0.00000"; this comparison is true for both 0.0 and
// -0.0, so negative zero never produces a minus sign in this branch.
420 else if (value == -0.00000d)
422 strValue = "0.00000";
429 strList.add(strValue);
435 * Converts a primitive array of Strings to a list of Strings.
// Creates a new ArrayList and iterates over every element of array.
// NOTE(review): the loop body and the return statement are not visible in this
// excerpt - presumably each value is added to list, which is then returned; confirm.
440 List<String> stringArrayToStringList(String[] array)
442 List<String> list = new ArrayList<>();
443 for (String value : array)
452 * Returns a string containing the model data.
// Builds the model section of the file as text: a symbol header line starting with
// "HMM", the transition-type line, then a match, insert and transition line per node.
// NOTE(review): some declarations, conditions and braces are not visible in this excerpt.
454 String getModelAsString()
456 StringBuffer output = new StringBuffer();
// Header line: "HMM" followed by the model's symbols, laid out by addData.
457 String symbolLine = "HMM";
458 List<Character> charSymbols = hmm.getSymbols();
459 List<String> strSymbols;
460 strSymbols = charListToStringList(charSymbols);
461 symbolLine += addData(11, 9, strSymbols);
462 output.append(symbolLine);
463 output.append(NL + TRANSITIONTYPELINE);
465 int length = hmm.getLength();
// The bound is inclusive, so nodes 0..length are written (length + 1 entries).
467 for (int node = 0; node <= length; node++)
// The match line label is either "COMPO" or the node number, right-justified in 7
// columns; the condition choosing between them is not visible here.
472 matchLine = String.format("%7s", "COMPO");
476 matchLine = String.format("%7s", node);
479 List<String> strMatches;
480 List<Double> doubleMatches;
// Stored values are passed through convertListToLogSpace before being formatted.
481 doubleMatches = convertListToLogSpace(
482 hmm.getNode(node).getMatchEmissions());
483 strMatches = doubleListToStringList(doubleMatches);
484 matchLine += addData(10, 9, strMatches);
// Annotations appended to the match line; the alignment column is written as the
// stored value + 1, mirroring the column - 1 applied in parseAnnotations.
489 matchLine += SPACE + (hmm.getNodeAlignmentColumn(node) + 1);
490 matchLine += SPACE + hmm.getConsensusResidue(node);
491 matchLine += SPACE + hmm.getReferenceAnnotation(node);
// Mask value and consensus structure are only written when the file header contains "HMMER3/f".
492 if (hmm.getFileHeader().contains("HMMER3/f"))
494 matchLine += SPACE + hmm.getMaskedValue(node);
495 matchLine += SPACE + hmm.getConsensusStructure(node);
500 output.append(NL + matchLine);
// Insert emission line for this node.
502 String insertLine = EMPTY;
503 List<String> strInserts;
504 List<Double> doubleInserts;
505 doubleInserts = convertListToLogSpace(
506 hmm.getNode(node).getInsertEmissions());
507 strInserts = doubleListToStringList(doubleInserts);
508 insertLine += addData(17, 9, strInserts);
510 output.append(NL + insertLine);
// State transition line for this node.
512 String transitionLine = EMPTY;
513 List<String> strTransitions;
514 List<Double> doubleTransitions;
515 doubleTransitions = convertListToLogSpace(
516 hmm.getNode(node).getStateTransitions());
517 strTransitions = doubleListToStringList(doubleTransitions);
518 transitionLine += addData(17, 9, strTransitions);
520 output.append(NL + transitionLine);
522 return output.toString();
526 * Returns a String containing the HMM file properties
// Builds the property section of the file as text: the stored file header first,
// then one "KEY value" line per property, each preceded by NL.
// Most properties are wrapped in a null check and omitted when the getter returns null.
// NOTE(review): declarations of line, status and statusStr, the trailing arguments
// of several String.format calls, and some braces are not visible in this excerpt.
528 String getFilePropertiesAsString()
530 StringBuffer output = new StringBuffer();
533 output.append(hmm.getFileHeader());
// No null check is visible around NAME.
535 line = String.format("%-5s %1s", "NAME", hmm.getName());
536 output.append(NL + line);
538 if (hmm.getAccessionNumber() != null)
540 line = String.format("%-5s %1s", "ACC", hmm.getAccessionNumber());
541 output.append(NL + line);
544 if (hmm.getDescription() != null)
546 line = String.format("%-5s %1s", "DESC", hmm.getDescription());
547 output.append(NL + line);
// No null check is visible around LENG.
549 line = String.format("%-5s %1s", "LENG", hmm.getLength());
550 output.append(NL + line);
552 if (hmm.getMaxInstanceLength() != null)
554 line = String.format("%-5s %1s", "MAXL", hmm.getMaxInstanceLength());
555 output.append(NL + line);
// No null check is visible around ALPH.
557 line = String.format("%-5s %1s", "ALPH", hmm.getAlphabetType());
558 output.append(NL + line);
// Annotation flags (RF, MM, CONS, CS, MAP): each boolean is converted to text by
// HiddenMarkovModel.findStringFromBoolean before being formatted.
563 status = hmm.referenceAnnotationIsActive();
564 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
565 line = String.format("%-5s %1s", "RF",
567 output.append(NL + line);
569 status = hmm.maskValueIsActive();
570 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
571 line = String.format("%-5s %1s", "MM",
573 output.append(NL + line);
575 status = hmm.consensusResidueIsActive();
576 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
577 line = String.format("%-5s %1s", "CONS",
579 output.append(NL + line);
581 status = hmm.consensusStructureIsActive();
582 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
583 line = String.format("%-5s %1s", "CS",
585 output.append(NL + line);
587 status = hmm.mapIsActive();
588 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
589 line = String.format("%-5s %1s", "MAP",
591 output.append(NL + line);
594 if (hmm.getDate() != null)
596 line = String.format("%-5s %1s", "DATE", hmm.getDate());
597 output.append(NL + line);
599 if (hmm.getNumberOfSequences() != null)
601 line = String.format("%-5s %1s", "NSEQ", hmm.getNumberOfSequences());
602 output.append(NL + line);
604 if (hmm.getEffectiveNumberOfSequences() != null)
606 line = String.format("%-5s %1s", "EFFN",
607 hmm.getEffectiveNumberOfSequences());
608 output.append(NL + line);
610 if (hmm.getCheckSum() != null)
612 line = String.format("%-5s %1s", "CKSUM", hmm.getCheckSum());
613 output.append(NL + line);
615 if (hmm.getGatheringThreshold() != null)
617 line = String.format("%-5s %1s", "GA", hmm.getGatheringThreshold());
618 output.append(NL + line);
621 if (hmm.getTrustedCutoff() != null)
623 line = String.format("%-5s %1s", "TC", hmm.getTrustedCutoff());
624 output.append(NL + line);
626 if (hmm.getNoiseCutoff() != null)
628 line = String.format("%-5s %1s", "NC", hmm.getNoiseCutoff());
629 output.append(NL + line);
// STATS lines use a wider layout: label left-justified in 19 columns, value
// right-justified in 18.
// NOTE(review): only the MSV value is visibly null-checked; whether the VITERBI and
// FORWARD lines sit inside the same guard cannot be told from this excerpt.
631 if (hmm.getMSV() != null)
633 line = String.format("%-19s %18s", "STATS LOCAL MSV", hmm.getMSV());
634 output.append(NL + line);
636 line = String.format("%-19s %18s", "STATS LOCAL VITERBI",
638 output.append(NL + line);
640 line = String.format("%-19s %18s", "STATS LOCAL FORWARD",
642 output.append(NL + line);
644 return output.toString();
649 * Returns the char value of a single lettered String.
// Extracts the first character of string.
// NOTE(review): the declaration of character and the return statement are not
// visible in this excerpt.
654 char charValue(String string)
// charAt(0) throws StringIndexOutOfBoundsException for an empty string.
657 character = string.charAt(0);
663 public String print(SequenceI[] seqs, boolean jvsuffix)
669 * Prints the .hmm file to a String.
// Assembles the whole file text: file properties, then the model section, then a
// final line containing "//".
// NOTE(review): original line 677 is not visible in this excerpt - check whether a
// separator is appended between the properties and the model.
673 public String print()
675 StringBuffer output = new StringBuffer();
676 output.append(getFilePropertiesAsString());
678 output.append(getModelAsString());
// "//" is the same terminator that parseModel's loop condition looks for.
679 output.append(NL + "//");
680 return output.toString();
684 * Converts the probabilities contained in a list into log space.
// Returns a new list in which every value p of list is replaced by -ln(p).
// This is the inverse of the e^(-x) conversion applied in fillList. The input list
// is not modified by the visible statements.
688 List<Double> convertListToLogSpace(List<Double> list)
691 List<Double> convertedList = new ArrayList<>();
692 for (int i = 0; i < list.size(); i++)
694 double prob = list.get(i);
// Math.log is the natural logarithm.
695 double logProb = -1 * Math.log(prob);
697 convertedList.add(logProb);
699 return convertedList;
705 * Returns the HMM sequence produced by reading a .hmm file.
// Obtains a sequence from hmm.initHMMSequence() and allocates a one-element array.
// NOTE(review): the statements storing hmmSeq in the array and returning it are not
// visible in this excerpt.
708 public SequenceI[] getSeqsAsArray()
710 SequenceI hmmSeq = hmm.initHMMSequence();
711 SequenceI[] seq = new SequenceI[1];
718 * Fills symbol array and adds each symbol to an index lookup
721 * The scanner scanning the symbol line in the file.
// Reads every remaining token from parser; the first character of each token is
// appended to the model's symbol list and registered with index i.
// NOTE(review): the declaration and increment of i are not visible in this excerpt.
723 public void fillSymbols(Scanner parser)
726 while (parser.hasNext())
728 String strSymbol = parser.next();
// Only symbol[0] is used; any further characters in the token are ignored.
729 char[] symbol = strSymbol.toCharArray();
730 hmm.getSymbols().add(symbol[0]);
731 hmm.setSymbolIndex(symbol[0], i);