3 import jalview.datamodel.HMMNode;
4 import jalview.datamodel.HiddenMarkovModel;
5 import jalview.datamodel.SequenceI;
7 import java.io.BufferedReader;
8 import java.io.FileNotFoundException;
9 import java.io.IOException;
10 import java.io.PrintWriter;
11 import java.io.UnsupportedEncodingException;
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.Scanner;
18 * Adds capability to read in and write out HMMER3 files. Currently only supports HMMER3/f.
24 public class HMMFile extends AlignFile
25 implements AlignmentFileReaderI, AlignmentFileWriterI
27 // HMM to store file data
28 private HiddenMarkovModel hmm = new HiddenMarkovModel();
33 // number of possible transitions
34 private final int NUMBER_OF_TRANSITIONS = 7;
36 private final String NEW_LINE = "\n";
41 //number of symbols in the alphabet used in the hidden Markov model
44 private final String SPACE = " ";
46 private final String COMPO = "COMPO";
48 private final String EMPTY = "";
50 // This is a line that needs to be added to each HMMER3 file. It is purely for readability.
51 private static final String TRANSITIONTYPELINE = "m->m m->i m->d i->m i->i d->m d->d";
54 * Constructor for HMMFile
58 public HMMFile(FileParse source) throws IOException
64 * Default constructor, do not use!
72 * Returns the HMM produced by reading in a HMMER3 file.
76 public HiddenMarkovModel getHMM()
82 * Sets the HMM used in this file.
86 public void setHMM(HiddenMarkovModel model)
92 * Gets the name of the hidden Markov model.
96 public String getName()
102 * Reads the data from HMM file into the HMM field on this object.
104 * @throws IOException
107 public void parse() throws IOException
109 parseFileProperties(dataIn);
114 * Reads the data from HMM file into the HMM field on this object.
116 * @throws IOException
119 public void parse(BufferedReader br) throws IOException
121 parseFileProperties(br);
128 * Imports the file properties from a HMMER3 file.
131 * The buffered reader used to read in the file.
132 * @throws IOException
134 void parseFileProperties(BufferedReader input) throws IOException
136 boolean readingFile = true;
137 fileHeader = input.readLine();
138 String line = input.readLine();
143 Scanner parser = new Scanner(line);
144 String next = parser.next();
145 if ("HMM".equals(next)) // indicates start of HMM data (end of file
149 hmm.fillSymbols(parser);
150 numberOfSymbols = hmm.getNumberOfSymbols();
152 else if ("STATS".equals(next))
158 value = parser.next() + SPACE + SPACE + parser.next();
159 hmm.addFileProperty(key, value);
164 String value = parser.next();
165 while (parser.hasNext())
167 value = value + SPACE + parser.next();
169 hmm.addFileProperty(key, value);
173 line = input.readLine();
183 * Parses the model data from the HMMER3 file
186 * The buffered reader used to read the file.
187 * @throws IOException
189 void parseModel(BufferedReader input) throws IOException
191 for (int i = 0; i < hmm.getLength() + 1; i++)
193 hmm.getNodes().add(new HMMNode());
196 line = input.readLine();
197 Scanner matchReader = new Scanner(line);
198 next = matchReader.next();
199 if (next.equals(COMPO) || i > 0)
201 // stores match emission line in list
202 List<Double> matches = new ArrayList<>();
203 matches = fillList(matchReader, numberOfSymbols);
204 hmm.getNodes().get(i).setMatchEmissions(matches);
207 parseAnnotations(matchReader, i);
211 // stores insert emission line in list
212 line = input.readLine();
213 Scanner insertReader = new Scanner(line);
214 List<Double> inserts = new ArrayList<>();
215 inserts = fillList(insertReader, numberOfSymbols);
216 hmm.getNodes().get(i).setInsertEmissions(inserts);
217 insertReader.close();
219 // stores state transition line in list
220 line = input.readLine();
221 Scanner transitionReader = new Scanner(line);
222 List<Double> transitions = new ArrayList<>();
223 transitions = fillList(transitionReader, NUMBER_OF_TRANSITIONS);
224 hmm.getNodes().get(i).setStateTransitions(transitions);
225 transitionReader.close();
231 * Parses the annotations on the match emission line.
234 * The scanner which is processing match emission line.
236 * The index of node which is being scanned.
238 void parseAnnotations(Scanner scanner, int index)
240 if (hmm.mapIsActive())
243 column = scanner.nextInt();
244 hmm.getNodes().get(index).setAlignmentColumn(column);
245 hmm.getNodeLookup().put(column, index);
252 if (scanner.hasNext())
255 consensusR = charValue(scanner.next());
256 hmm.getNodes().get(index).setConsensusResidue(consensusR);
259 if (scanner.hasNext())
262 reference = charValue(scanner.next());
263 hmm.getNodes().get(index).setReferenceAnnotation(reference);
266 if (scanner.hasNext())
269 value = charValue(scanner.next());
270 hmm.getNodes().get(index).setMaskValue(value);
272 if (scanner.hasNext())
275 consensusS = charValue(scanner.next());
276 hmm.getNodes().get(index).setConsensusStructure(consensusS);
283 * Fills a list of doubles based on an input line.
286 * The scanner for the line containing the data to be transferred to
288 * @param numberOfElements
289 * The number of elements in the list to be filled.
290 * @return filled list Returns the list of doubles.
292 static List<Double> fillList(Scanner input,
293 int numberOfElements)
295 List<Double> list = new ArrayList<>();
296 for (int i = 0; i < numberOfElements; i++)
299 String next = input.next();
300 if (next.contains("*")) // state transitions to or from delete states
301 // occasionally have values of -infinity. These
302 // values are represented by an * in the .hmm
305 list.add(Double.NEGATIVE_INFINITY);
309 double prob = Double.valueOf(next);
310 prob = Math.pow(Math.E, -prob);
319 * Writes a HMM to a file.
321 * @param exportLocation
322 * Filename, URL or Pasted String to write to.
323 * @throws FileNotFoundException
324 * @throws UnsupportedEncodingException
328 public void exportFile(String exportLocation) throws IOException
330 StringBuilder file = new StringBuilder();
331 appendFileProperties(file);
335 PrintWriter output = new PrintWriter(exportLocation);
342 * Returns a string to be added to the StringBuilder containing the entire
345 * @param initialColumnSeparation
346 * The initial whitespace separation between the left side of the
347 * file and first character.
348 * @param columnSeparation
349 * The separation between subsequent data entries.
351 * The list of data to be added to the String.
354 String addData(int initialColumnSeparation,
355 int columnSeparation, List<String> data)
359 for (String value : data)
363 line += String.format("%" + initialColumnSeparation + "s", value);
367 line += String.format("%" + columnSeparation + "s", value);
375 * Converts list of characters into a list of Strings.
378 * @return Returns the list of Strings.
380 List<String> charListToStringList(List<Character> list)
382 List<String> strList = new ArrayList<>();
383 for (char value : list)
385 String strValue = Character.toString(value);
386 strList.add(strValue);
392 * Converts a list of doubles into a list of Strings, rounded to the nearest
396 * @param noOfDecimals
399 List<String> doubleListToStringList(List<Double> list)
401 List<String> strList = new ArrayList<>();
402 for (double value : list)
407 strValue = String.format("%.5f", value);
410 else if (value == -0.00000d)
412 strValue = "0.00000";
419 strList.add(strValue);
425 * Converts a primitive array of Strings to a list of Strings.
430 List<String> stringArrayToStringList(String[] array)
432 List<String> list = new ArrayList<>();
433 for (String value : array)
442 * Appends the hidden Markov model data to the StringBuilder containing the
446 * The StringBuilder containing the output.
448 void appendModel(StringBuilder file)
450 String symbolLine = "HMM";
451 List<Character> charSymbols = hmm.getSymbols();
452 List<String> strSymbols;
453 strSymbols = charListToStringList(charSymbols);
454 symbolLine += addData(11, 9, strSymbols);
455 file.append(symbolLine + NEW_LINE);
456 file.append(TRANSITIONTYPELINE + NEW_LINE);
458 int length = hmm.getLength();
460 for (int node = 0; node <= length; node++)
465 matchLine = String.format("%7s", "COMPO");
469 matchLine = String.format("%7s", node);
472 List<String> strMatches;
473 List<Double> doubleMatches;
474 doubleMatches = hmm.getNode(node).getMatchEmissions();
475 convertListToLogSpace(doubleMatches);
476 strMatches = doubleListToStringList(doubleMatches);
477 matchLine += addData(10, 9, strMatches);
482 matchLine += SPACE + hmm.getNodeAlignmentColumn(node);
483 matchLine += SPACE + hmm.getConsensusResidue(node);
484 matchLine += SPACE + hmm.getReferenceAnnotation(node);
485 matchLine += SPACE + hmm.getMaskedValue(node);
486 matchLine += SPACE + hmm.getConsensusStructure(node);
490 file.append(matchLine + NEW_LINE);
492 String insertLine = EMPTY;
493 List<String> strInserts;
494 List<Double> doubleInserts;
495 doubleInserts = hmm.getNode(node).getInsertEmissions();
496 convertListToLogSpace(doubleInserts);
497 strInserts = doubleListToStringList(doubleInserts);
498 insertLine += addData(17, 9, strInserts);
500 file.append(insertLine + NEW_LINE);
502 String transitionLine = EMPTY;
503 List<String> strTransitions;
504 List<Double> doubleTransitions;
505 doubleTransitions = hmm.getNode(node).getStateTransitions();
506 convertListToLogSpace(doubleTransitions);
507 strTransitions = doubleListToStringList(doubleTransitions);
508 transitionLine += addData(17, 9, strTransitions);
510 file.append(transitionLine + NEW_LINE);
515 * Appends the hidden Markov model file properties to the StringBuilder
516 * containing the output
519 * The StringBuilder containing the output.
521 void appendFileProperties(StringBuilder file)
525 file.append(fileHeader + NEW_LINE);
527 line = String.format("%-5s %1s", "NAME", hmm.getName());
528 file.append((line + NEW_LINE));
530 if (hmm.getAccessionNumber() != null)
532 line = String.format("%-5s %1s", "ACC", hmm.getAccessionNumber());
533 file.append((line + NEW_LINE));
536 if (hmm.getDescription() != null)
538 line = String.format("%-5s %1s", "DESC", hmm.getDescription());
539 file.append((line + NEW_LINE));
541 line = String.format("%-5s %1s", "LENG", hmm.getLength());
542 file.append((line + NEW_LINE));
544 if (hmm.getMaxInstanceLength() != null)
546 line = String.format("%-5s %1s", "MAXL", hmm.getMaxInstanceLength());
547 file.append((line + NEW_LINE));
549 line = String.format("%-5s %1s", "ALPH", hmm.getAlphabetType());
550 file.append((line + NEW_LINE));
555 status = hmm.referenceAnnotationIsActive();
556 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
557 line = String.format("%-5s %1s", "RF",
559 file.append((line + NEW_LINE));
561 status = hmm.maskValueIsActive();
562 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
563 line = String.format("%-5s %1s", "MM",
565 file.append((line + NEW_LINE));
567 status = hmm.consensusResidueIsActive();
568 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
569 line = String.format("%-5s %1s", "CONS",
571 file.append((line + NEW_LINE));
573 status = hmm.consensusStructureIsActive();
574 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
575 line = String.format("%-5s %1s", "CS",
577 file.append((line + NEW_LINE));
579 status = hmm.mapIsActive();
580 statusStr = HiddenMarkovModel.findStringFromBoolean(status);
581 line = String.format("%-5s %1s", "MAP",
583 file.append((line + NEW_LINE));
586 if (hmm.getDate() != null)
588 line = String.format("%-5s %1s", "DATE", hmm.getDate());
589 file.append((line + NEW_LINE));
591 if (hmm.getNumberOfSequences() != null)
593 line = String.format("%-5s %1s", "NSEQ", hmm.getNumberOfSequences());
594 file.append((line + NEW_LINE));
596 if (hmm.getEffectiveNumberOfSequences() != null)
598 line = String.format("%-5s %1s", "EFFN",
599 hmm.getEffectiveNumberOfSequences());
600 file.append((line + NEW_LINE));
602 if (hmm.getCheckSum() != null)
604 line = String.format("%-5s %1s", "CKSUM", hmm.getCheckSum());
605 file.append((line + NEW_LINE));
607 if (hmm.getGatheringThreshold() != null)
609 line = String.format("%-5s %1s", "GA", hmm.getGatheringThreshold());
610 file.append((line + NEW_LINE));
613 if (hmm.getTrustedCutoff() != null)
615 line = String.format("%-5s %1s", "TC", hmm.getTrustedCutoff());
616 file.append((line + NEW_LINE));
618 if (hmm.getNoiseCutoff() != null)
620 line = String.format("%-5s %1s", "NC", hmm.getNoiseCutoff());
621 file.append((line + NEW_LINE));
623 if (hmm.getMSV() != null)
625 line = String.format("%-19s %18s", "STATS LOCAL MSV", hmm.getMSV());
626 file.append((line + NEW_LINE));
628 line = String.format("%-19s %18s", "STATS LOCAL VITERBI",
630 file.append((line + NEW_LINE));
632 line = String.format("%-19s %18s", "STATS LOCAL FORWARD",
634 file.append((line + NEW_LINE));
640 * Returns the char value of a single lettered String.
645 char charValue(String string)
648 character = string.charAt(0);
654 public String print(SequenceI[] seqs, boolean jvsuffix)
661 * Converts the probabilities contained in a list into log space.
665 void convertListToLogSpace(List<Double> list)
668 for (int i = 0; i < list.size(); i++)
670 double prob = list.get(i);
671 double logProb = -1 * Math.log(prob);
673 list.set(i, logProb);