src/jalview/datamodel/HiddenMarkovModel.java

   1 package jalview.datamodel;
   2
   3 import jalview.io.HMMFile;
   4 import jalview.schemes.ResidueProperties;
   5 import jalview.util.Comparison;
   6 import jalview.util.MapList;
   7
   8 import java.util.ArrayList;
   9 import java.util.Arrays;
  10 import java.util.HashMap;
  11 import java.util.List;
  12 import java.util.Map;
  13
  14 /**
  15  * Data structure which stores a hidden Markov model
  16  *
  17  * @author TZVanaalten
  18  *
  19  */
  20 public class HiddenMarkovModel
  21 {
  22   private static final char GAP_DASH = '-';
  23
  24   public final static String YES = "yes";
  25
  26   public final static String NO = "no";
  27
  28   public static final int MATCHTOMATCH = 0;
  29
  30   public static final int MATCHTOINSERT = 1;
  31
  32   public static final int MATCHTODELETE = 2;
  33
  34   public static final int INSERTTOMATCH = 3;
  35
  36   public static final int INSERTTOINSERT = 4;
  37
  38   public static final int DELETETOMATCH = 5;
  39
  40   public static final int DELETETODELETE = 6;
  41
  42   private static final double LOG2 = Math.log(2);
  43
  44   /*
  45    * properties read from HMM file header lines
  46    */
  47   private Map<String, String> fileProperties = new HashMap<>();
  48
  49   private String fileHeader;
  50
  51   /*
  52    * the symbols used in this model e.g. "ACGT"
  53    */
  54   private String alphabet;
  55
  56   /*
  57    * symbol lookup index into the alphabet for 'A' to 'Z'
  58    */
  59   private int[] symbolIndexLookup = new int['Z' - 'A' + 1];
  60
  61   /*
  62    * Nodes in the model. The begin node is at index 0, and contains
  63    * average emission probabilities for each symbol.
  64    */
  65   private List<HMMNode> nodes = new ArrayList<>();
  66
  67   /*
  68    * the aligned HMM consensus sequence extracted from the HMM profile
  69    */
  70   private SequenceI hmmSeq;
  71
  72   /*
  73    * mapping from HMM nodes to residues of the hmm consensus sequence
  74    */
  75   private Mapping mapToHmmConsensus;
  76
  77   // stores background frequencies of alignment from which this model came
  78   private Map<Character, Float> backgroundFrequencies;
  79
  80   /**
  81    * Constructor
  82    */
  83   public HiddenMarkovModel()
  84   {
  85   }
  86
  87   /**
  88    * Copy constructor given a new aligned sequence with which to associate the
  89    * HMM profile
  90    *
  91    * @param hmm
  92    * @param sq
  93    */
  94   public HiddenMarkovModel(HiddenMarkovModel hmm, SequenceI sq)
  95   {
  96     super();
  97     this.fileProperties = new HashMap<>(hmm.fileProperties);
  98     this.alphabet = hmm.alphabet;
  99     this.nodes = new ArrayList<>(hmm.nodes);
 100     this.symbolIndexLookup = hmm.symbolIndexLookup;
 101     this.fileHeader = new String(hmm.fileHeader);
 102     this.hmmSeq = sq;
 103     this.backgroundFrequencies = hmm.getBackgroundFrequencies();
 104     if (sq.getDatasetSequence() == hmm.mapToHmmConsensus.getTo())
 105     {
 106       // same dataset sequence e.g. after realigning search results
 107       this.mapToHmmConsensus = hmm.mapToHmmConsensus;
 108     }
 109     else
 110     {
 111       // different dataset sequence e.g. after loading HMM from project
 112       this.mapToHmmConsensus = new Mapping(sq.getDatasetSequence(),
 113               hmm.mapToHmmConsensus.getMap());
 114     }
 115   }
 116
 117   /**
 118    * Returns the information content at a specified column, calculated as the
 119    * sum (over possible symbols) of the log ratio
 120    *
 121    * <pre>
 122    *  log(emission probability / background probability) / log(2)
 123    * </pre>
 124    *
 125    * @param column
 126    *          column position (base 0)
 127    * @return
 128    */
 129   public float getInformationContent(int column)
 130   {
 131     float informationContent = 0f;
 132
 133     for (char symbol : getSymbols().toCharArray())
 134     {
 135       float freq = ResidueProperties.backgroundFrequencies
 136               .get(getAlphabetType()).get(symbol);
 137       float prob = (float) getMatchEmissionProbability(column, symbol);
 138       informationContent += prob * Math.log(prob / freq);
 139     }
 140
 141     informationContent = informationContent / (float) LOG2;
 142
 143     return informationContent;
 144   }
 145
 146   /**
 147    * Gets the file header of the .hmm file this model came from
 148    *
 149    * @return
 150    */
 151   public String getFileHeader()
 152   {
 153     return fileHeader;
 154   }
 155
 156   /**
 157    * Sets the file header of this model.
 158    *
 159    * @param header
 160    */
 161   public void setFileHeader(String header)
 162   {
 163     fileHeader = header;
 164   }
 165
 166   /**
 167    * Returns the symbols used in this hidden Markov model
 168    *
 169    * @return
 170    */
 171   public String getSymbols()
 172   {
 173     return alphabet;
 174   }
 175
 176   /**
 177    * Gets the node in the hidden Markov model at the specified position.
 178    *
 179    * @param nodeIndex
 180    *          The index of the node requested. Node 0 optionally contains the
 181    *          average match emission probabilities across the entire model, and
 182    *          always contains the insert emission probabilities and state
 183    *          transition probabilities for the begin node. Node 1 contains the
 184    *          first node in the HMM that can correspond to a column in the
 185    *          alignment.
 186    * @return
 187    */
 188   public HMMNode getNode(int nodeIndex)
 189   {
 190     return nodes.get(nodeIndex);
 191   }
 192
 193   /**
 194    * Returns the name of the sequence alignment on which the HMM is based.
 195    *
 196    * @return
 197    */
 198   public String getName()
 199   {
 200     return fileProperties.get(HMMFile.NAME);
 201   }
 202
 203   /**
 204    * Answers the string value of the property (parsed from an HMM file) for the
 205    * given key, or null if the property is not present
 206    *
 207    * @param key
 208    * @return
 209    */
 210   public String getProperty(String key)
 211   {
 212     return fileProperties.get(key);
 213   }
 214
 215   /**
 216    * Answers true if the property with the given key is present with a value of
 217    * "yes" (not case-sensitive), else false
 218    *
 219    * @param key
 220    * @return
 221    */
 222   public boolean getBooleanProperty(String key)
 223   {
 224     return YES.equalsIgnoreCase(fileProperties.get(key));
 225   }
 226
 227   /**
 228    * Returns the length of the hidden Markov model. The value returned is the
 229    * LENG property if specified, else the number of nodes, excluding the begin
 230    * node (which should be the same thing).
 231    *
 232    * @return
 233    */
 234   public int getLength()
 235   {
 236     if (fileProperties.get(HMMFile.LENGTH) == null)
 237     {
 238       return nodes.size() - 1; // not counting BEGIN node
 239     }
 240     return Integer.parseInt(fileProperties.get(HMMFile.LENGTH));
 241   }
 242
 243   /**
 244    * Returns the value of mandatory property "ALPH" - "amino", "DNA", "RNA" are
 245    * the options. Other alphabets may be added.
 246    *
 247    * @return
 248    */
 249   public String getAlphabetType()
 250   {
 251     return fileProperties.get(HMMFile.ALPHABET);
 252   }
 253
 254   /**
 255    * Sets the model alphabet to the symbols in the given string (ignoring any
 256    * whitespace), and returns the number of symbols
 257    *
 258    * @param symbols
 259    */
 260   public int setAlphabet(String symbols)
 261   {
 262     String trimmed = symbols.toUpperCase().replaceAll("\\s", "");
 263     int count = trimmed.length();
 264     alphabet = trimmed;
 265     symbolIndexLookup = new int['Z' - 'A' + 1];
 266     Arrays.fill(symbolIndexLookup, -1);
 267     int ignored = 0;
 268
 269     /*
 270      * save the symbols in order, and a quick lookup of symbol position
 271      */
 272     for (short i = 0; i < count; i++)
 273     {
 274       char symbol = trimmed.charAt(i);
 275       if (symbol >= 'A' && symbol <= 'Z'
 276               && symbolIndexLookup[symbol - 'A'] == -1)
 277       {
 278         symbolIndexLookup[symbol - 'A'] = i;
 279       }
 280       else
 281       {
 282         System.err
 283                 .println(
 284                         "Unexpected or duplicated character in HMM ALPHabet: "
 285                                 + symbol);
 286         ignored++;
 287       }
 288     }
 289     return count - ignored;
 290   }
 291
 292   /**
 293    * Answers the node of the model corresponding to an aligned column position
 294    * (0...), or null if there is no such node
 295    *
 296    * @param column
 297    * @return
 298    */
 299   HMMNode getNodeForColumn(int column)
 300   {
 301     /*
 302      * if the hmm consensus is gapped at the column,
 303      * there is no corresponding node
 304      */
 305     if (Comparison.isGap(hmmSeq.getCharAt(column)))
 306     {
 307       return null;
 308     }
 309
 310     /*
 311      * find the node (if any) that is mapped to the
 312      * consensus sequence residue position at the column
 313      */
 314     int seqPos = hmmSeq.findPosition(column);
 315     int[] nodeNo = mapToHmmConsensus.getMap().locateInFrom(seqPos, seqPos);
 316     if (nodeNo != null)
 317     {
 318       return getNode(nodeNo[0]);
 319     }
 320     return null;
 321   }
 322
 323   /**
 324    * Gets the match emission probability for a given symbol at a column in the
 325    * alignment.
 326    *
 327    * @param alignColumn
 328    *          The index of the alignment column, starting at index 0. Index 0
 329    *          usually corresponds to index 1 in the HMM.
 330    * @param symbol
 331    *          The symbol for which the desired probability is being requested.
 332    * @return
 333    *
 334    */
 335   public double getMatchEmissionProbability(int alignColumn, char symbol)
 336   {
 337     HMMNode node = getNodeForColumn(alignColumn);
 338     int symbolIndex = getSymbolIndex(symbol);
 339     if (node != null && symbolIndex != -1)
 340     {
 341       return node.getMatchEmission(symbolIndex);
 342     }
 343     return 0D;
 344   }
 345
 346   /**
 347    * Gets the insert emission probability for a given symbol at a column in the
 348    * alignment.
 349    *
 350    * @param alignColumn
 351    *          The index of the alignment column, starting at index 0. Index 0
 352    *          usually corresponds to index 1 in the HMM.
 353    * @param symbol
 354    *          The symbol for which the desired probability is being requested.
 355    * @return
 356    *
 357    */
 358   public double getInsertEmissionProbability(int alignColumn, char symbol)
 359   {
 360     HMMNode node = getNodeForColumn(alignColumn);
 361     int symbolIndex = getSymbolIndex(symbol);
 362     if (node != null && symbolIndex != -1)
 363     {
 364       return node.getInsertEmission(symbolIndex);
 365     }
 366     return 0D;
 367   }
 368
 369   /**
 370    * Gets the state transition probability for a given symbol at a column in the
 371    * alignment.
 372    *
 373    * @param alignColumn
 374    *          The index of the alignment column, starting at index 0. Index 0
 375    *          usually corresponds to index 1 in the HMM.
 376    * @param symbol
 377    *          The symbol for which the desired probability is being requested.
 378    * @return
 379    *
 380    */
 381   public double getStateTransitionProbability(int alignColumn,
 382           int transition)
 383   {
 384     HMMNode node = getNodeForColumn(alignColumn);
 385     if (node != null)
 386     {
 387       return node.getStateTransition(transition);
 388     }
 389     return 0D;
 390   }
 391
 392   /**
 393    * Returns the sequence position linked to the node at the given index. This
 394    * corresponds to an aligned column position (counting from 1).
 395    *
 396    * @param nodeIndex
 397    *          The index of the node, starting from index 1. Index 0 is the begin
 398    *          node, which does not correspond to a column in the alignment.
 399    * @return
 400    */
 401   public int getNodeMapPosition(int nodeIndex)
 402   {
 403     return nodes.get(nodeIndex).getResidueNumber();
 404   }
 405
 406   /**
 407    * Returns the consensus residue at the specified node.
 408    *
 409    * @param nodeIndex
 410    *          The index of the specified node.
 411    * @return
 412    */
 413   public char getConsensusResidue(int nodeIndex)
 414   {
 415    char value = nodes.get(nodeIndex).getConsensusResidue();
 416    return value;
 417   }
 418
 419   /**
 420    * Returns the reference annotation at the specified node.
 421    *
 422    * @param nodeIndex
 423    *          The index of the specified node.
 424    * @return
 425    */
 426   public char getReferenceAnnotation(int nodeIndex)
 427   {
 428    char value = nodes.get(nodeIndex).getReferenceAnnotation();
 429    return value;
 430   }
 431
 432   /**
 433    * Returns the mask value at the specified node.
 434    *
 435    * @param nodeIndex
 436    *          The index of the specified node.
 437    * @return
 438    */
 439   public char getMaskedValue(int nodeIndex)
 440   {
 441    char value = nodes.get(nodeIndex).getMaskValue();
 442    return value;
 443   }
 444
 445   /**
 446    * Returns the consensus structure at the specified node.
 447    *
 448    * @param nodeIndex
 449    *          The index of the specified node.
 450    * @return
 451    */
 452   public char getConsensusStructure(int nodeIndex)
 453   {
 454    char value = nodes.get(nodeIndex).getConsensusStructure();
 455    return value;
 456   }
 457
 458   /**
 459    * Sets a property read from an HMM file
 460    *
 461    * @param key
 462    * @param value
 463    */
 464   public void setProperty(String key, String value)
 465   {
 466     fileProperties.put(key, value);
 467   }
 468
 469   /**
 470    * Temporary implementation, should not be used.
 471    *
 472    * @return
 473    */
 474   public String getViterbi()
 475   {
 476     String value;
 477     value = fileProperties.get(HMMFile.VITERBI);
 478     return value;
 479   }
 480
 481   /**
 482    * Temporary implementation, should not be used.
 483    *
 484    * @return
 485    */
 486   public String getMSV()
 487   {
 488     String value;
 489     value = fileProperties.get(HMMFile.MSV);
 490     return value;
 491   }
 492
 493   /**
 494    * Temporary implementation, should not be used.
 495    *
 496    * @return
 497    */
 498   public String getForward()
 499   {
 500     String value;
 501     value = fileProperties.get(HMMFile.FORWARD);
 502     return value;
 503   }
 504
 505   /**
 506    * Constructs the consensus sequence based on the most probable symbol at each
 507    * position. Gap characters are inserted for discontinuities in the node map
 508    * numbering (if provided), else an ungapped sequence is generated.
 509    * <p>
 510    * A mapping between the HMM nodes and residue positions of the sequence is
 511    * also built and saved.
 512    *
 513    * @return
 514    */
 515   void buildConsensusSequence()
 516   {
 517     List<int[]> toResidues = new ArrayList<>();
 518
 519     /*
 520      * if the HMM provided a map to sequence, use those start/end values,
 521      * else just treat it as for a contiguous sequence numbered from 1
 522      */
 523     boolean hasMap = getBooleanProperty(HMMFile.MAP);
 524     int start = hasMap ? getNode(1).getResidueNumber() : 1;
 525     int endResNo = hasMap ? getNode(nodes.size() - 1).getResidueNumber()
 526             : (start + getLength() - 1);
 527     char[] sequence = new char[endResNo];
 528
 529     int lastResNo = start - 1;
 530     int seqOffset = -1;
 531     int gapCount = 0;
 532
 533
 534     for (int seqN = 0; seqN < start; seqN++)
 535     {
 536       sequence[seqN] = GAP_DASH;
 537       seqOffset++;
 538     }
 539
 540     for (int nodeNo = 1; nodeNo < nodes.size(); nodeNo++)
 541     {
 542       HMMNode node = nodes.get(nodeNo);
 543       final int resNo = hasMap ? node.getResidueNumber() : lastResNo + 1;
 544
 545       /*
 546        * insert gaps if map numbering is not continuous
 547        */
 548       while (resNo > lastResNo + 1)
 549       {
 550         sequence[seqOffset++] = GAP_DASH;
 551         lastResNo++;
 552         gapCount++;
 553       }
 554       char consensusResidue = node.getConsensusResidue();
 555       if (GAP_DASH == consensusResidue)
 556       {
 557         /*
 558          * no residue annotation in HMM - scan for the symbol
 559          * with the highest match emission probability
 560          */
 561         int symbolIndex = node.getMaxMatchEmissionIndex();
 562         consensusResidue = alphabet.charAt(symbolIndex);
 563         if (node.getMatchEmission(symbolIndex) < 0.5D)
 564         {
 565           // follow convention of lower case if match emission prob < 0.5
 566           consensusResidue = Character.toLowerCase(consensusResidue);
 567         }
 568       }
 569       sequence[seqOffset++] = consensusResidue;
 570       lastResNo = resNo;
 571     }
 572
 573     Sequence seq = new Sequence(getName(), sequence, start,
 574             lastResNo - gapCount);
 575     seq.createDatasetSequence();
 576     seq.setHMM(this);
 577     this.hmmSeq = seq;
 578
 579     /*
 580      * construct and store Mapping of nodes to residues
 581      * note as constructed this is just an identity mapping,
 582      * but it allows for greater flexibility in future
 583      */
 584     List<int[]> fromNodes = new ArrayList<>();
 585     fromNodes.add(new int[] { 1, getLength() });
 586     toResidues.add(new int[] { seq.getStart(), seq.getEnd() });
 587     MapList mapList = new MapList(fromNodes, toResidues, 1, 1);
 588     mapToHmmConsensus = new Mapping(seq.getDatasetSequence(), mapList);
 589   }
 590
 591
 592   /**
 593    * Answers the aligned consensus sequence for the profile. Note this will
 594    * return null if called before <code>setNodes</code> has been called.
 595    *
 596    * @return
 597    */
 598   public SequenceI getConsensusSequence()
 599   {
 600     return hmmSeq;
 601   }
 602
 603   /**
 604    * Answers the index position (0...) of the given symbol, or -1 if not a valid
 605    * symbol for this HMM
 606    *
 607    * @param symbol
 608    * @return
 609    */
 610   private int getSymbolIndex(char symbol)
 611   {
 612     /*
 613      * symbolIndexLookup holds the index for 'A' to 'Z'
 614      */
 615     char c = Character.toUpperCase(symbol);
 616     if ('A' <= c && c <= 'Z')
 617     {
 618       return symbolIndexLookup[c - 'A'];
 619     }
 620     return -1;
 621   }
 622
 623   /**
 624    * Sets the nodes of this HMM, and also extracts the HMM consensus sequence
 625    * and a mapping between node numbers and sequence positions
 626    *
 627    * @param nodeList
 628    */
 629   public void setNodes(List<HMMNode> nodeList)
 630   {
 631     nodes = nodeList;
 632     if (nodes.size() > 1)
 633     {
 634       buildConsensusSequence();
 635     }
 636   }
 637
 638   /**
 639    * Sets the aligned consensus sequence this HMM is the model for
 640    *
 641    * @param hmmSeq
 642    */
 643   public void setHmmSeq(SequenceI hmmSeq)
 644   {
 645     this.hmmSeq = hmmSeq;
 646   }
 647
 648   public void setBackgroundFrequencies(Map<Character, Float> bkgdFreqs)
 649   {
 650     backgroundFrequencies = bkgdFreqs;
 651   }
 652
 653   public void setBackgroundFrequencies(ResidueCount bkgdFreqs)
 654   {
 655     backgroundFrequencies = new HashMap<>();
 656
 657     int total = bkgdFreqs.getTotalResidueCount();
 658
 659     for (char c : bkgdFreqs.getSymbolCounts().symbols)
 660     {
 661       backgroundFrequencies.put(c, bkgdFreqs.getCount(c) * 1f / total);
 662     }
 663
 664   }
 665
 666   public Map<Character, Float> getBackgroundFrequencies()
 667   {
 668     return backgroundFrequencies;
 669   }
 670
 671 }
 672