/************************************************************
 * HMMER - Biological sequence analysis with profile HMMs
 * Copyright (C) 1992-1999 Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ************************************************************/

/* alphabet.c
 * Configuration of the global symbol alphabet information.
 * RCS $Id: alphabet.c,v 1.1.1.1 2005/03/22 08:34:08 cmzmasek Exp $
 */

#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#ifdef HMMER_THREADS
#include <pthread.h>
#endif /* HMMER_THREADS */

#include "config.h"
#include "structs.h"
#include "funcs.h"
#include "squid.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

static void set_degenerate(char iupac, char *syms);


/* Function: DetermineAlphabet()
 * 
 * Purpose:  From a set of sequences (raw or aligned), make a good
 *           guess whether they're Nucleic, Amino, or something
 *           else, and set alphabet accordingly. 
 *           
 *           If Alphabet_type is already set, that means our
 *           autodetection was overridden from the command line, 
 *           and we just set the other globals accordingly.  
 */
void
DetermineAlphabet(char **rseqs, int  nseq)
{
  int idx;
  int other, nucleic, amino;
  int type;
  
  /* Autodetection of alphabet type.
   */
  type = hmmNOTSETYET;
  other = nucleic = amino = 0;
  for (idx = 0; idx < nseq; idx++) {
    switch (Seqtype(rseqs[idx])) {
    case kRNA:      nucleic++;   break;
    case kDNA:      nucleic++;   break;
    case kAmino:    amino++;     break;
    case kOtherSeq: other++;     break;
    default: Die("No such alphabet type");
    }
  }

  if      (nucleic == nseq) type = hmmNUCLEIC;
  else if (amino   == nseq) type = hmmAMINO;
  else if (nucleic > amino && nucleic > other) {
    Warn("Looks like nucleic acid sequence, hope that's right");
    type = hmmNUCLEIC;
  }
  else if (amino > nucleic && amino > other) {
    Warn("Looks like amino acid sequence, hope that's right");
    type = hmmAMINO;
  }
  else Die("Sorry, I can't tell if that's protein or DNA"); 

  /* Now set up the alphabet.
   */
  SetAlphabet(type);
}


/* Function: SetAlphabet()
 * 
 * Purpose:  Set the alphabet globals, given an alphabet type
 *           of either hmmAMINO or hmmNUCLEIC.
 */
void
SetAlphabet(int type)
{
  int x;
#ifdef HMMER_THREADS
  pthread_mutex_t  alphabet_lock; /* alphabet is global; must protect to be threadsafe */
  int              rtn;		  /* return code from pthreads */

  if ((rtn = pthread_mutex_init(&alphabet_lock, NULL)) != 0)
    Die("pthread_mutex_init FAILED; %s\n", strerror(rtn));
  if ((rtn = pthread_mutex_lock(&alphabet_lock)) != 0)
    Die("pthread_mutex_lock FAILED: %s\n", strerror(rtn));
#endif

 /* Because the alphabet information is global, we must
  * be careful to make this a thread-safe function. The mutex
  * (above) takes care of that. But, indeed, it's also
  * just good sense (and more efficient) to simply never
  * allow resetting the alphabet. If type is Alphabet_type,
  * silently return; else die with an alphabet mismatch
  * warning.
  */
  if (Alphabet_type != hmmNOTSETYET) 
    {
      if (type != Alphabet_type) 
	Die("An alphabet type conflict occurred.\nYou probably mixed a DNA seq file with a protein model, or vice versa.");
      
#ifdef HMMER_THREADS
      if ((rtn = pthread_mutex_unlock(&alphabet_lock)) != 0)
	Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
#endif
      return;
    }

  switch(type) { /* Alphabet is not a string - careful! */
  case hmmAMINO: 
    Alphabet_type     = type;
    strncpy(Alphabet, "ACDEFGHIKLMNPQRSTVWYBZX", 23);
    Alphabet_size     = 20; 
    Alphabet_iupac    = 23;
    for (x = 0; x < Alphabet_iupac; x++) {
      memset(Degenerate[x], 0, Alphabet_size);
    }
    for (x = 0; x < Alphabet_size; x++) {
      Degenerate[x][x] = 1;
      DegenCount[x] = 1;
    }
    set_degenerate('B', "ND");
    set_degenerate('Z', "QE");
    set_degenerate('X', "ACDEFGHIKLMNPQRSTVWY");
    break;
  case hmmNUCLEIC:
    Alphabet_type     = type;
    strncpy(Alphabet, "ACGTUNRYMKSWHBVDX", 17); 
    Alphabet_size     = 4; 
    Alphabet_iupac    = 17;
    for (x = 0; x < Alphabet_iupac; x++) {
      memset(Degenerate[x], 0, Alphabet_size);
    }
    for (x = 0; x < Alphabet_size; x++) {
      Degenerate[x][x] = 1;
      DegenCount[x] = 1;
    }
    set_degenerate('U', "T");
    set_degenerate('N', "ACGT");
    set_degenerate('X', "ACGT");
    set_degenerate('R', "AG");
    set_degenerate('Y', "CT");
    set_degenerate('M', "AC");
    set_degenerate('K', "GT");
    set_degenerate('S', "CG");
    set_degenerate('W', "AT");
    set_degenerate('H', "ACT");
    set_degenerate('B', "CGT");
    set_degenerate('V', "ACG");
    set_degenerate('D', "AGT");
    break;
  default: Die("No support for non-nucleic or protein alphabets");  
  }

#ifdef HMMER_THREADS
  if ((rtn = pthread_mutex_unlock(&alphabet_lock)) != 0)
    Die("pthread_mutex_unlock failure: %s\n", strerror(rtn));
#endif
}

/* Function: SymbolIndex()
 * 
 * Purpose:  Convert a symbol to its index in Alphabet[].
 *           Bogus characters are converted to 'X'.
 *           More robust than the SYMIDX() macro but
 *           presumably slower.
 */ 
int
SymbolIndex(char sym)
{
  char *s;
  return ((s = strchr(Alphabet, (char) toupper((int) sym))) == NULL) ?
	  Alphabet_iupac-1 : s - Alphabet;
} 


/* Function: DigitizeSequence()
 * 
 * Purpose:  Internal representation of a sequence in HMMER is
 *           as a char array. 1..L are the indices
 *           of seq symbols in Alphabet[]. 0,L+1 are sentinel
 *           bytes, set to be Alphabet_iupac -- i.e. one more
 *           than the maximum allowed index.  
 *           
 *           Assumes that 'X', the fully degenerate character,
 *           is the last character in the allowed alphabet.
 *           
 * Args:     seq - sequence to be digitized (0..L-1)
 *           L   - length of sequence      
 *           
 * Return:   digitized sequence, dsq.
 *           dsq is allocated here and must be free'd by caller.
 */
char *
DigitizeSequence(char *seq, int L)
{
  char *dsq;
  int i;

  dsq = MallocOrDie (sizeof(char) * (L+2));
  dsq[0] = dsq[L+1] = (char) Alphabet_iupac;
  for (i = 1; i <= L; i++) 
    dsq[i] = SymbolIndex(seq[i-1]);
  return dsq;
}


/* Function: DedigitizeSequence()
 * Date:     SRE, Tue Dec 16 10:39:19 1997 [StL]
 * 
 * Purpose:  Returns a 0..L-1 character string, converting the
 *           dsq back to the real alphabet.
 */
char *
DedigitizeSequence(char *dsq, int L)
{
  char *seq;
  int i;

  seq = MallocOrDie(sizeof(char) * (L+1));
  for (i = 0; i < L; i++)
    seq[i] = Alphabet[(int) dsq[i+1]];
  seq[L] = '\0';
  return seq;
}


/* Function: DigitizeAlignment() 
 * 
 * Purpose:  Given an alignment, return digitized unaligned
 *           sequence array. (Tracebacks are always relative
 *           to digitized unaligned seqs, even if they are
 *           faked from an existing alignment in modelmakers.c.)
 *           
 * Args:     msa      - alignment to digitize
 *           ret_dsqs - RETURN: array of digitized unaligned sequences
 *           
 * Return:   (void)
 *           dsqs is alloced here. Free2DArray(dseqs, nseq).
 */ 
void
DigitizeAlignment(MSA *msa, char ***ret_dsqs)
{
  char **dsq;
  int    idx;			/* counter for sequences     */
  int    dpos;			/* position in digitized seq */
  int    apos;			/* position in aligned seq   */

  dsq = (char **) MallocOrDie (sizeof(char *) * msa->nseq);
  for (idx = 0; idx < msa->nseq; idx++) {
    dsq[idx] = (char *) MallocOrDie (sizeof(char) * (msa->alen+2));

    dsq[idx][0] = (char) Alphabet_iupac; /* sentinel byte at start */

    for (apos = 0, dpos = 1; apos < msa->alen; apos++) {
      if (! isgap(msa->aseq[idx][apos]))  /* skip gaps */
	dsq[idx][dpos++] = SymbolIndex(msa->aseq[idx][apos]);
    }
    dsq[idx][dpos] = (char) Alphabet_iupac; /* sentinel byte at end */
  }
  *ret_dsqs = dsq;
}


/* Function: P7CountSymbol()
 * 
 * Purpose:  Given a possibly degenerate symbol code, increment
 *           a symbol counter array (generally an emission
 *           probability vector in counts form) appropriately.
 *           
 * Args:     counters:  vector to count into. [0..Alphabet_size-1]
 *           symidx:    symbol index to count: [0..Alphabet_iupac-1]
 *           wt:        weight to use for the count; often 1.0
 *           
 * Return:   (void)                
 */
void
P7CountSymbol(float *counters, char symidx, float wt)
{
  int x;

  if (symidx < Alphabet_size) 
    counters[(int) symidx] += wt;
  else
    for (x = 0; x < Alphabet_size; x++) {
      if (Degenerate[(int) symidx][x])
	counters[x] += wt / (float) DegenCount[(int) symidx];
    }
}


/* Function: DefaultGeneticCode()
 * 
 * Purpose:  Configure aacode, mapping triplets to amino acids.
 *           Triplet index: AAA = 0, AAC = 1, ... UUU = 63.
 *           AA index: alphabetical: A=0,C=1... Y=19
 *           Stop codon: -1. 
 *           Uses the stdcode1[] global translation table from SQUID.
 *           
 * Args:     aacode  - preallocated 0.63 array for genetic code
 *                     
 * Return:   (void)
 */
void
DefaultGeneticCode(int *aacode)
{
  int x;

  for (x = 0; x < 64; x++) {
    if (*(stdcode1[x]) == '*') aacode[x] = -1;
    else                       aacode[x] = SYMIDX(*(stdcode1[x]));
  }
}


/* Function: DefaultCodonBias()
 * 
 * Purpose:  Configure a codonbias table, mapping triplets to
 *           probability of using the triplet for the amino acid
 *           it represents: P(triplet | aa).
 *           The default is to assume codons are used equiprobably.
 *           
 * Args:     codebias:  0..63 array of P(triplet|aa), preallocated.
 * 
 * Return:   (void)
 */
void
DefaultCodonBias(float *codebias)
{
  codebias[0]  = 1./2.;	/* AAA Lys 2 */
  codebias[1]  = 1./2.;	/* AAC Asn 2 */
  codebias[2]  = 1./2.;	/* AAG Lys 2 */
  codebias[3]  = 1./2.;	/* AAU Asn 2 */
  codebias[4]  = 1./4.;	/* ACA Thr 4 */
  codebias[5]  = 1./4.;	/* ACC Thr 4 */
  codebias[6]  = 1./4.;	/* ACG Thr 4 */
  codebias[7]  = 1./4.;	/* ACU Thr 4 */
  codebias[8]  = 1./6.;	/* AGA Ser 6 */
  codebias[9]  = 1./6.;	/* AGC Arg 6 */
  codebias[10] = 1./6.;	/* AGG Ser 6 */
  codebias[11] = 1./6.;	/* AGU Arg 6 */
  codebias[12] = 1./3.;	/* AUA Ile 3 */
  codebias[13] = 1./3.;	/* AUC Ile 3 */
  codebias[14] = 1.;	/* AUG Met 1 */
  codebias[15] = 1./3.;	/* AUU Ile 3 */
  codebias[16] = 1./2.;	/* CAA Gln 2 */
  codebias[17] = 1./2.;	/* CAC His 2 */
  codebias[18] = 1./2.;	/* CAG Gln 2 */
  codebias[19] = 1./2.;	/* CAU His 2 */
  codebias[20] = 1./4.;	/* CCA Pro 4 */
  codebias[21] = 1./4.;	/* CCC Pro 4 */
  codebias[22] = 1./4.;	/* CCG Pro 4 */
  codebias[23] = 1./4.;	/* CCU Pro 4 */
  codebias[24] = 1./6.;	/* CGA Arg 6 */
  codebias[25] = 1./6.;	/* CGC Arg 6 */
  codebias[26] = 1./6.;	/* CGG Arg 6 */
  codebias[27] = 1./6.;	/* CGU Arg 6 */
  codebias[28] = 1./6.;	/* CUA Leu 6 */
  codebias[29] = 1./6.;	/* CUC Leu 6 */
  codebias[30] = 1./6.;	/* CUG Leu 6 */
  codebias[31] = 1./6.;	/* CUU Leu 6 */
  codebias[32] = 1./2.;	/* GAA Glu 2 */
  codebias[33] = 1./2.;	/* GAC Asp 2 */
  codebias[34] = 1./2.;	/* GAG Glu 2 */
  codebias[35] = 1./2.;	/* GAU Asp 2 */
  codebias[36] = 1./4.;	/* GCA Ala 4 */
  codebias[37] = 1./4.;	/* GCC Ala 4 */
  codebias[38] = 1./4.;	/* GCG Ala 4 */
  codebias[39] = 1./4.;	/* GCU Ala 4 */
  codebias[40] = 1./4.;	/* GGA Gly 4 */
  codebias[41] = 1./4.;	/* GGC Gly 4 */
  codebias[42] = 1./4.;	/* GGG Gly 4 */
  codebias[43] = 1./4.;	/* GGU Gly 4 */
  codebias[44] = 1./4.;	/* GUA Val 4 */
  codebias[45] = 1./4.;	/* GUC Val 4 */
  codebias[46] = 1./4.;	/* GUG Val 4 */
  codebias[47] = 1./4.;	/* GUU Val 4 */
  codebias[48] = 0.;	/* UAA och - */
  codebias[49] = 1./2.;	/* UAC Tyr 2 */
  codebias[50] = 0.;	/* UAG amb - */
  codebias[51] = 1./2.;	/* UAU Tyr 2 */
  codebias[52] = 1./6.;	/* UCA Ser 6 */
  codebias[53] = 1./6.;	/* UCC Ser 6 */
  codebias[54] = 1./6.;	/* UCG Ser 6 */
  codebias[55] = 1./6.;	/* UCU Ser 6 */
  codebias[56] = 0.;	/* UGA opa - */
  codebias[57] = 1./2.;	/* UGC Cys 2 */
  codebias[58] = 1.;	/* UGG Trp 1 */
  codebias[59] = 1./2.;	/* UGU Cys 2 */
  codebias[60] = 1./6.;	/* UUA Leu 6 */
  codebias[61] = 1./2.;	/* UUC Phe 2 */
  codebias[62] = 1./6.; /* UUG Leu 6 */
  codebias[63] = 1./2.;	/* UUU Phe 2 */
}


/* Function: set_degenerate()
 * 
 * Purpose:  convenience function for setting up 
 *           Degenerate[][] global for the alphabet.
 */
static void 
set_degenerate(char iupac, char *syms)
{
  DegenCount[strchr(Alphabet,iupac)-Alphabet] = strlen(syms);
  while (*syms) {
    Degenerate[strchr(Alphabet,iupac)-Alphabet]
              [strchr(Alphabet,*syms)-Alphabet] = 1;
    syms++;
  }
}