--- /dev/null
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+
+/*********************************************************************
+ * Clustal Omega - Multiple sequence alignment
+ *
+ * Copyright (C) 2010 University College Dublin
+ *
+ * Clustal-Omega is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This file is part of Clustal-Omega.
+ *
+ ********************************************************************/
+
+/*
+ * RCS $Id: seq.c 245 2011-06-15 12:38:53Z fabian $
+ *
+ *
+ * Module for sequence/alignment IO and misc.
+ *
+ * This depends heavily on Sean Eddy's squid library, which is obsoleted by
+ * HMMER3's Easel. However, easel doesn't support that many non-aligned input
+ * formats.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include "squid/squid.h"
+#include <ctype.h>
+
+#include "util.h"
+#include "log.h"
+#include "seq.h"
+
+
+#define ALLOW_ONLY_PROTEIN 0 /* requested by Hamish, FS, r244 -> r245 */
+
+
+
+
+/**
+ * @brief Stripped down version of squid's alistat
+ *
+ *
+ * @param[in] prMSeq
+ * The alignment to analyse
+ * @param[in] bSampling
+ * For many sequences: samples from pool
+ * @param[in] bReportAll
+ * Report identities for all sequence pairs
+ *
+ * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive
+ */
+void
+AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {
+
+ /*
+ * bSampling = squid's do_fast
+ * bReportAll = squid's allreport
+ */
+ float **ppdIdentMx; /* identity matrix (squid: imx) */
+ const int iNumSample = 1000; /* sample size (squid: nsample) */
+ int iAux;
+
+ MSA *msa; /* squid's alignment structure */
+ int small, large;
+ int bestj, worstj;
+ float sum;
+ float worst_worst, worst_best, best_best;
+ float avgid;
+ int i, j;
+ int nres; /* number of residues */
+
+ if (bSampling && bReportAll) {
+ Log(&rLog, LOG_WARN,
+ "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
+ return;
+ }
+ if (FALSE == prMSeq->aligned) {
+ Log(&rLog, LOG_WARN,
+ "Sequences are not aligned. Skipping %s()", __FUNCTION__);
+ return;
+ }
+
+ /* silence gcc warnings about uninitialized variables
+ */
+ worst_worst = worst_best = best_best = 0.0;
+ bestj = worstj = -1;
+
+
+ /** mseq to squid msa
+ *
+ * FIXME code overlap with WriteAlignment. Make it a function and take
+ * code there (contains more comments) as template
+ *
+ */
+ msa = MSAAlloc(prMSeq->nseqs,
+ /* derive alignment length from first seq */
+ strlen(prMSeq->seq[0]));
+ for (i=0; i<prMSeq->nseqs; i++) {
+ int key; /* MSA struct internal index for sequence */
+ char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
+ char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
+ SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence name */
+
+ key = GKIStoreKey(msa->index, this_name);
+ msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
+ /* setting msa->sqlen[idx] and msa->aseq[idx] */
+ msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
+ this_seq, strlen(this_seq));
+ if (this_sqinfo->flags & SQINFO_DESC) {
+ MSASetSeqDescription(msa, key, this_sqinfo->desc);
+ }
+ msa->nseq++;
+ }
+
+
+
+ nres = 0;
+ small = large = -1;
+ for (i = 0; i < msa->nseq; i++) {
+ int rlen; /* raw sequence length */
+ rlen = DealignedLength(msa->aseq[i]);
+ nres += rlen;
+ if (small == -1 || rlen < small) small = rlen;
+ if (large == -1 || rlen > large) large = rlen;
+ }
+
+
+ if (bSampling) {
+ avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
+ msa->nseq, iNumSample);
+
+ } else {
+ float best, worst;
+
+ /* this might be slow...could use openmp inside squid */
+ MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
+ if (bReportAll) {
+ printf(" %-15s %5s %7s %-15s %7s %-15s\n",
+ "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
+ printf(" --------------- ----- ------- --------------- ------- ---------------\n");
+ }
+
+ sum = 0.0;
+ worst_best = 1.0;
+ best_best = 0.0;
+ worst_worst = 1.0;
+ for (i = 0; i < msa->nseq; i++) {
+ worst = 1.0;
+ best = 0.0;
+ for (j = 0; j < msa->nseq; j++) {
+ /* closest seq to this one = best */
+ if (i != j && ppdIdentMx[i][j] > best) {
+ best = ppdIdentMx[i][j]; bestj = j;
+ }
+ if (ppdIdentMx[i][j] < worst) {
+ worst = ppdIdentMx[i][j]; worstj = j;
+ }
+ }
+
+ if (bReportAll) {
+ printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
+ msa->sqname[i], DealignedLength(msa->aseq[i]),
+ best * 100., msa->sqname[bestj],
+ worst * 100., msa->sqname[worstj]);
+ }
+ if (best > best_best) best_best = best;
+ if (best < worst_best) worst_best = best;
+ if (worst < worst_worst) worst_worst = worst;
+ for (j = 0; j < i; j++)
+ sum += ppdIdentMx[i][j];
+ }
+ avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0);
+ if (bReportAll)
+ puts("");
+ FMX2Free(ppdIdentMx);
+ } /* else bSampling */
+
+
+
+ /* Print output
+ */
+ if (msa->name != NULL)
+ printf("Alignment name: %s\n", msa->name);
+ /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/
+ printf("Number of sequences: %d\n", msa->nseq);
+ printf("Total # residues: %d\n", nres);
+ printf("Smallest: %d\n", small);
+ printf("Largest: %d\n", large);
+ printf("Average length: %.1f\n", (float) nres / (float) msa->nseq);
+ printf("Alignment length: %d\n", msa->alen);
+ printf("Average identity: %.2f%%\n", 100.*avgid);
+
+ if (! bSampling) {
+ printf("Most related pair: %.2f%%\n", 100.*best_best);
+ printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
+ printf("Most distant seq: %.2f%%\n", 100.*worst_best);
+ }
+
+ /*
+ char *cs;
+ cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
+ printf cs;
+ */
+
+ MSAFree(msa);
+}
+/* end of AliStat() */
+
+
+
+
+
+/**
+ * @brief Shuffle mseq order
+ *
+ * @param[out] mseq
+ * mseq structure to shuffle
+ *
+ */
+void
+ShuffleMSeq(mseq_t *mseq)
+{
+ int iSeqIndex;
+ int *piPermArray;
+
+
+ /* quick and dirty: create an array with permuted indices and
+ * swap accordingly (which amounts to shuffling twice)
+ */
+
+ PermutationArray(&piPermArray, mseq->nseqs);
+
+ for (iSeqIndex=0; iSeqIndex<mseq->nseqs; iSeqIndex++) {
+ SeqSwap(mseq, piPermArray[iSeqIndex], iSeqIndex);
+ }
+
+ CKFREE(piPermArray);
+}
+/*** end: ShuffleMSeq() ***/
+
+
+/**
+ * @brief Swap two sequences in an mseq_t structure.
+ *
+ * @param[out] prMSeq
+ * Multiple sequence struct
+ * @param[in] i
+ * Index of first sequence
+ * @param[in] j
+ * Index of seconds sequence
+ *
+ *
+ */
+void
+SeqSwap(mseq_t *prMSeq, int i, int j)
+{
+ char *pcTmp;
+ SQINFO rSqinfoTmp;
+
+ assert(NULL!=prMSeq);
+ assert(i<prMSeq->nseqs && j<prMSeq->nseqs);
+
+ if (i==j) {
+ return;
+ }
+
+ pcTmp = prMSeq->seq[i];
+ prMSeq->seq[i] = prMSeq->seq[j];
+ prMSeq->seq[j] = pcTmp;
+
+ pcTmp = prMSeq->orig_seq[i];
+ prMSeq->orig_seq[i] = prMSeq->orig_seq[j];
+ prMSeq->orig_seq[j] = pcTmp;
+
+ rSqinfoTmp = prMSeq->sqinfo[i];
+ prMSeq->sqinfo[i] = prMSeq->sqinfo[j];
+ prMSeq->sqinfo[j] = rSqinfoTmp;
+
+ return;
+}
+/*** end: SeqSwap() ***/
+
+
+
+
+/**
+ * @brief Dealigns all sequences in mseq structure, updates the
+ * sequence length info and sets aligned to FALSE
+ *
+ * @param[out] mseq
+ * The mseq structure to dealign
+ *
+ */
+void
+DealignMSeq(mseq_t *mseq)
+{
+ int i; /* aux */
+ for (i=0; i<mseq->nseqs; i++) {
+ DealignSeq(mseq->seq[i]);
+ mseq->sqinfo[i].len = strlen(mseq->seq[i]);
+ }
+ mseq->aligned = FALSE;
+
+ return;
+}
+/*** end: DealinMSeq() ***/
+
+
+
+/**
+ * @brief debug output of sqinfo struct
+ *
+ * @param[in] sqinfo
+ * Squid's SQINFO struct for a certain seqeuence
+ *
+ * @note useful for debugging only
+ *
+ */
+void
+LogSqInfo(SQINFO *sqinfo)
+{
+
+ /*LOG_DEBUG("sqinfo->flags = %d", sqinfo->flags);*/
+ if (sqinfo->flags & SQINFO_NAME)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->name = %s", sqinfo->name);
+
+ if (sqinfo->flags & SQINFO_ID)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->id = %s", sqinfo->id);
+
+ if (sqinfo->flags & SQINFO_ACC)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->acc = %s", sqinfo->acc);
+
+ if (sqinfo->flags & SQINFO_DESC)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->desc = %s", sqinfo->desc);
+
+ if (sqinfo->flags & SQINFO_LEN)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->len = %d", sqinfo->len);
+
+ if (sqinfo->flags & SQINFO_START)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->start = %d", sqinfo->start);
+
+ if (sqinfo->flags & SQINFO_STOP)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->stop = %d", sqinfo->stop);
+
+ if (sqinfo->flags & SQINFO_OLEN)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->olen = %d", sqinfo->olen);
+
+ if (sqinfo->flags & SQINFO_TYPE)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->type = %d", sqinfo->type);
+
+ if (sqinfo->flags & SQINFO_SS)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->ss = %s", sqinfo->ss);
+
+ if (sqinfo->flags & SQINFO_SA)
+ Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->sa = %s", sqinfo->sa);
+}
+/*** end: log_sqinfo ***/
+
+
+/**
+ * @brief convert int-encoded seqtype to string
+ *
+ * @param[in] seqtype int-encoded sequence type
+ *
+ * @return character pointer describing the sequence type
+ *
+ */
+const char*
+SeqTypeToStr(int seqtype)
+{
+ switch (seqtype) {
+ case SEQTYPE_DNA:
+ return "DNA";
+ case SEQTYPE_RNA:
+ return "RNA";
+ case SEQTYPE_PROTEIN:
+ return "Protein";
+ case SEQTYPE_UNKNOWN:
+ return "UNKNOWN";
+ default:
+ Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
+ }
+ return "Will never get here";
+}
+/*** end: seqtype_to_str ***/
+
+
+
+/**
+ * @brief reads sequences from file
+ *
+ * @param[out] prMSeq
+ * Multiple sequence struct. Must be preallocated.
+ * FIXME: would make more sense to allocate it here.
+ * @param[in] seqfile
+ * Sequence file name. If '-' sequence will be read from stdin.
+ * @param[in] seqtype
+ * int-encoded sequence type. Set to
+ * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
+ * @param[in] iMaxNumSeq
+ * Return an error, if more than iMaxNumSeq have been read
+ * @param[in] iMaxSeqLen
+ * Return an error, if a seq longer than iMaxSeqLen has been read
+ *
+ * @return 0 on success, -1 on error
+ *
+ * @note
+ * - Depends heavily on squid
+ * - Sequence file format will be guessed
+ * - If supported by squid, gzipped files can be read as well.
+ */
+int
+ReadSequences(mseq_t *prMSeq, char *seqfile, int seqtype,
+ int iMaxNumSeq, int iMaxSeqLen)
+{
+ int fmt = SQFILE_UNKNOWN; /* file format: auto detect if unknown */
+ SQFILE *dbfp; /* sequence file descriptor */
+ char *cur_seq;
+ SQINFO cur_sqinfo;
+ int iSeqIdx; /* sequence counter */
+ int iSeqPos; /* sequence string position counter */
+
+ assert(NULL!=seqfile);
+
+
+ /* Try to work around inability to autodetect from a pipe or .gz:
+ * assume FASTA format
+ */
+ if (SQFILE_UNKNOWN == fmt &&
+ (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
+ fmt = SQFILE_FASTA;
+ }
+
+ /* Using squid routines to read input. taken from seqstat_main.c. we don't
+ * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
+ * etc. NOTE this also means we discard some information, e.g. when
+ * reading from and writing to a stockholm file, all extra MSA
+ * info/annotation will be lost.
+ *
+ */
+
+ if (NULL == (dbfp = SeqfileOpen(seqfile, fmt, NULL))) {
+ Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
+ return -1;
+ }
+
+
+ /* FIXME squid's ReadSeq() will exit with fatal error if format is
+ * unknown. This will be a problem for a GUI. Same is true for many squid
+ * other functions.
+ *
+ * The original squid:ReadSeq() dealigns sequences on input. We
+ * use a patched version.
+ *
+ */
+ while (ReadSeq(dbfp, dbfp->format,
+ &cur_seq,
+ &cur_sqinfo)) {
+
+ if (prMSeq->nseqs+1>iMaxNumSeq) {
+ Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
+ iMaxNumSeq, cur_sqinfo.name, seqfile);
+ return -1;
+ }
+ if ((int)strlen(cur_seq)>iMaxSeqLen) {
+ Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
+ cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen);
+ return -1;
+ }
+
+ /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
+ */
+
+ prMSeq->seq = (char **)
+ CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
+ prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);
+
+
+ prMSeq->sqinfo = (SQINFO *)
+ CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
+ SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);
+
+#ifdef TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
+ LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
+#endif
+ /* always guess type from first seq. use squid function and
+ * convert value
+ */
+ if (0 == prMSeq->nseqs) {
+ int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
+ switch (type) {
+ case kDNA:
+ prMSeq->seqtype = SEQTYPE_DNA;
+ break;
+ case kRNA:
+ prMSeq->seqtype = SEQTYPE_RNA;
+ break;
+ case kAmino:
+ prMSeq->seqtype = SEQTYPE_PROTEIN;
+ break;
+ case kOtherSeq:
+ prMSeq->seqtype = SEQTYPE_UNKNOWN;
+ break;
+ default:
+ Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
+ }
+
+ /* override with given sequence type but check with
+ * automatically detected type and warn if necessary
+ */
+ if (SEQTYPE_UNKNOWN != seqtype) {
+ if (prMSeq->seqtype != seqtype) {
+ Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
+ SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(seqtype));
+ prMSeq->seqtype = seqtype;
+ }
+ }
+ /* if type could not be determined and was not set return error */
+ if (SEQTYPE_UNKNOWN == seqtype && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
+ Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
+ FreeSequence(cur_seq, &cur_sqinfo);
+ SeqfileClose(dbfp);
+ return -1;
+ }
+ }
+
+ Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
+ prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
+ prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
+ prMSeq->seq[prMSeq->nseqs]);
+
+ /* FIXME IPUAC and/or case conversion? If yes see
+ * corresponding squid functions. Special treatment of
+ * Stockholm tilde-gaps for ktuple code?
+ */
+
+ prMSeq->nseqs++;
+
+ FreeSequence(cur_seq, &cur_sqinfo);
+ }
+ SeqfileClose(dbfp);
+
+#if ALLOW_ONLY_PROTEIN
+ if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
+ Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
+ SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
+ }
+#endif
+
+ /* Check if sequences are aligned */
+ prMSeq->aligned = SeqsAreAligned(prMSeq);
+
+
+ /* keep original sequence as copy and convert "working" sequence
+ *
+ */
+ prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
+ for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
+
+ prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);
+
+
+ /* convert unknown characters according to set seqtype
+ * be conservative, i.e. don't allow any fancy ambiguity
+ * characters to make sure that ktuple code etc. works.
+ */
+
+ /* first on the fly conversion between DNA and RNA
+ */
+ if (prMSeq->seqtype==SEQTYPE_DNA)
+ ToDNA(prMSeq->seq[iSeqIdx]);
+ if (prMSeq->seqtype==SEQTYPE_RNA)
+ ToRNA(prMSeq->seq[iSeqIdx]);
+
+ /* then check of each character
+ */
+ for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) {
+ char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
+ if (isgap(*res))
+ continue;
+
+ if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
+ if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) {
+ *res = AMINOACID_ANY;
+ }
+ } else if (prMSeq->seqtype==SEQTYPE_DNA) {
+ if (NULL == strchr(DNA_ALPHABET, toupper(*res))) {
+ *res = NUCLEOTIDE_ANY;
+ }
+ } else if (prMSeq->seqtype==SEQTYPE_RNA) {
+ if (NULL == strchr(RNA_ALPHABET, toupper(*res))) {
+ *res = NUCLEOTIDE_ANY;
+ }
+ }
+ }
+ }
+
+
+ prMSeq->filename = CkStrdup(seqfile);
+ Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
+ prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);
+
+ return 0;
+}
+/*** end: ReadSequences ***/
+
+
+/**
+ * @brief allocate and initialise new mseq_t
+ *
+ * @param[out] prMSeq
+ * newly allocated and initialised mseq_t
+ *
+ * @note caller has to free by calling FreeMSeq()
+ *
+ * @see FreeMSeq
+ *
+ */
+void
+NewMSeq(mseq_t **prMSeq)
+{
+ *prMSeq = (mseq_t *) CKMALLOC(1 * sizeof(mseq_t));
+
+ (*prMSeq)->nseqs = 0;
+ (*prMSeq)->seq = NULL;
+ (*prMSeq)->orig_seq = NULL;
+ (*prMSeq)->seqtype = SEQTYPE_UNKNOWN;
+ (*prMSeq)->sqinfo = NULL;
+ (*prMSeq)->filename = NULL;
+}
+/*** end: NewMSeq ***/
+
+
+
+/**
+ * @brief copies an mseq structure
+ *
+ * @param[out] prMSeqDest_p
+ * Copy of mseq structure
+ * @param[in] prMSeqSrc
+ * Source mseq structure to copy
+ *
+ * @note caller has to free copy by calling FreeMSeq()
+ *
+ */
+void
+CopyMSeq(mseq_t **prMSeqDest_p, mseq_t *prMSeqSrc)
+{
+ int i;
+ assert(prMSeqSrc != NULL && prMSeqDest_p != NULL);
+
+ NewMSeq(prMSeqDest_p);
+
+ (*prMSeqDest_p)->nseqs = prMSeqSrc->nseqs;
+ (*prMSeqDest_p)->seqtype = prMSeqSrc->seqtype;
+ if (prMSeqSrc->filename!=NULL) {
+ (*prMSeqDest_p)->filename = CkStrdup(prMSeqSrc->filename);
+ }
+
+ (*prMSeqDest_p)->seq = (char **)
+ CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
+ (*prMSeqDest_p)->orig_seq = (char **)
+ CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
+ (*prMSeqDest_p)->sqinfo = (SQINFO *)
+ CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(SQINFO));
+
+
+
+ for (i=0; i<(*prMSeqDest_p)->nseqs; i++) {
+ (*prMSeqDest_p)->seq[i] = CkStrdup(prMSeqSrc->seq[i]);
+ (*prMSeqDest_p)->orig_seq[i] = CkStrdup(prMSeqSrc->orig_seq[i]);
+ SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[i], &prMSeqSrc->sqinfo[i]);
+ }
+}
+/*** end: CopyMSeq ***/
+
+
+
+/**
+ * @brief
+ *
+ * @param[in] seqname
+ * The sequence name to search for
+ * @param[in] mseq
+ * The multiple sequence structure to search in
+ *
+ * @return -1 on failure, sequence index of matching name otherwise
+ *
+ * @warning If sequence name happens to be used twice, only the first
+ * one will be reported back
+ *
+ */
+int
+FindSeqName(char *seqname, mseq_t *mseq)
+{
+ int i; /* aux */
+
+ assert(NULL!=mseq);
+
+ for (i=0; i<mseq->nseqs; i++) {
+ if (STR_EQ(mseq->sqinfo[i].name, seqname)) {
+ return i;
+ }
+ }
+
+ return -1;
+}
+/*** end: FindSeqName() ***/
+
+
+/**
+ * @brief Frees an mseq_t and it's members and zeros all members
+ *
+ * @param[in] mseq mseq_to to free
+ *
+ * @note use in conjunction with NewMSeq()
+ * @see new_mseq
+ */
+void
+FreeMSeq(mseq_t **mseq)
+{
+ int i;
+
+ if (NULL==(*mseq)) {
+ return;
+ }
+
+ if ((*mseq)->filename) {
+ (*mseq)->filename = CKFREE((*mseq)->filename);
+ }
+
+ for (i=0; i<(*mseq)->nseqs; i++) {
+ FreeSequence((*mseq)->seq[i], &(*mseq)->sqinfo[i]);
+ CKFREE((*mseq)->orig_seq[i]);
+ }
+ if ((*mseq)->seq) {
+ CKFREE((*mseq)->seq);
+ }
+ if ((*mseq)->orig_seq) { /* FIXME (FS): only ptr to ptr freed, actual sequences NOT freed*/
+ CKFREE((*mseq)->orig_seq);
+ }
+ if ((*mseq)->sqinfo) {
+ CKFREE((*mseq)->sqinfo);
+ }
+
+ (*mseq)->seqtype = SEQTYPE_UNKNOWN;
+ (*mseq)->nseqs = 0;
+
+ CKFREE((*mseq));
+}
+/*** end: FreeMSeq ***/
+
+
+/**
+ * @brief Write alignment to file.
+ *
+ * @param[in] mseq
+ * The mseq_t struct containing the aligned sequences
+ * @param[in] pcAlnOutfile
+ * The name of the output file
+ * @param[in] outfmt
+ * The alignment output format (defined in squid.h)
+ *
+ * @return Non-zero on error
+ *
+ * @note We create a temporary squid MSA struct in here because we never
+ * use it within clustal. We might be better of using the old clustal
+ * output routines instead.
+ *
+ */
+int
+WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt)
+{
+ int i; /* aux */
+ MSA *msa; /* squid's alignment structure */
+ FILE *pfOut = NULL;
+ int key; /* MSA struct internal index for sequence */
+ int alen; /* alignment length */
+ bool use_stdout;
+
+ assert(mseq!=NULL);
+
+ if (MSAFILE_UNKNOWN == outfmt) {
+ Log(&rLog, LOG_ERROR, "Unknown output format chosen");
+ return -1;
+ }
+
+ if (NULL == pcAlnOutfile) {
+ pfOut = stdout;
+ use_stdout = TRUE;
+ } else {
+ use_stdout = FALSE;
+ if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
+ Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
+ return -1;
+ }
+ }
+
+
+ /* derive alignment length from first seq */
+ alen = strlen(mseq->seq[0]);
+
+ msa = MSAAlloc(mseq->nseqs, alen);
+
+ /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
+ * we actually create a copy of mseq. keeping the pointers becomes
+ * messy when calling MSAFree()
+ */
+ for (i=0; i<mseq->nseqs; i++) {
+ char *this_name = mseq->sqinfo[i].name; /* mseq sequence name */
+ char *this_seq = mseq->seq[i]; /* mseq sequence */
+ SQINFO *this_sqinfo = &mseq->sqinfo[i]; /* mseq sequence name */
+
+ key = GKIStoreKey(msa->index, this_name);
+ msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
+
+ /* setting msa->sqlen[idx] and msa->aseq[idx] */
+ msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
+ this_seq, strlen(this_seq));
+
+ if (this_sqinfo->flags & SQINFO_DESC) {
+ /* FIXME never get here ... */
+ MSASetSeqDescription(msa, key, this_sqinfo->desc);
+ }
+ /* FIXME extend this by copying more stuff according to flags.
+ * See MSAFileRead() in msa.c and used functions there
+ *
+ * Problem is that we never parse MSA information as we use squid'sSeqFile
+ */
+
+ msa->nseq++;
+ }
+
+
+ /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
+ * will die on error. Need to implement our own version
+ */
+#if 0
+ MSAVerifyParse(msa);
+#endif
+
+ /* The below is copy of MSAFileWrite() which originally only writes to stdout.
+ */
+
+ /* Be sloppy and make a2m and fasta the same. same for vienna (which is
+ the same). same same. can can. boleh boleh */
+ if (outfmt==SQFILE_FASTA)
+ outfmt = MSAFILE_A2M;
+ if (outfmt==SQFILE_VIENNA)
+ outfmt = MSAFILE_VIENNA;
+
+ switch (outfmt) {
+ case MSAFILE_A2M:
+ WriteA2M(pfOut, msa, 0);
+ break;
+ case MSAFILE_VIENNA:
+ WriteA2M(pfOut, msa, 1);
+ break;
+ case MSAFILE_CLUSTAL:
+ WriteClustal(pfOut, msa);
+ break;
+ case MSAFILE_MSF:
+ WriteMSF(pfOut, msa);
+ break;
+ case MSAFILE_PHYLIP:
+ WritePhylip(pfOut, msa);
+ break;
+ case MSAFILE_SELEX:
+ WriteSELEX(pfOut, msa);
+ break;
+ case MSAFILE_STOCKHOLM:
+ WriteStockholm(pfOut, msa);
+ break;
+ default:
+ Log(&rLog, LOG_FATAL, "internal error: %s",
+ "invalid output format should have been detected before");
+ }
+
+ if (use_stdout == FALSE) {
+ (void) fclose(pfOut);
+ Log(&rLog, LOG_INFO,
+ "Alignment written to %s", pcAlnOutfile);
+ }
+ MSAFree(msa);
+
+ return 0;
+}
+/*** end of WriteAlignment() ***/
+
+
+/**
+ * @brief Removes all gap-characters from a sequence.
+ *
+ * @param[out] seq
+ * Sequence to dealign
+ *
+ * @note seq will not be reallocated
+ */
+void
+DealignSeq(char *seq)
+{
+ int aln_pos;
+ int dealn_pos;
+
+ assert(seq!=NULL);
+
+ dealn_pos=0;
+ for (aln_pos=0; aln_pos<(int)strlen(seq); aln_pos++) {
+ if (! isgap(seq[aln_pos])) {
+ seq[dealn_pos++] = seq[aln_pos];
+ }
+ }
+ seq[dealn_pos] = '\0';
+
+ return;
+}
+/*** end: DealignSeq() ***/
+
+
+
+/**
+ * @brief Sort sequences by length
+ *
+ * @param[out] prMSeq
+ * mseq to sort by length
+ * @param[out] cOrder
+ * Sorting order. 'd' for descending, 'a' for ascending.
+ *
+ *
+ */
+void
+SortMSeqByLength(mseq_t *prMSeq, const char cOrder)
+{
+ int *piSeqLen;
+ int *piOrder;
+ int iSeqIndex;
+ mseq_t *prMSeqCopy = NULL;
+
+ assert('a'==cOrder || 'd'==cOrder);
+
+ Log(&rLog, LOG_WARN,
+ "FIXME: This modifies sequence ordering. Might not be what user wants. Will change output order as well");
+
+ piSeqLen = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
+ piOrder = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
+ }
+ QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, cOrder, FALSE);
+
+ CopyMSeq(&prMSeqCopy, prMSeq);
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ /* copy mseq entry
+ */
+ CKFREE(prMSeq->seq[iSeqIndex]);
+ prMSeq->seq[iSeqIndex] = CkStrdup(prMSeqCopy->seq[piOrder[iSeqIndex]]);
+
+ CKFREE(prMSeq->orig_seq[iSeqIndex]);
+ prMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeqCopy->orig_seq[piOrder[iSeqIndex]]);
+
+ SeqinfoCopy(&prMSeq->sqinfo[iSeqIndex], &prMSeqCopy->sqinfo[piOrder[iSeqIndex]]);
+ }
+
+ CKFREE(piSeqLen);
+ CKFREE(piOrder);
+ FreeMSeq(&prMSeqCopy);
+
+ return;
+}
+/*** end: SortMSeqByLength() ***/
+
+
+
+/**
+ * @brief Checks if sequences in given mseq structure are aligned. By
+ * definition this is only true, if sequences are of the same length
+ * and at least one gap was found
+ *
+ * @param[in] prMSeq
+ * Sequences to check
+ *
+ * @return TRUE if sequences are aligned, FALSE if not
+ *
+ *
+ */
+bool
+SeqsAreAligned(mseq_t *prMSeq)
+{
+ bool bGapFound, bSameLength;
+ int iSeqIdx; /* sequence counter */
+ int iSeqPos; /* sequence string position counter */
+
+ /* Special case of just one sequence:
+ * it is arguable that a single sequence qualifies as a profile,
+ * however, this is what we do at the first stage of MSA anyway.
+ * So, if there is only 1 sequence it is a 1-profile
+ * and it is (defined to be) aligned (with itself). FS, r240 -> 241
+ */
+ if (1 == prMSeq->nseqs) {
+ return TRUE;
+ }
+
+
+ /* Check if sequences are aligned. For being aligned, the
+ * sequences have to be of same length (bSameLength) and at least
+ * one of them has to contain at least one gap (bGapFound)
+ */
+ bGapFound = FALSE;
+ bSameLength = TRUE;
+ for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
+ if (FALSE == bGapFound) {
+ for (iSeqPos=0;
+ iSeqPos<prMSeq->sqinfo[iSeqIdx].len && false==bGapFound;
+ iSeqPos++) {
+ if (isgap(prMSeq->seq[iSeqIdx][iSeqPos])) {
+ bGapFound = TRUE;
+ /* skip rest of sequence */
+ break;
+ }
+ }
+ }
+
+ if (iSeqIdx>0) {
+ if (prMSeq->sqinfo[iSeqIdx].len != prMSeq->sqinfo[iSeqIdx-1].len) {
+ bSameLength = FALSE;
+ /* no need to continue search, bSameLength==FALSE is
+ * sufficient condition */
+ break;
+ }
+ }
+ }
+#if 0
+ Log(&rLog, LOG_FORCED_DEBUG, "bSameLength=%d bGapFound=%d", bSameLength, bGapFound);
+#endif
+ if (TRUE == bSameLength && TRUE == bGapFound) {
+ return TRUE;
+ } else {
+ return FALSE;
+ }
+
+}
+/*** end: SeqsAreAligned() ***/
+
+
+
+/**
+ * @brief Creates a new sequence entry and appends it to an existing mseq
+ * structure.
+ *
+ * @param[out] prMSeqDest_p
+ * Already existing and initialised mseq structure
+ * @param[in] pcSeqName
+ * sequence name of the sequence to add
+ * @param[in] pcSeqRes
+ * the actual sequence (residues) to add
+ *
+ * @note Don't forget to update the align and type flag if necessary!
+ *
+ * FIXME allow adding of more features
+ *
+ */
+void
+AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
+{
+ int iSeqIdx = 0;
+ SQINFO sqinfo;
+
+ assert(NULL != prMSeqDest_p);
+ assert(NULL != pcSeqName);
+ assert(NULL != pcSeqRes);
+
+ iSeqIdx = (*prMSeqDest_p)->nseqs;
+
+ (*prMSeqDest_p)->seq = (char **)
+ CKREALLOC((*prMSeqDest_p)->seq, (iSeqIdx+1) * sizeof(char *));
+ (*prMSeqDest_p)->orig_seq = (char **)
+ CKREALLOC((*prMSeqDest_p)->orig_seq, (iSeqIdx+1) * sizeof(char *));
+ (*prMSeqDest_p)->sqinfo = (SQINFO *)
+ CKREALLOC((*prMSeqDest_p)->sqinfo, (iSeqIdx+1) * sizeof(SQINFO));
+
+
+ (*prMSeqDest_p)->seq[iSeqIdx] = CkStrdup(pcSeqRes);
+ (*prMSeqDest_p)->orig_seq[iSeqIdx] = CkStrdup(pcSeqRes);
+
+ /* should probably get ri of SqInfo altogether in the long run and just
+ transfer the intersting members into our own struct
+ */
+ sqinfo.flags = 0; /* init */
+
+ sqinfo.len = strlen(pcSeqRes);
+ sqinfo.flags |= SQINFO_LEN;
+
+ /* name is an array of SQINFO_NAMELEN length */
+ strncpy(sqinfo.name, pcSeqName, SQINFO_NAMELEN-1);
+ sqinfo.name[SQINFO_NAMELEN-1] = '\0';
+ sqinfo.flags |= SQINFO_NAME;
+
+ SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iSeqIdx],
+ & sqinfo);
+
+ (*prMSeqDest_p)->nseqs++;
+
+ return;
+}
+/* end of AddSeq() */
+
+
+
+
+/**
+ * @brief Appends an mseq structure to an already existing one.
+ * filename will be left untouched.
+ *
+ * @param[in] prMSeqDest_p
+ * MSeq structure to which to append to
+ * @param[out] prMSeqToAdd
+ * MSeq structure which is to append
+ *
+ *
+ */
+void
+JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
+{
+ int iSrcSeqIndex;
+ int iNewNSeq;
+
+ assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
+ assert(NULL != prMSeqToAdd);
+
+ if (0 == prMSeqToAdd->nseqs) {
+ Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
+ return;
+ }
+
+ /* warn on seqtype mismatch and keep original seqtype */
+ if ((*prMSeqDest_p)->seqtype != prMSeqToAdd->seqtype) {
+ Log(&rLog, LOG_WARN, "Joining sequences of different type");
+ }
+
+ /* leave filename as it is */
+
+ /*
+ * copy new seq/s, orig_seq/s, sqinfo/s
+ */
+ iNewNSeq = (*prMSeqDest_p)->nseqs + prMSeqToAdd->nseqs;
+
+ (*prMSeqDest_p)->seq = (char **)
+ CKREALLOC((*prMSeqDest_p)->seq, iNewNSeq * sizeof(char *));
+
+ (*prMSeqDest_p)->orig_seq = (char **)
+ CKREALLOC((*prMSeqDest_p)->orig_seq, iNewNSeq * sizeof(char *));
+
+ (*prMSeqDest_p)->sqinfo = (SQINFO *)
+ CKREALLOC((*prMSeqDest_p)->sqinfo, iNewNSeq * sizeof(SQINFO));
+
+
+ for (iSrcSeqIndex=0; iSrcSeqIndex < prMSeqToAdd->nseqs; iSrcSeqIndex++) {
+ int iDstSeqIndex = (*prMSeqDest_p)->nseqs++;
+
+ (*prMSeqDest_p)->seq[iDstSeqIndex] =
+ CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
+
+ (*prMSeqDest_p)->orig_seq[iDstSeqIndex] =
+ CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
+
+ SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iDstSeqIndex],
+ & prMSeqToAdd->sqinfo[iSrcSeqIndex]);
+ }
+
+ (*prMSeqDest_p)->nseqs = iNewNSeq;
+
+ (*prMSeqDest_p)->aligned = SeqsAreAligned(*prMSeqDest_p);
+
+ return;
+}
+/*** end: JoinMSeqs() ***/