/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /********************************************************************* * Clustal Omega - Multiple sequence alignment * * Copyright (C) 2010 University College Dublin * * Clustal-Omega is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This file is part of Clustal-Omega. * ********************************************************************/ /* * RCS $Id: seq.c 245 2011-06-15 12:38:53Z fabian $ * * * Module for sequence/alignment IO and misc. * * This depends heavily on Sean Eddy's squid library, which is obsoleted by * HMMER3's Easel. However, easel doesn't support that many non-aligned input * formats. * */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "squid/squid.h" #include #include "util.h" #include "log.h" #include "seq.h" #define ALLOW_ONLY_PROTEIN 0 /* requested by Hamish, FS, r244 -> r245 */ /** * @brief Stripped down version of squid's alistat * * * @param[in] prMSeq * The alignment to analyse * @param[in] bSampling * For many sequences: samples from pool * @param[in] bReportAll * Report identities for all sequence pairs * * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive */ void AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) { /* * bSampling = squid's do_fast * bReportAll = squid's allreport */ float **ppdIdentMx; /* identity matrix (squid: imx) */ const int iNumSample = 1000; /* sample size (squid: nsample) */ int iAux; MSA *msa; /* squid's alignment structure */ int small, large; int bestj, worstj; float sum; float worst_worst, worst_best, best_best; float avgid; int i, j; int nres; /* number of residues */ if (bSampling && bReportAll) { Log(&rLog, LOG_WARN, "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__); return; } if (FALSE == prMSeq->aligned) { Log(&rLog, LOG_WARN, "Sequences are not aligned. Skipping %s()", __FUNCTION__); return; } /* silence gcc warnings about uninitialized variables */ worst_worst = worst_best = best_best = 0.0; bestj = worstj = -1; /** mseq to squid msa * * FIXME code overlap with WriteAlignment. Make it a function and take * code there (contains more comments) as template * */ msa = MSAAlloc(prMSeq->nseqs, /* derive alignment length from first seq */ strlen(prMSeq->seq[0])); for (i=0; inseqs; i++) { int key; /* MSA struct internal index for sequence */ char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */ char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */ SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence name */ key = GKIStoreKey(msa->index, this_name); msa->sqname[key] = sre_strdup(this_name, strlen(this_name)); /* setting msa->sqlen[idx] and msa->aseq[idx] */ msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key], this_seq, strlen(this_seq)); if (this_sqinfo->flags & SQINFO_DESC) { MSASetSeqDescription(msa, key, this_sqinfo->desc); } msa->nseq++; } nres = 0; small = large = -1; for (i = 0; i < msa->nseq; i++) { int rlen; /* raw sequence length */ rlen = DealignedLength(msa->aseq[i]); nres += rlen; if (small == -1 || rlen < small) small = rlen; if (large == -1 || rlen > large) large = rlen; } if (bSampling) { avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen, msa->nseq, iNumSample); } else { float best, worst; /* this might be slow...could use openmp inside squid */ MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx); if (bReportAll) { printf(" %-15s %5s %7s %-15s %7s %-15s\n", "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)"); printf(" --------------- ----- ------- --------------- ------- ---------------\n"); } sum = 0.0; worst_best = 1.0; best_best = 0.0; worst_worst = 1.0; for (i = 0; i < msa->nseq; i++) { worst = 1.0; best = 0.0; for (j = 0; j < msa->nseq; j++) { /* closest seq to this one = best */ if (i != j && ppdIdentMx[i][j] > best) { best = ppdIdentMx[i][j]; bestj = j; } if (ppdIdentMx[i][j] < worst) { worst = ppdIdentMx[i][j]; worstj = j; } } if (bReportAll) { printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n", msa->sqname[i], DealignedLength(msa->aseq[i]), best * 100., msa->sqname[bestj], worst * 100., msa->sqname[worstj]); } if (best > best_best) best_best = best; if (best < worst_best) worst_best = best; if (worst < worst_worst) worst_worst = worst; for (j = 0; j < i; j++) sum += ppdIdentMx[i][j]; } avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0); if (bReportAll) puts(""); FMX2Free(ppdIdentMx); } /* else bSampling */ /* Print output */ if (msa->name != NULL) printf("Alignment name: %s\n", msa->name); /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/ printf("Number of sequences: %d\n", msa->nseq); printf("Total # residues: %d\n", nres); printf("Smallest: %d\n", small); printf("Largest: %d\n", large); printf("Average length: %.1f\n", (float) nres / (float) msa->nseq); printf("Alignment length: %d\n", msa->alen); printf("Average identity: %.2f%%\n", 100.*avgid); if (! bSampling) { printf("Most related pair: %.2f%%\n", 100.*best_best); printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst); printf("Most distant seq: %.2f%%\n", 100.*worst_best); } /* char *cs; cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen); printf cs; */ MSAFree(msa); } /* end of AliStat() */ /** * @brief Shuffle mseq order * * @param[out] mseq * mseq structure to shuffle * */ void ShuffleMSeq(mseq_t *mseq) { int iSeqIndex; int *piPermArray; /* quick and dirty: create an array with permuted indices and * swap accordingly (which amounts to shuffling twice) */ PermutationArray(&piPermArray, mseq->nseqs); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { SeqSwap(mseq, piPermArray[iSeqIndex], iSeqIndex); } CKFREE(piPermArray); } /*** end: ShuffleMSeq() ***/ /** * @brief Swap two sequences in an mseq_t structure. * * @param[out] prMSeq * Multiple sequence struct * @param[in] i * Index of first sequence * @param[in] j * Index of seconds sequence * * */ void SeqSwap(mseq_t *prMSeq, int i, int j) { char *pcTmp; SQINFO rSqinfoTmp; assert(NULL!=prMSeq); assert(inseqs && jnseqs); if (i==j) { return; } pcTmp = prMSeq->seq[i]; prMSeq->seq[i] = prMSeq->seq[j]; prMSeq->seq[j] = pcTmp; pcTmp = prMSeq->orig_seq[i]; prMSeq->orig_seq[i] = prMSeq->orig_seq[j]; prMSeq->orig_seq[j] = pcTmp; rSqinfoTmp = prMSeq->sqinfo[i]; prMSeq->sqinfo[i] = prMSeq->sqinfo[j]; prMSeq->sqinfo[j] = rSqinfoTmp; return; } /*** end: SeqSwap() ***/ /** * @brief Dealigns all sequences in mseq structure, updates the * sequence length info and sets aligned to FALSE * * @param[out] mseq * The mseq structure to dealign * */ void DealignMSeq(mseq_t *mseq) { int i; /* aux */ for (i=0; inseqs; i++) { DealignSeq(mseq->seq[i]); mseq->sqinfo[i].len = strlen(mseq->seq[i]); } mseq->aligned = FALSE; return; } /*** end: DealinMSeq() ***/ /** * @brief debug output of sqinfo struct * * @param[in] sqinfo * Squid's SQINFO struct for a certain seqeuence * * @note useful for debugging only * */ void LogSqInfo(SQINFO *sqinfo) { /*LOG_DEBUG("sqinfo->flags = %d", sqinfo->flags);*/ if (sqinfo->flags & SQINFO_NAME) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->name = %s", sqinfo->name); if (sqinfo->flags & SQINFO_ID) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->id = %s", sqinfo->id); if (sqinfo->flags & SQINFO_ACC) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->acc = %s", sqinfo->acc); if (sqinfo->flags & SQINFO_DESC) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->desc = %s", sqinfo->desc); if (sqinfo->flags & SQINFO_LEN) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->len = %d", sqinfo->len); if (sqinfo->flags & SQINFO_START) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->start = %d", sqinfo->start); if (sqinfo->flags & SQINFO_STOP) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->stop = %d", sqinfo->stop); if (sqinfo->flags & SQINFO_OLEN) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->olen = %d", sqinfo->olen); if (sqinfo->flags & SQINFO_TYPE) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->type = %d", sqinfo->type); if (sqinfo->flags & SQINFO_SS) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->ss = %s", sqinfo->ss); if (sqinfo->flags & SQINFO_SA) Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->sa = %s", sqinfo->sa); } /*** end: log_sqinfo ***/ /** * @brief convert int-encoded seqtype to string * * @param[in] seqtype int-encoded sequence type * * @return character pointer describing the sequence type * */ const char* SeqTypeToStr(int seqtype) { switch (seqtype) { case SEQTYPE_DNA: return "DNA"; case SEQTYPE_RNA: return "RNA"; case SEQTYPE_PROTEIN: return "Protein"; case SEQTYPE_UNKNOWN: return "UNKNOWN"; default: Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__); } return "Will never get here"; } /*** end: seqtype_to_str ***/ /** * @brief reads sequences from file * * @param[out] prMSeq * Multiple sequence struct. Must be preallocated. * FIXME: would make more sense to allocate it here. * @param[in] seqfile * Sequence file name. If '-' sequence will be read from stdin. * @param[in] seqtype * int-encoded sequence type. Set to * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence) * @param[in] iMaxNumSeq * Return an error, if more than iMaxNumSeq have been read * @param[in] iMaxSeqLen * Return an error, if a seq longer than iMaxSeqLen has been read * * @return 0 on success, -1 on error * * @note * - Depends heavily on squid * - Sequence file format will be guessed * - If supported by squid, gzipped files can be read as well. */ int ReadSequences(mseq_t *prMSeq, char *seqfile, int seqtype, int iMaxNumSeq, int iMaxSeqLen) { int fmt = SQFILE_UNKNOWN; /* file format: auto detect if unknown */ SQFILE *dbfp; /* sequence file descriptor */ char *cur_seq; SQINFO cur_sqinfo; int iSeqIdx; /* sequence counter */ int iSeqPos; /* sequence string position counter */ assert(NULL!=seqfile); /* Try to work around inability to autodetect from a pipe or .gz: * assume FASTA format */ if (SQFILE_UNKNOWN == fmt && (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) { fmt = SQFILE_FASTA; } /* Using squid routines to read input. taken from seqstat_main.c. we don't * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen * etc. NOTE this also means we discard some information, e.g. when * reading from and writing to a stockholm file, all extra MSA * info/annotation will be lost. * */ if (NULL == (dbfp = SeqfileOpen(seqfile, fmt, NULL))) { Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile); return -1; } /* FIXME squid's ReadSeq() will exit with fatal error if format is * unknown. This will be a problem for a GUI. Same is true for many squid * other functions. * * The original squid:ReadSeq() dealigns sequences on input. We * use a patched version. * */ while (ReadSeq(dbfp, dbfp->format, &cur_seq, &cur_sqinfo)) { if (prMSeq->nseqs+1>iMaxNumSeq) { Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'", iMaxNumSeq, cur_sqinfo.name, seqfile); return -1; } if ((int)strlen(cur_seq)>iMaxSeqLen) { Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)", cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen); return -1; } /* FIXME: use modified version of AddSeq() that allows handing down SqInfo */ prMSeq->seq = (char **) CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *)); prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq); prMSeq->sqinfo = (SQINFO *) CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO)); SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo); #ifdef TRACE Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]); LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]); #endif /* always guess type from first seq. use squid function and * convert value */ if (0 == prMSeq->nseqs) { int type = Seqtype(prMSeq->seq[prMSeq->nseqs]); switch (type) { case kDNA: prMSeq->seqtype = SEQTYPE_DNA; break; case kRNA: prMSeq->seqtype = SEQTYPE_RNA; break; case kAmino: prMSeq->seqtype = SEQTYPE_PROTEIN; break; case kOtherSeq: prMSeq->seqtype = SEQTYPE_UNKNOWN; break; default: Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__); } /* override with given sequence type but check with * automatically detected type and warn if necessary */ if (SEQTYPE_UNKNOWN != seqtype) { if (prMSeq->seqtype != seqtype) { Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested", SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(seqtype)); prMSeq->seqtype = seqtype; } } /* if type could not be determined and was not set return error */ if (SEQTYPE_UNKNOWN == seqtype && SEQTYPE_UNKNOWN == prMSeq->seqtype) { Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence"); FreeSequence(cur_seq, &cur_sqinfo); SeqfileClose(dbfp); return -1; } } Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s", prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len, prMSeq->seq[prMSeq->nseqs]); /* FIXME IPUAC and/or case conversion? If yes see * corresponding squid functions. Special treatment of * Stockholm tilde-gaps for ktuple code? */ prMSeq->nseqs++; FreeSequence(cur_seq, &cur_sqinfo); } SeqfileClose(dbfp); #if ALLOW_ONLY_PROTEIN if (SEQTYPE_PROTEIN != prMSeq->seqtype) { Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.", SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME); } #endif /* Check if sequences are aligned */ prMSeq->aligned = SeqsAreAligned(prMSeq); /* keep original sequence as copy and convert "working" sequence * */ prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *)); for (iSeqIdx=0; iSeqIdxnseqs; iSeqIdx++) { prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]); /* convert unknown characters according to set seqtype * be conservative, i.e. don't allow any fancy ambiguity * characters to make sure that ktuple code etc. works. */ /* first on the fly conversion between DNA and RNA */ if (prMSeq->seqtype==SEQTYPE_DNA) ToDNA(prMSeq->seq[iSeqIdx]); if (prMSeq->seqtype==SEQTYPE_RNA) ToRNA(prMSeq->seq[iSeqIdx]); /* then check of each character */ for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) { char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]); if (isgap(*res)) continue; if (prMSeq->seqtype==SEQTYPE_PROTEIN) { if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) { *res = AMINOACID_ANY; } } else if (prMSeq->seqtype==SEQTYPE_DNA) { if (NULL == strchr(DNA_ALPHABET, toupper(*res))) { *res = NUCLEOTIDE_ANY; } } else if (prMSeq->seqtype==SEQTYPE_RNA) { if (NULL == strchr(RNA_ALPHABET, toupper(*res))) { *res = NUCLEOTIDE_ANY; } } } } prMSeq->filename = CkStrdup(seqfile); Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s", prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename); return 0; } /*** end: ReadSequences ***/ /** * @brief allocate and initialise new mseq_t * * @param[out] prMSeq * newly allocated and initialised mseq_t * * @note caller has to free by calling FreeMSeq() * * @see FreeMSeq * */ void NewMSeq(mseq_t **prMSeq) { *prMSeq = (mseq_t *) CKMALLOC(1 * sizeof(mseq_t)); (*prMSeq)->nseqs = 0; (*prMSeq)->seq = NULL; (*prMSeq)->orig_seq = NULL; (*prMSeq)->seqtype = SEQTYPE_UNKNOWN; (*prMSeq)->sqinfo = NULL; (*prMSeq)->filename = NULL; } /*** end: NewMSeq ***/ /** * @brief copies an mseq structure * * @param[out] prMSeqDest_p * Copy of mseq structure * @param[in] prMSeqSrc * Source mseq structure to copy * * @note caller has to free copy by calling FreeMSeq() * */ void CopyMSeq(mseq_t **prMSeqDest_p, mseq_t *prMSeqSrc) { int i; assert(prMSeqSrc != NULL && prMSeqDest_p != NULL); NewMSeq(prMSeqDest_p); (*prMSeqDest_p)->nseqs = prMSeqSrc->nseqs; (*prMSeqDest_p)->seqtype = prMSeqSrc->seqtype; if (prMSeqSrc->filename!=NULL) { (*prMSeqDest_p)->filename = CkStrdup(prMSeqSrc->filename); } (*prMSeqDest_p)->seq = (char **) CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *)); (*prMSeqDest_p)->orig_seq = (char **) CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *)); (*prMSeqDest_p)->sqinfo = (SQINFO *) CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(SQINFO)); for (i=0; i<(*prMSeqDest_p)->nseqs; i++) { (*prMSeqDest_p)->seq[i] = CkStrdup(prMSeqSrc->seq[i]); (*prMSeqDest_p)->orig_seq[i] = CkStrdup(prMSeqSrc->orig_seq[i]); SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[i], &prMSeqSrc->sqinfo[i]); } } /*** end: CopyMSeq ***/ /** * @brief * * @param[in] seqname * The sequence name to search for * @param[in] mseq * The multiple sequence structure to search in * * @return -1 on failure, sequence index of matching name otherwise * * @warning If sequence name happens to be used twice, only the first * one will be reported back * */ int FindSeqName(char *seqname, mseq_t *mseq) { int i; /* aux */ assert(NULL!=mseq); for (i=0; inseqs; i++) { if (STR_EQ(mseq->sqinfo[i].name, seqname)) { return i; } } return -1; } /*** end: FindSeqName() ***/ /** * @brief Frees an mseq_t and it's members and zeros all members * * @param[in] mseq mseq_to to free * * @note use in conjunction with NewMSeq() * @see new_mseq */ void FreeMSeq(mseq_t **mseq) { int i; if (NULL==(*mseq)) { return; } if ((*mseq)->filename) { (*mseq)->filename = CKFREE((*mseq)->filename); } for (i=0; i<(*mseq)->nseqs; i++) { FreeSequence((*mseq)->seq[i], &(*mseq)->sqinfo[i]); CKFREE((*mseq)->orig_seq[i]); } if ((*mseq)->seq) { CKFREE((*mseq)->seq); } if ((*mseq)->orig_seq) { /* FIXME (FS): only ptr to ptr freed, actual sequences NOT freed*/ CKFREE((*mseq)->orig_seq); } if ((*mseq)->sqinfo) { CKFREE((*mseq)->sqinfo); } (*mseq)->seqtype = SEQTYPE_UNKNOWN; (*mseq)->nseqs = 0; CKFREE((*mseq)); } /*** end: FreeMSeq ***/ /** * @brief Write alignment to file. * * @param[in] mseq * The mseq_t struct containing the aligned sequences * @param[in] pcAlnOutfile * The name of the output file * @param[in] outfmt * The alignment output format (defined in squid.h) * * @return Non-zero on error * * @note We create a temporary squid MSA struct in here because we never * use it within clustal. We might be better of using the old clustal * output routines instead. * */ int WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt) { int i; /* aux */ MSA *msa; /* squid's alignment structure */ FILE *pfOut = NULL; int key; /* MSA struct internal index for sequence */ int alen; /* alignment length */ bool use_stdout; assert(mseq!=NULL); if (MSAFILE_UNKNOWN == outfmt) { Log(&rLog, LOG_ERROR, "Unknown output format chosen"); return -1; } if (NULL == pcAlnOutfile) { pfOut = stdout; use_stdout = TRUE; } else { use_stdout = FALSE; if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) { Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile); return -1; } } /* derive alignment length from first seq */ alen = strlen(mseq->seq[0]); msa = MSAAlloc(mseq->nseqs, alen); /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M() * we actually create a copy of mseq. keeping the pointers becomes * messy when calling MSAFree() */ for (i=0; inseqs; i++) { char *this_name = mseq->sqinfo[i].name; /* mseq sequence name */ char *this_seq = mseq->seq[i]; /* mseq sequence */ SQINFO *this_sqinfo = &mseq->sqinfo[i]; /* mseq sequence name */ key = GKIStoreKey(msa->index, this_name); msa->sqname[key] = sre_strdup(this_name, strlen(this_name)); /* setting msa->sqlen[idx] and msa->aseq[idx] */ msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key], this_seq, strlen(this_seq)); if (this_sqinfo->flags & SQINFO_DESC) { /* FIXME never get here ... */ MSASetSeqDescription(msa, key, this_sqinfo->desc); } /* FIXME extend this by copying more stuff according to flags. * See MSAFileRead() in msa.c and used functions there * * Problem is that we never parse MSA information as we use squid'sSeqFile */ msa->nseq++; } /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it * will die on error. Need to implement our own version */ #if 0 MSAVerifyParse(msa); #endif /* The below is copy of MSAFileWrite() which originally only writes to stdout. */ /* Be sloppy and make a2m and fasta the same. same for vienna (which is the same). same same. can can. boleh boleh */ if (outfmt==SQFILE_FASTA) outfmt = MSAFILE_A2M; if (outfmt==SQFILE_VIENNA) outfmt = MSAFILE_VIENNA; switch (outfmt) { case MSAFILE_A2M: WriteA2M(pfOut, msa, 0); break; case MSAFILE_VIENNA: WriteA2M(pfOut, msa, 1); break; case MSAFILE_CLUSTAL: WriteClustal(pfOut, msa); break; case MSAFILE_MSF: WriteMSF(pfOut, msa); break; case MSAFILE_PHYLIP: WritePhylip(pfOut, msa); break; case MSAFILE_SELEX: WriteSELEX(pfOut, msa); break; case MSAFILE_STOCKHOLM: WriteStockholm(pfOut, msa); break; default: Log(&rLog, LOG_FATAL, "internal error: %s", "invalid output format should have been detected before"); } if (use_stdout == FALSE) { (void) fclose(pfOut); Log(&rLog, LOG_INFO, "Alignment written to %s", pcAlnOutfile); } MSAFree(msa); return 0; } /*** end of WriteAlignment() ***/ /** * @brief Removes all gap-characters from a sequence. * * @param[out] seq * Sequence to dealign * * @note seq will not be reallocated */ void DealignSeq(char *seq) { int aln_pos; int dealn_pos; assert(seq!=NULL); dealn_pos=0; for (aln_pos=0; aln_pos<(int)strlen(seq); aln_pos++) { if (! isgap(seq[aln_pos])) { seq[dealn_pos++] = seq[aln_pos]; } } seq[dealn_pos] = '\0'; return; } /*** end: DealignSeq() ***/ /** * @brief Sort sequences by length * * @param[out] prMSeq * mseq to sort by length * @param[out] cOrder * Sorting order. 'd' for descending, 'a' for ascending. * * */ void SortMSeqByLength(mseq_t *prMSeq, const char cOrder) { int *piSeqLen; int *piOrder; int iSeqIndex; mseq_t *prMSeqCopy = NULL; assert('a'==cOrder || 'd'==cOrder); Log(&rLog, LOG_WARN, "FIXME: This modifies sequence ordering. Might not be what user wants. Will change output order as well"); piSeqLen = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); piOrder = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len; } QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, cOrder, FALSE); CopyMSeq(&prMSeqCopy, prMSeq); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { /* copy mseq entry */ CKFREE(prMSeq->seq[iSeqIndex]); prMSeq->seq[iSeqIndex] = CkStrdup(prMSeqCopy->seq[piOrder[iSeqIndex]]); CKFREE(prMSeq->orig_seq[iSeqIndex]); prMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeqCopy->orig_seq[piOrder[iSeqIndex]]); SeqinfoCopy(&prMSeq->sqinfo[iSeqIndex], &prMSeqCopy->sqinfo[piOrder[iSeqIndex]]); } CKFREE(piSeqLen); CKFREE(piOrder); FreeMSeq(&prMSeqCopy); return; } /*** end: SortMSeqByLength() ***/ /** * @brief Checks if sequences in given mseq structure are aligned. By * definition this is only true, if sequences are of the same length * and at least one gap was found * * @param[in] prMSeq * Sequences to check * * @return TRUE if sequences are aligned, FALSE if not * * */ bool SeqsAreAligned(mseq_t *prMSeq) { bool bGapFound, bSameLength; int iSeqIdx; /* sequence counter */ int iSeqPos; /* sequence string position counter */ /* Special case of just one sequence: * it is arguable that a single sequence qualifies as a profile, * however, this is what we do at the first stage of MSA anyway. * So, if there is only 1 sequence it is a 1-profile * and it is (defined to be) aligned (with itself). FS, r240 -> 241 */ if (1 == prMSeq->nseqs) { return TRUE; } /* Check if sequences are aligned. For being aligned, the * sequences have to be of same length (bSameLength) and at least * one of them has to contain at least one gap (bGapFound) */ bGapFound = FALSE; bSameLength = TRUE; for (iSeqIdx=0; iSeqIdxnseqs; iSeqIdx++) { if (FALSE == bGapFound) { for (iSeqPos=0; iSeqPossqinfo[iSeqIdx].len && false==bGapFound; iSeqPos++) { if (isgap(prMSeq->seq[iSeqIdx][iSeqPos])) { bGapFound = TRUE; /* skip rest of sequence */ break; } } } if (iSeqIdx>0) { if (prMSeq->sqinfo[iSeqIdx].len != prMSeq->sqinfo[iSeqIdx-1].len) { bSameLength = FALSE; /* no need to continue search, bSameLength==FALSE is * sufficient condition */ break; } } } #if 0 Log(&rLog, LOG_FORCED_DEBUG, "bSameLength=%d bGapFound=%d", bSameLength, bGapFound); #endif if (TRUE == bSameLength && TRUE == bGapFound) { return TRUE; } else { return FALSE; } } /*** end: SeqsAreAligned() ***/ /** * @brief Creates a new sequence entry and appends it to an existing mseq * structure. * * @param[out] prMSeqDest_p * Already existing and initialised mseq structure * @param[in] pcSeqName * sequence name of the sequence to add * @param[in] pcSeqRes * the actual sequence (residues) to add * * @note Don't forget to update the align and type flag if necessary! * * FIXME allow adding of more features * */ void AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes) { int iSeqIdx = 0; SQINFO sqinfo; assert(NULL != prMSeqDest_p); assert(NULL != pcSeqName); assert(NULL != pcSeqRes); iSeqIdx = (*prMSeqDest_p)->nseqs; (*prMSeqDest_p)->seq = (char **) CKREALLOC((*prMSeqDest_p)->seq, (iSeqIdx+1) * sizeof(char *)); (*prMSeqDest_p)->orig_seq = (char **) CKREALLOC((*prMSeqDest_p)->orig_seq, (iSeqIdx+1) * sizeof(char *)); (*prMSeqDest_p)->sqinfo = (SQINFO *) CKREALLOC((*prMSeqDest_p)->sqinfo, (iSeqIdx+1) * sizeof(SQINFO)); (*prMSeqDest_p)->seq[iSeqIdx] = CkStrdup(pcSeqRes); (*prMSeqDest_p)->orig_seq[iSeqIdx] = CkStrdup(pcSeqRes); /* should probably get ri of SqInfo altogether in the long run and just transfer the intersting members into our own struct */ sqinfo.flags = 0; /* init */ sqinfo.len = strlen(pcSeqRes); sqinfo.flags |= SQINFO_LEN; /* name is an array of SQINFO_NAMELEN length */ strncpy(sqinfo.name, pcSeqName, SQINFO_NAMELEN-1); sqinfo.name[SQINFO_NAMELEN-1] = '\0'; sqinfo.flags |= SQINFO_NAME; SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iSeqIdx], & sqinfo); (*prMSeqDest_p)->nseqs++; return; } /* end of AddSeq() */ /** * @brief Appends an mseq structure to an already existing one. * filename will be left untouched. * * @param[in] prMSeqDest_p * MSeq structure to which to append to * @param[out] prMSeqToAdd * MSeq structure which is to append * * */ void JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd) { int iSrcSeqIndex; int iNewNSeq; assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p)); assert(NULL != prMSeqToAdd); if (0 == prMSeqToAdd->nseqs) { Log(&rLog, LOG_WARN, "Was asked to add 0 sequences"); return; } /* warn on seqtype mismatch and keep original seqtype */ if ((*prMSeqDest_p)->seqtype != prMSeqToAdd->seqtype) { Log(&rLog, LOG_WARN, "Joining sequences of different type"); } /* leave filename as it is */ /* * copy new seq/s, orig_seq/s, sqinfo/s */ iNewNSeq = (*prMSeqDest_p)->nseqs + prMSeqToAdd->nseqs; (*prMSeqDest_p)->seq = (char **) CKREALLOC((*prMSeqDest_p)->seq, iNewNSeq * sizeof(char *)); (*prMSeqDest_p)->orig_seq = (char **) CKREALLOC((*prMSeqDest_p)->orig_seq, iNewNSeq * sizeof(char *)); (*prMSeqDest_p)->sqinfo = (SQINFO *) CKREALLOC((*prMSeqDest_p)->sqinfo, iNewNSeq * sizeof(SQINFO)); for (iSrcSeqIndex=0; iSrcSeqIndex < prMSeqToAdd->nseqs; iSrcSeqIndex++) { int iDstSeqIndex = (*prMSeqDest_p)->nseqs++; (*prMSeqDest_p)->seq[iDstSeqIndex] = CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]); (*prMSeqDest_p)->orig_seq[iDstSeqIndex] = CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]); SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iDstSeqIndex], & prMSeqToAdd->sqinfo[iSrcSeqIndex]); } (*prMSeqDest_p)->nseqs = iNewNSeq; (*prMSeqDest_p)->aligned = SeqsAreAligned(*prMSeqDest_p); return; } /*** end: JoinMSeqs() ***/