1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: seq.c 298 2014-11-07 12:18:36Z fabian $
21 * Module for sequence/alignment IO and misc.
23 * This depends heavily on Sean Eddy's squid library, which is obsoleted by
24 * HMMER3's Easel. However, easel doesn't support that many non-aligned input
34 #include "squid/squid.h"
40 /*#include "../../mymemmonitor.h"*/
42 #define ALLOW_ONLY_PROTEIN 0 // DD
47 * @brief Stripped down version of squid's alistat
51 * The alignment to analyse
52 * @param[in] bSampling
53 * For many sequences: samples from pool
54 * @param[in] bReportAll
55 * Report identities for all sequence pairs
57 * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive
60 AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {
63 * bSampling = squid's do_fast
64 * bReportAll = squid's allreport
66 float **ppdIdentMx; /* identity matrix (squid: imx) */
67 const int iNumSample = 1000; /* sample size (squid: nsample) */
70 MSA *msa; /* squid's alignment structure */
74 float worst_worst, worst_best, best_best;
77 int nres; /* number of residues */
79 if (bSampling && bReportAll) {
81 "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
84 if (FALSE == prMSeq->aligned) {
86 "Sequences are not aligned. Skipping %s()", __FUNCTION__);
90 /* silence gcc warnings about uninitialized variables
92 worst_worst = worst_best = best_best = 0.0;
98 * FIXME code overlap with WriteAlignment. Make it a function and take
99 * code there (contains more comments) as template
102 msa = MSAAlloc(prMSeq->nseqs,
103 /* derive alignment length from first seq */
104 strlen(prMSeq->seq[0]));
105 for (i=0; i<prMSeq->nseqs; i++) {
106 int key; /* MSA struct internal index for sequence */
107 char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
108 char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
109 SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence name */
111 key = GKIStoreKey(msa->index, this_name);
112 msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
113 /* setting msa->sqlen[idx] and msa->aseq[idx] */
114 msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
115 this_seq, strlen(this_seq));
116 if (this_sqinfo->flags & SQINFO_DESC) {
117 MSASetSeqDescription(msa, key, this_sqinfo->desc);
126 for (i = 0; i < msa->nseq; i++) {
127 int rlen; /* raw sequence length */
128 rlen = DealignedLength(msa->aseq[i]);
130 if (small == -1 || rlen < small) small = rlen;
131 if (large == -1 || rlen > large) large = rlen;
136 avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
137 msa->nseq, iNumSample);
142 /* this might be slow...could use openmp inside squid */
143 MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
145 printf(" %-15s %5s %7s %-15s %7s %-15s\n",
146 "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
147 printf(" --------------- ----- ------- --------------- ------- ---------------\n");
154 for (i = 0; i < msa->nseq; i++) {
157 for (j = 0; j < msa->nseq; j++) {
158 /* closest seq to this one = best */
159 if (i != j && ppdIdentMx[i][j] > best) {
160 best = ppdIdentMx[i][j]; bestj = j;
162 if (ppdIdentMx[i][j] < worst) {
163 worst = ppdIdentMx[i][j]; worstj = j;
168 printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
169 msa->sqname[i], DealignedLength(msa->aseq[i]),
170 best * 100., msa->sqname[bestj],
171 worst * 100., msa->sqname[worstj]);
173 if (best > best_best) best_best = best;
174 if (best < worst_best) worst_best = best;
175 if (worst < worst_worst) worst_worst = worst;
176 for (j = 0; j < i; j++)
177 sum += ppdIdentMx[i][j];
179 avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0);
182 FMX2Free(ppdIdentMx);
183 } /* else bSampling */
189 if (msa->name != NULL)
190 printf("Alignment name: %s\n", msa->name);
191 /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/
192 printf("Number of sequences: %d\n", msa->nseq);
193 printf("Total # residues: %d\n", nres);
194 printf("Smallest: %d\n", small);
195 printf("Largest: %d\n", large);
196 printf("Average length: %.1f\n", (float) nres / (float) msa->nseq);
197 printf("Alignment length: %d\n", msa->alen);
198 printf("Average identity: %.2f%%\n", 100.*avgid);
201 printf("Most related pair: %.2f%%\n", 100.*best_best);
202 printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
203 printf("Most distant seq: %.2f%%\n", 100.*worst_best);
208 cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
214 /* end of AliStat() */
221 * @brief Shuffle mseq order
224 * mseq structure to shuffle
228 ShuffleMSeq(mseq_t *mseq)
234 /* quick and dirty: create an array with permuted indices and
235 * swap accordingly (which amounts to shuffling twice)
238 PermutationArray(&piPermArray, mseq->nseqs);
240 for (iSeqIndex=0; iSeqIndex<mseq->nseqs; iSeqIndex++) {
241 SeqSwap(mseq, piPermArray[iSeqIndex], iSeqIndex);
246 /*** end: ShuffleMSeq() ***/
250 * @brief Swap two sequences in an mseq_t structure.
253 * Multiple sequence struct
255 * Index of first sequence
257 * Index of seconds sequence
262 SeqSwap(mseq_t *prMSeq, int i, int j)
267 assert(NULL!=prMSeq);
268 assert(i<prMSeq->nseqs && j<prMSeq->nseqs);
274 pcTmp = prMSeq->seq[i];
275 prMSeq->seq[i] = prMSeq->seq[j];
276 prMSeq->seq[j] = pcTmp;
278 pcTmp = prMSeq->orig_seq[i];
279 prMSeq->orig_seq[i] = prMSeq->orig_seq[j];
280 prMSeq->orig_seq[j] = pcTmp;
282 rSqinfoTmp = prMSeq->sqinfo[i];
283 prMSeq->sqinfo[i] = prMSeq->sqinfo[j];
284 prMSeq->sqinfo[j] = rSqinfoTmp;
288 /*** end: SeqSwap() ***/
294 * @brief Dealigns all sequences in mseq structure, updates the
295 * sequence length info and sets aligned to FALSE
298 * The mseq structure to dealign
302 DealignMSeq(mseq_t *mseq)
305 for (i=0; i<mseq->nseqs; i++) {
306 DealignSeq(mseq->seq[i]);
307 mseq->sqinfo[i].len = strlen(mseq->seq[i]);
309 mseq->aligned = FALSE;
313 /*** end: DealinMSeq() ***/
318 * @brief debug output of sqinfo struct
321 * Squid's SQINFO struct for a certain seqeuence
323 * @note useful for debugging only
327 LogSqInfo(SQINFO *sqinfo)
330 /*LOG_DEBUG("sqinfo->flags = %d", sqinfo->flags);*/
331 if (sqinfo->flags & SQINFO_NAME)
332 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->name = %s", sqinfo->name);
334 if (sqinfo->flags & SQINFO_ID)
335 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->id = %s", sqinfo->id);
337 if (sqinfo->flags & SQINFO_ACC)
338 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->acc = %s", sqinfo->acc);
340 if (sqinfo->flags & SQINFO_DESC)
341 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->desc = %s", sqinfo->desc);
343 if (sqinfo->flags & SQINFO_LEN)
344 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->len = %d", sqinfo->len);
346 if (sqinfo->flags & SQINFO_START)
347 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->start = %d", sqinfo->start);
349 if (sqinfo->flags & SQINFO_STOP)
350 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->stop = %d", sqinfo->stop);
352 if (sqinfo->flags & SQINFO_OLEN)
353 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->olen = %d", sqinfo->olen);
355 if (sqinfo->flags & SQINFO_TYPE)
356 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->type = %d", sqinfo->type);
358 if (sqinfo->flags & SQINFO_SS)
359 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->ss = %s", sqinfo->ss);
361 if (sqinfo->flags & SQINFO_SA)
362 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->sa = %s", sqinfo->sa);
364 /*** end: log_sqinfo ***/
368 * @brief convert int-encoded iSeqType to string
370 * @param[in] iSeqType int-encoded sequence type
372 * @return character pointer describing the sequence type
376 SeqTypeToStr(int iSeqType)
383 case SEQTYPE_PROTEIN:
385 case SEQTYPE_UNKNOWN:
388 Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
390 return "Will never get here";
392 /*** end: SeqTypeToStr ***/
397 * @brief reads sequences from file
400 * Multiple sequence struct. Must be preallocated.
401 * FIXME: would make more sense to allocate it here.
403 * Sequence file name. If '-' sequence will be read from stdin.
404 * @param[in] iSeqType
405 * int-encoded sequence type. Set to
406 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
407 * @param[in] iMaxNumSeq
408 * Return an error, if more than iMaxNumSeq have been read
409 * @param[in] iMaxSeqLen
410 * Return an error, if a seq longer than iMaxSeqLen has been read
412 * @return 0 on success, -1 on error
415 * - Depends heavily on squid
416 * - Sequence file format will be guessed
417 * - If supported by squid, gzipped files can be read as well.
420 ReadSequences(mseq_t *prMSeq, char *seqfile,
421 int iSeqType, int iSeqFmt, bool bIsProfile, bool bDealignInputSeqs,
422 int iMaxNumSeq, int iMaxSeqLen, char *pcHMMBatch)
424 SQFILE *dbfp; /* sequence file descriptor */
425 char *cur_seq = NULL;
426 SQINFO cur_sqinfo = {0};
427 int iSeqIdx; /* sequence counter */
428 int iSeqPos; /* sequence string position counter */
430 assert(NULL!=seqfile);
433 /* Try to work around inability to autodetect from a pipe or .gz:
434 * assume FASTA format
436 if (SQFILE_UNKNOWN == iSeqFmt &&
437 (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
438 iSeqFmt = SQFILE_FASTA;
441 /* Using squid routines to read input. taken from seqstat_main.c. we don't
442 * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
443 * etc. NOTE this also means we discard some information, e.g. when
444 * reading from and writing to a stockholm file, all extra MSA
445 * info/annotation will be lost.
449 if (NULL == (dbfp = SeqfileOpen(seqfile, iSeqFmt, NULL))) {
450 Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
455 /* FIXME squid's ReadSeq() will exit with fatal error if format is
456 * unknown. This will be a problem for a GUI. Same is true for many squid
459 * The original squid:ReadSeq() dealigns sequences on input. We
460 * use a patched version.
463 while (ReadSeq(dbfp, dbfp->format,
467 if (prMSeq->nseqs+1>iMaxNumSeq) {
468 Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
469 iMaxNumSeq, cur_sqinfo.name, seqfile);
472 if ((int)strlen(cur_seq)>iMaxSeqLen) {
473 Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
474 cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen);
477 if ((int)strlen(cur_seq)==0) {
478 Log(&rLog, LOG_ERROR, "Sequence '%s' has 0 residues",
483 /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
486 prMSeq->seq = (char **)
487 CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
488 prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);
491 prMSeq->sqinfo = (SQINFO *)
492 CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
493 memset(&prMSeq->sqinfo[prMSeq->nseqs], 0, sizeof(SQINFO));
494 SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);
497 Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
498 LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
500 /* always guess type from first seq. use squid function and
503 if (0 == prMSeq->nseqs) {
504 int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
507 prMSeq->seqtype = SEQTYPE_DNA;
510 prMSeq->seqtype = SEQTYPE_RNA;
513 prMSeq->seqtype = SEQTYPE_PROTEIN;
516 prMSeq->seqtype = SEQTYPE_UNKNOWN;
519 Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
522 /* override with given sequence type but check with
523 * automatically detected type and warn if necessary
525 if (SEQTYPE_UNKNOWN != iSeqType) {
526 if (prMSeq->seqtype != iSeqType) {
527 Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
528 SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(iSeqType));
529 prMSeq->seqtype = iSeqType;
532 /* if type could not be determined and was not set return error */
533 if (SEQTYPE_UNKNOWN == iSeqType && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
534 Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
535 FreeSequence(cur_seq, &cur_sqinfo);
541 Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
542 prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
543 prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
544 prMSeq->seq[prMSeq->nseqs]);
546 /* FIXME IPUAC and/or case conversion? If yes see
547 * corresponding squid functions. Special treatment of
548 * Stockholm tilde-gaps for ktuple code?
553 FreeSequence(cur_seq, &cur_sqinfo);
557 /*#if ALLOW_ONLY_PROTEIN
558 if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
559 Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
560 SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
564 /* Check if sequences are aligned */
565 prMSeq->aligned = SeqsAreAligned(prMSeq, bIsProfile, bDealignInputSeqs);
568 /* keep original sequence as copy and convert "working" sequence
571 prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
572 for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
574 prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);
577 /* convert unknown characters according to set seqtype
578 * be conservative, i.e. don't allow any fancy ambiguity
579 * characters to make sure that ktuple code etc. works.
582 /* first on the fly conversion between DNA and RNA
584 if (prMSeq->seqtype==SEQTYPE_DNA)
585 ToDNA(prMSeq->seq[iSeqIdx]);
586 if (prMSeq->seqtype==SEQTYPE_RNA)
587 ToRNA(prMSeq->seq[iSeqIdx]);
589 /* then check of each character
591 for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) {
592 char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
596 if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
597 if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) {
598 *res = AMINOACID_ANY;
600 } else if (prMSeq->seqtype==SEQTYPE_DNA) {
601 if (NULL == strchr(DNA_ALPHABET, toupper(*res))) {
602 *res = NUCLEOTIDE_ANY;
604 } else if (prMSeq->seqtype==SEQTYPE_RNA) {
605 if (NULL == strchr(RNA_ALPHABET, toupper(*res))) {
606 *res = NUCLEOTIDE_ANY;
612 /* order in which sequences appear in guide-tree
613 * only allocate if different output-order desired */
614 prMSeq->tree_order = NULL;
616 prMSeq->filename = CkStrdup(seqfile);
617 Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
618 prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);
620 prMSeq->pppcHMMBNames = NULL;
621 prMSeq->ppiHMMBindex = NULL;
623 /* read HMM-batch file if existent */
624 if (NULL != pcHMMBatch) {
626 enum {MAXLINE=10000};
627 FILE *pfHMMBatch = NULL;
628 char zcScanline[MAXLINE] = {0};
629 char *pcToken = NULL;
630 char *pcSeqName = NULL;
633 /* check that file exists */
634 if (NULL == (pfHMMBatch = fopen(pcHMMBatch, "r"))){
635 Log(&rLog, LOG_ERROR, "Failed to open HMM-batch file %s for reading", pcHMMBatch);
639 /* initialise names and indices */
640 prMSeq->pppcHMMBNames = (char ***)CKMALLOC(prMSeq->nseqs * sizeof(char **));
641 for (iSeq = 0; iSeq < prMSeq->nseqs; iSeq++){
642 prMSeq->pppcHMMBNames[iSeq] = NULL;
644 prMSeq->ppiHMMBindex = (int **)CKMALLOC(prMSeq->nseqs * sizeof(int *));
645 for (iSeq = 0; iSeq < prMSeq->nseqs; iSeq++){
646 prMSeq->ppiHMMBindex[iSeq] = (int *)CKMALLOC(1 * sizeof(int));
647 prMSeq->ppiHMMBindex[iSeq][0] = -1;
650 /* read batch file line-by-line */
651 while (NULL != fgets(zcScanline, MAXLINE, pfHMMBatch)){
653 pcToken = strtok(zcScanline, " \040\t\n");
654 if (NULL == pcToken){
660 /* identify sequence label from batch file in labels read from sequence file */
661 for (iSeq = 0; iSeq < prMSeq->nseqs; iSeq++){
664 if (0 == strcmp(pcSeqName, prMSeq->sqinfo[iSeq].name)){
666 while (NULL != (pcToken = strtok(NULL, " \040\t\n"))){
667 prMSeq->pppcHMMBNames[iSeq] = (char **)CKREALLOC(prMSeq->pppcHMMBNames[iSeq],
668 (iHMM+2) * sizeof(char *));
669 prMSeq->pppcHMMBNames[iSeq][iHMM] = CkStrdup(pcToken);
670 prMSeq->ppiHMMBindex[iSeq] = (int *)CKREALLOC(prMSeq->ppiHMMBindex[iSeq],
671 (iHMM+2) * sizeof(int));
672 prMSeq->ppiHMMBindex[iSeq][iHMM] = 0;
674 prMSeq->pppcHMMBNames[iSeq][iHMM] = NULL;
675 prMSeq->ppiHMMBindex[iSeq][iHMM] = 0;
680 } /* 0 <= iSeq < prMSeq->nseqs */
681 if (iSeq >= prMSeq->nseqs) {
683 "sequence %s not found in input sequences (%s), will be ignored",
689 fclose(pfHMMBatch); pfHMMBatch = NULL;
691 } /* there was a HMM batch file */
693 prMSeq->pppcHMMBNames = NULL;
694 prMSeq->ppiHMMBindex = NULL;
695 } /* there was no HMM batch file */
699 /*** end: ReadSequences ***/
703 * @brief allocate and initialise new mseq_t
706 * newly allocated and initialised mseq_t
708 * @note caller has to free by calling FreeMSeq()
714 NewMSeq(mseq_t **prMSeq)
716 *prMSeq = (mseq_t *) CKMALLOC(1 * sizeof(mseq_t));
718 (*prMSeq)->nseqs = 0;
719 (*prMSeq)->seq = NULL;
720 (*prMSeq)->orig_seq = NULL;
721 (*prMSeq)->seqtype = SEQTYPE_UNKNOWN;
722 (*prMSeq)->sqinfo = NULL;
723 (*prMSeq)->filename = NULL;
724 (*prMSeq)->tree_order = NULL;
725 (*prMSeq)->pppcHMMBNames = NULL;
726 (*prMSeq)->ppiHMMBindex = NULL;
728 /*** end: NewMSeq ***/
733 * @brief copies an mseq structure
735 * @param[out] prMSeqDest_p
736 * Copy of mseq structure
737 * @param[in] prMSeqSrc
738 * Source mseq structure to copy
740 * @note caller has to free copy by calling FreeMSeq()
744 CopyMSeq(mseq_t **prMSeqDest_p, mseq_t *prMSeqSrc)
747 assert(prMSeqSrc != NULL && prMSeqDest_p != NULL);
749 NewMSeq(prMSeqDest_p);
751 (*prMSeqDest_p)->nseqs = prMSeqSrc->nseqs;
752 (*prMSeqDest_p)->seqtype = prMSeqSrc->seqtype;
753 if (prMSeqSrc->filename!=NULL) {
754 (*prMSeqDest_p)->filename = CkStrdup(prMSeqSrc->filename);
757 (*prMSeqDest_p)->seq = (char **)
758 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
759 (*prMSeqDest_p)->orig_seq = (char **)
760 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
761 (*prMSeqDest_p)->sqinfo = (SQINFO *)
762 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(SQINFO));
766 for (i=0; i<(*prMSeqDest_p)->nseqs; i++) {
767 (*prMSeqDest_p)->seq[i] = CkStrdup(prMSeqSrc->seq[i]);
768 (*prMSeqDest_p)->orig_seq[i] = CkStrdup(prMSeqSrc->orig_seq[i]);
769 SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[i], &prMSeqSrc->sqinfo[i]);
772 /*** end: CopyMSeq ***/
780 * The sequence name to search for
782 * The multiple sequence structure to search in
784 * @return -1 on failure, sequence index of matching name otherwise
786 * @warning If sequence name happens to be used twice, only the first
787 * one will be reported back
791 FindSeqName(char *seqname, mseq_t *mseq)
797 for (i=0; i<mseq->nseqs; i++) {
798 if (STR_EQ(mseq->sqinfo[i].name, seqname)) {
805 /*** end: FindSeqName() ***/
809 * @brief Frees an mseq_t and it's members and zeros all members
811 * @param[in] mseq mseq_to to free
813 * @note use in conjunction with NewMSeq()
817 FreeMSeq(mseq_t **mseq)
825 if ((*mseq)->filename) {
826 (*mseq)->filename = CKFREE((*mseq)->filename);
829 for (i=0; i<(*mseq)->nseqs; i++) {
830 FreeSequence((*mseq)->seq[i], &(*mseq)->sqinfo[i]);
831 CKFREE((*mseq)->orig_seq[i]);
834 CKFREE((*mseq)->seq);
836 if ((*mseq)->orig_seq) { /* FIXME (FS): only ptr to ptr freed, actual sequences NOT freed*/
837 CKFREE((*mseq)->orig_seq);
839 if ((*mseq)->sqinfo) {
840 CKFREE((*mseq)->sqinfo);
843 /* FIXME (FS): problem with freeing tree_order */
844 if ((*mseq)->tree_order){
845 CKFREE((*mseq)->tree_order);
848 if (NULL != (*mseq)->pppcHMMBNames){ /* FS, r291 -> */
849 for (i = 0; (*mseq)->pppcHMMBNames[i] && (i < (*mseq)->nseqs); i++){
851 for (iIter = 0; NULL != (*mseq)->pppcHMMBNames[i][iIter]; iIter++){
852 CKFREE((*mseq)->pppcHMMBNames[i][iIter]);
856 (*mseq)->seqtype = SEQTYPE_UNKNOWN;
861 /*** end: FreeMSeq ***/
865 * @brief Write alignment to file.
868 * The mseq_t struct containing the aligned sequences
869 * @param[in] pcAlnOutfile
870 * The name of the output file
872 * The alignment output format (defined in squid.h)
874 * length of line for Clustal/Fasta format
876 * @return Non-zero on error
878 * @note We create a temporary squid MSA struct in here because we never
879 * use it within clustal. We might be better of using the old clustal
880 * output routines instead.
884 WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt, int iWrap, bool bResno)
887 MSA *msa; /* squid's alignment structure */
889 int key; /* MSA struct internal index for sequence */
890 int alen; /* alignment length */
895 if (MSAFILE_UNKNOWN == outfmt) {
896 Log(&rLog, LOG_ERROR, "Unknown output format chosen");
900 if (NULL == pcAlnOutfile) {
905 if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
906 Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
912 /* derive alignment length from first seq */
913 alen = strlen(mseq->seq[0]);
915 msa = MSAAlloc(mseq->nseqs, alen);
917 /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
918 * we actually create a copy of mseq. keeping the pointers becomes
919 * messy when calling MSAFree()
921 for (i=0; i<mseq->nseqs; i++) {
922 char *this_name = NULL; /* mseq sequence name */
923 char *this_seq = NULL; /* mseq sequence */
924 SQINFO *this_sqinfo = NULL; /* mseq sequence name */
927 /* mseq->tree_order encodes to order in which sequences are listed in the guide-tree,
928 if the user wants the sequence output in the input-order then mseq->tree_order==NULL,
929 otherwise mseq->tree_order!=NULL, containing the indices of the sequences, FS, r274 -> */
930 iI = (NULL == mseq->tree_order) ? i : mseq->tree_order[i];
932 this_name = mseq->sqinfo[iI].name; /* mseq sequence name */
933 this_seq = mseq->seq[iI]; /* mseq sequence */
934 this_sqinfo = &mseq->sqinfo[iI]; /* mseq sequence name */
936 key = GKIStoreKey(msa->index, this_name);
937 msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
939 /* setting msa->sqlen[idx] and msa->aseq[idx] */
940 msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
941 this_seq, strlen(this_seq));
943 if (this_sqinfo->flags & SQINFO_DESC) {
944 /* FIXME never get here ... */
945 MSASetSeqDescription(msa, key, this_sqinfo->desc);
947 /* FIXME extend this by copying more stuff according to flags.
948 * See MSAFileRead() in msa.c and used functions there
950 * Problem is that we never parse MSA information as we use squid'sSeqFile
955 } /* 0 <= i < mseq->nseqs */
958 /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
959 * will die on error. Need to implement our own version
965 /* The below is copy of MSAFileWrite() which originally only writes to stdout.
968 /* Be sloppy and make a2m and fasta the same. same for vienna (which is
969 the same). same same. can can. boleh boleh */
970 if (outfmt==SQFILE_FASTA)
971 outfmt = MSAFILE_A2M;
972 if (outfmt==SQFILE_VIENNA)
973 outfmt = MSAFILE_VIENNA;
977 /*WriteA2M(pfOut, msa, 0);*/
978 WriteA2M(pfOut, msa, iWrap);
981 /*WriteA2M(pfOut, msa, 1);*/
982 WriteA2M(pfOut, msa, INT_MAX);
984 case MSAFILE_CLUSTAL:
985 WriteClustal(pfOut, msa, iWrap, TRUE==bResno ? 1 : 0, mseq->seqtype);
988 WriteMSF(pfOut, msa);
991 WritePhylip(pfOut, msa);
994 WriteSELEX(pfOut, msa);
996 case MSAFILE_STOCKHOLM:
997 WriteStockholm(pfOut, msa);
1000 Log(&rLog, LOG_FATAL, "internal error: %s",
1001 "invalid output format should have been detected before");
1004 if (use_stdout == FALSE) {
1005 (void) fclose(pfOut);
1006 Log(&rLog, LOG_INFO,
1007 "Alignment written to %s", pcAlnOutfile);
1013 /*** end of WriteAlignment() ***/
1017 * @brief Removes all gap-characters from a sequence.
1020 * Sequence to dealign
1022 * @note seq will not be reallocated
1025 DealignSeq(char *seq)
1033 for (aln_pos=0; aln_pos<(int)strlen(seq); aln_pos++) {
1034 if (! isgap(seq[aln_pos])) {
1035 seq[dealn_pos++] = seq[aln_pos];
1038 seq[dealn_pos] = '\0';
1042 /*** end: DealignSeq() ***/
1047 * @brief Sort sequences by length
1049 * @param[out] prMSeq
1050 * mseq to sort by length
1051 * @param[out] cOrder
1052 * Sorting order. 'd' for descending, 'a' for ascending.
1057 SortMSeqByLength(mseq_t *prMSeq, const char cOrder)
1062 mseq_t *prMSeqCopy = NULL;
1064 assert('a'==cOrder || 'd'==cOrder);
1066 Log(&rLog, LOG_WARN,
1067 "FIXME: This modifies sequence ordering. Might not be what user wants. Will change output order as well");
1069 piSeqLen = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
1070 piOrder = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
1071 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
1072 piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
1074 QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, cOrder, FALSE);
1076 CopyMSeq(&prMSeqCopy, prMSeq);
1077 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
1080 CKFREE(prMSeq->seq[iSeqIndex]);
1081 prMSeq->seq[iSeqIndex] = CkStrdup(prMSeqCopy->seq[piOrder[iSeqIndex]]);
1083 CKFREE(prMSeq->orig_seq[iSeqIndex]);
1084 prMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeqCopy->orig_seq[piOrder[iSeqIndex]]);
1086 SeqinfoCopy(&prMSeq->sqinfo[iSeqIndex], &prMSeqCopy->sqinfo[piOrder[iSeqIndex]]);
1091 FreeMSeq(&prMSeqCopy);
1095 /*** end: SortMSeqByLength() ***/
1100 * @brief Checks if sequences in given mseq structure are aligned. By
1101 * definition this is only true, if sequences are of the same length
1102 * and at least one gap was found
1105 * Sequences to check
1107 * @return TRUE if sequences are aligned, FALSE if not
1112 SeqsAreAligned(mseq_t *prMSeq, bool bIsProfile, bool bDealignInputSeqs)
1114 bool bGapFound, bSameLength;
1115 int iSeqIdx; /* sequence counter */
1116 int iSeqPos; /* sequence string position counter */
1118 /* Special case of just one sequence:
1119 * it is arguable that a single sequence qualifies as a profile,
1120 * however, this is what we do at the first stage of MSA anyway.
1121 * So, if there is only 1 sequence it is a 1-profile
1122 * and it is (defined to be) aligned (with itself). FS, r240 -> 241
1124 if (1 == prMSeq->nseqs) {
1129 /* Check if sequences are aligned. For being aligned, the
1130 * sequences have to be of same length (bSameLength) and at least
1131 * one of them has to contain at least one gap (bGapFound)
1135 for (iSeqIdx=0; (iSeqIdx < prMSeq->nseqs); iSeqIdx++) {
1136 if ( (FALSE == bGapFound) ){
1138 iSeqPos<prMSeq->sqinfo[iSeqIdx].len && false==bGapFound;
1140 if (isgap(prMSeq->seq[iSeqIdx][iSeqPos])) {
1142 /* skip rest of sequence */
1146 } /* gap not (yet) found */
1149 if (prMSeq->sqinfo[iSeqIdx].len != prMSeq->sqinfo[iSeqIdx-1].len) {
1150 bSameLength = FALSE;
1151 /* no need to continue search, bSameLength==FALSE is
1152 * sufficient condition */
1156 } /* 0 <= iSeqIdx < prMSeq->nseqs */
1158 Log(&rLog, LOG_FORCED_DEBUG, "bSameLength=%d bGapFound=%d", bSameLength, bGapFound);
1162 if ( (TRUE == bSameLength) && ((TRUE == bGapFound) || (TRUE == bIsProfile)) ) {
1165 if ((FALSE == bSameLength) && (TRUE == bGapFound) && (FALSE == bDealignInputSeqs)){
1166 Log(&rLog, LOG_FORCED_DEBUG, "Potential Problem: Gaps encountered but not all sequences have same length, consider using --dealign");
1171 if (FALSE == bSameLength){
1172 /* if sequences don't have same lengths they can never be profile */
1173 if (TRUE == bGapFound){
1174 Log(&rLog, LOG_FORCED_DEBUG, "Potential Problem: sequences (N=%d) don't have same lengths but contain gaps, consider using --dealign", prMSeq->nseqs);
1178 else { /* here all sequences have same lengths */
1179 if (TRUE == bGapFound){
1180 /* if at least one sequence contains gaps (and all have the same lengths)
1181 then we can be sure it is a profile */
1184 /* here all sequences have same lengths but no sequences contain any gaps */
1185 else if (TRUE == bIsProfile){
1186 /* if the user says it is a profile then it is */
1196 /*** end: SeqsAreAligned() ***/
1201 * @brief Creates a new sequence entry and appends it to an existing mseq
1204 * @param[out] prMSeqDest_p
1205 * Already existing and initialised mseq structure
1206 * @param[in] pcSeqName
1207 * sequence name of the sequence to add
1208 * @param[in] pcSeqRes
1209 * the actual sequence (residues) to add
1211 * @note Don't forget to update the align and type flag if necessary!
1213 * FIXME allow adding of more features
1217 AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
1222 assert(NULL != prMSeqDest_p);
1223 assert(NULL != pcSeqName);
1224 assert(NULL != pcSeqRes);
1226 iSeqIdx = (*prMSeqDest_p)->nseqs;
1228 (*prMSeqDest_p)->seq = (char **)
1229 CKREALLOC((*prMSeqDest_p)->seq, (iSeqIdx+1) * sizeof(char *));
1230 (*prMSeqDest_p)->orig_seq = (char **)
1231 CKREALLOC((*prMSeqDest_p)->orig_seq, (iSeqIdx+1) * sizeof(char *));
1232 (*prMSeqDest_p)->sqinfo = (SQINFO *)
1233 CKREALLOC((*prMSeqDest_p)->sqinfo, (iSeqIdx+1) * sizeof(SQINFO));
1236 (*prMSeqDest_p)->seq[iSeqIdx] = CkStrdup(pcSeqRes);
1237 (*prMSeqDest_p)->orig_seq[iSeqIdx] = CkStrdup(pcSeqRes);
1239 /* should probably get ri of SqInfo altogether in the long run and just
1240 transfer the intersting members into our own struct
1242 sqinfo.flags = 0; /* init */
1244 sqinfo.len = strlen(pcSeqRes);
1245 sqinfo.flags |= SQINFO_LEN;
1247 /* name is an array of SQINFO_NAMELEN length */
1248 strncpy(sqinfo.name, pcSeqName, SQINFO_NAMELEN-1);
1249 sqinfo.name[SQINFO_NAMELEN-1] = '\0';
1250 sqinfo.flags |= SQINFO_NAME;
1252 SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iSeqIdx],
1255 (*prMSeqDest_p)->nseqs++;
1259 /* end of AddSeq() */
1265 * @brief Appends an mseq structure to an already existing one.
1266 * filename will be left untouched.
1268 * @param[in] prMSeqDest_p
1269 * MSeq structure to which to append to
1270 * @param[out] prMSeqToAdd
1271 * MSeq structure which is to append
1275 JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
1280 assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
1281 assert(NULL != prMSeqToAdd);
1283 if (0 == prMSeqToAdd->nseqs) {
1284 Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
1288 /* warn on seqtype mismatch and keep original seqtype */
1289 if ((*prMSeqDest_p)->seqtype != prMSeqToAdd->seqtype) {
1290 Log(&rLog, LOG_WARN, "Joining sequences of different type");
1293 /* leave filename as it is */
1296 * copy new seq/s, orig_seq/s, sqinfo/s
1298 iNewNSeq = (*prMSeqDest_p)->nseqs + prMSeqToAdd->nseqs;
1300 (*prMSeqDest_p)->seq = (char **)
1301 CKREALLOC((*prMSeqDest_p)->seq, iNewNSeq * sizeof(char *));
1303 (*prMSeqDest_p)->orig_seq = (char **)
1304 CKREALLOC((*prMSeqDest_p)->orig_seq, iNewNSeq * sizeof(char *));
1306 (*prMSeqDest_p)->sqinfo = (SQINFO *)
1307 CKREALLOC((*prMSeqDest_p)->sqinfo, iNewNSeq * sizeof(SQINFO));
1310 for (iSrcSeqIndex=0; iSrcSeqIndex < prMSeqToAdd->nseqs; iSrcSeqIndex++) {
1311 int iDstSeqIndex = (*prMSeqDest_p)->nseqs++;
1313 (*prMSeqDest_p)->seq[iDstSeqIndex] =
1314 CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
1316 (*prMSeqDest_p)->orig_seq[iDstSeqIndex] =
1317 CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
1319 SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iDstSeqIndex],
1320 & prMSeqToAdd->sqinfo[iSrcSeqIndex]);
1323 (*prMSeqDest_p)->nseqs = iNewNSeq;
1326 /* 2nd arg is bIsProfile, which when set TRUE skips
1327 * the check for gaps. here always check for gaps,
1328 * so set FALSE (main reason is that it is easier), FS, r282 -> */
1329 /* had a problem at this stage, therefore dispense with gap check, FS, r290 -> */
1330 /* 3rd argument is dealignment flag, do not dealign profiles */
1331 (*prMSeqDest_p)->aligned = SeqsAreAligned(*prMSeqDest_p, TRUE/*FALSE*/, FALSE);
1333 (*prMSeqDest_p)->aligned = TRUE;
1338 /*** end: JoinMSeqs() ***/