1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: seq.c 245 2011-06-15 12:38:53Z fabian $
21 * Module for sequence/alignment IO and misc.
23 * This depends heavily on Sean Eddy's squid library, which is obsoleted by
24 * HMMER3's Easel. However, easel doesn't support that many non-aligned input
34 #include "squid/squid.h"
42 #define ALLOW_ONLY_PROTEIN 0 /* requested by Hamish, FS, r244 -> r245 */
48 * @brief Stripped down version of squid's alistat
52 * The alignment to analyse
53 * @param[in] bSampling
54 * For many sequences: samples from pool
55 * @param[in] bReportAll
56 * Report identities for all sequence pairs
58 * Don't have to worry about sequence case because our version of PairwiseIdentity is case insensitive
61 AliStat(mseq_t *prMSeq, bool bSampling, bool bReportAll) {
64 * bSampling = squid's do_fast
65 * bReportAll = squid's allreport
67 float **ppdIdentMx; /* identity matrix (squid: imx) */
68 const int iNumSample = 1000; /* sample size (squid: nsample) */
71 MSA *msa; /* squid's alignment structure */
75 float worst_worst, worst_best, best_best;
78 int nres; /* number of residues */
80 if (bSampling && bReportAll) {
82 "Cannot report all and sample at the same time. Skipping %s()", __FUNCTION__);
85 if (FALSE == prMSeq->aligned) {
87 "Sequences are not aligned. Skipping %s()", __FUNCTION__);
91 /* silence gcc warnings about uninitialized variables
93 worst_worst = worst_best = best_best = 0.0;
99 * FIXME code overlap with WriteAlignment. Make it a function and take
100 * code there (contains more comments) as template
103 msa = MSAAlloc(prMSeq->nseqs,
104 /* derive alignment length from first seq */
105 strlen(prMSeq->seq[0]));
106 for (i=0; i<prMSeq->nseqs; i++) {
107 int key; /* MSA struct internal index for sequence */
108 char *this_name = prMSeq->sqinfo[i].name; /* prMSeq sequence name */
109 char *this_seq = prMSeq->seq[i]; /* prMSeq sequence */
110 SQINFO *this_sqinfo = &prMSeq->sqinfo[i]; /* prMSeq sequence name */
112 key = GKIStoreKey(msa->index, this_name);
113 msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
114 /* setting msa->sqlen[idx] and msa->aseq[idx] */
115 msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
116 this_seq, strlen(this_seq));
117 if (this_sqinfo->flags & SQINFO_DESC) {
118 MSASetSeqDescription(msa, key, this_sqinfo->desc);
127 for (i = 0; i < msa->nseq; i++) {
128 int rlen; /* raw sequence length */
129 rlen = DealignedLength(msa->aseq[i]);
131 if (small == -1 || rlen < small) small = rlen;
132 if (large == -1 || rlen > large) large = rlen;
137 avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen,
138 msa->nseq, iNumSample);
143 /* this might be slow...could use openmp inside squid */
144 MakeIdentityMx(msa->aseq, msa->nseq, &ppdIdentMx);
146 printf(" %-15s %5s %7s %-15s %7s %-15s\n",
147 "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
148 printf(" --------------- ----- ------- --------------- ------- ---------------\n");
155 for (i = 0; i < msa->nseq; i++) {
158 for (j = 0; j < msa->nseq; j++) {
159 /* closest seq to this one = best */
160 if (i != j && ppdIdentMx[i][j] > best) {
161 best = ppdIdentMx[i][j]; bestj = j;
163 if (ppdIdentMx[i][j] < worst) {
164 worst = ppdIdentMx[i][j]; worstj = j;
169 printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
170 msa->sqname[i], DealignedLength(msa->aseq[i]),
171 best * 100., msa->sqname[bestj],
172 worst * 100., msa->sqname[worstj]);
174 if (best > best_best) best_best = best;
175 if (best < worst_best) worst_best = best;
176 if (worst < worst_worst) worst_worst = worst;
177 for (j = 0; j < i; j++)
178 sum += ppdIdentMx[i][j];
180 avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0);
183 FMX2Free(ppdIdentMx);
184 } /* else bSampling */
190 if (msa->name != NULL)
191 printf("Alignment name: %s\n", msa->name);
192 /*printf("Format: %s\n", SeqfileFormat2String(afp->format));*/
193 printf("Number of sequences: %d\n", msa->nseq);
194 printf("Total # residues: %d\n", nres);
195 printf("Smallest: %d\n", small);
196 printf("Largest: %d\n", large);
197 printf("Average length: %.1f\n", (float) nres / (float) msa->nseq);
198 printf("Alignment length: %d\n", msa->alen);
199 printf("Average identity: %.2f%%\n", 100.*avgid);
202 printf("Most related pair: %.2f%%\n", 100.*best_best);
203 printf("Most unrelated pair: %.2f%%\n", 100.*worst_worst);
204 printf("Most distant seq: %.2f%%\n", 100.*worst_best);
209 cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
215 /* end of AliStat() */
222 * @brief Shuffle mseq order
225 * mseq structure to shuffle
229 ShuffleMSeq(mseq_t *mseq)
235 /* quick and dirty: create an array with permuted indices and
236 * swap accordingly (which amounts to shuffling twice)
239 PermutationArray(&piPermArray, mseq->nseqs);
241 for (iSeqIndex=0; iSeqIndex<mseq->nseqs; iSeqIndex++) {
242 SeqSwap(mseq, piPermArray[iSeqIndex], iSeqIndex);
247 /*** end: ShuffleMSeq() ***/
251 * @brief Swap two sequences in an mseq_t structure.
254 * Multiple sequence struct
256 * Index of first sequence
258 * Index of seconds sequence
263 SeqSwap(mseq_t *prMSeq, int i, int j)
268 assert(NULL!=prMSeq);
269 assert(i<prMSeq->nseqs && j<prMSeq->nseqs);
275 pcTmp = prMSeq->seq[i];
276 prMSeq->seq[i] = prMSeq->seq[j];
277 prMSeq->seq[j] = pcTmp;
279 pcTmp = prMSeq->orig_seq[i];
280 prMSeq->orig_seq[i] = prMSeq->orig_seq[j];
281 prMSeq->orig_seq[j] = pcTmp;
283 rSqinfoTmp = prMSeq->sqinfo[i];
284 prMSeq->sqinfo[i] = prMSeq->sqinfo[j];
285 prMSeq->sqinfo[j] = rSqinfoTmp;
289 /*** end: SeqSwap() ***/
295 * @brief Dealigns all sequences in mseq structure, updates the
296 * sequence length info and sets aligned to FALSE
299 * The mseq structure to dealign
303 DealignMSeq(mseq_t *mseq)
306 for (i=0; i<mseq->nseqs; i++) {
307 DealignSeq(mseq->seq[i]);
308 mseq->sqinfo[i].len = strlen(mseq->seq[i]);
310 mseq->aligned = FALSE;
314 /*** end: DealinMSeq() ***/
319 * @brief debug output of sqinfo struct
322 * Squid's SQINFO struct for a certain seqeuence
324 * @note useful for debugging only
328 LogSqInfo(SQINFO *sqinfo)
331 /*LOG_DEBUG("sqinfo->flags = %d", sqinfo->flags);*/
332 if (sqinfo->flags & SQINFO_NAME)
333 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->name = %s", sqinfo->name);
335 if (sqinfo->flags & SQINFO_ID)
336 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->id = %s", sqinfo->id);
338 if (sqinfo->flags & SQINFO_ACC)
339 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->acc = %s", sqinfo->acc);
341 if (sqinfo->flags & SQINFO_DESC)
342 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->desc = %s", sqinfo->desc);
344 if (sqinfo->flags & SQINFO_LEN)
345 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->len = %d", sqinfo->len);
347 if (sqinfo->flags & SQINFO_START)
348 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->start = %d", sqinfo->start);
350 if (sqinfo->flags & SQINFO_STOP)
351 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->stop = %d", sqinfo->stop);
353 if (sqinfo->flags & SQINFO_OLEN)
354 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->olen = %d", sqinfo->olen);
356 if (sqinfo->flags & SQINFO_TYPE)
357 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->type = %d", sqinfo->type);
359 if (sqinfo->flags & SQINFO_SS)
360 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->ss = %s", sqinfo->ss);
362 if (sqinfo->flags & SQINFO_SA)
363 Log(&rLog, LOG_FORCED_DEBUG, "sqinfo->sa = %s", sqinfo->sa);
365 /*** end: log_sqinfo ***/
369 * @brief convert int-encoded seqtype to string
371 * @param[in] seqtype int-encoded sequence type
373 * @return character pointer describing the sequence type
377 SeqTypeToStr(int seqtype)
384 case SEQTYPE_PROTEIN:
386 case SEQTYPE_UNKNOWN:
389 Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
391 return "Will never get here";
393 /*** end: seqtype_to_str ***/
398 * @brief reads sequences from file
401 * Multiple sequence struct. Must be preallocated.
402 * FIXME: would make more sense to allocate it here.
404 * Sequence file name. If '-' sequence will be read from stdin.
406 * int-encoded sequence type. Set to
407 * SEQTYPE_UNKNOWN for autodetect (guessed from first sequence)
408 * @param[in] iMaxNumSeq
409 * Return an error, if more than iMaxNumSeq have been read
410 * @param[in] iMaxSeqLen
411 * Return an error, if a seq longer than iMaxSeqLen has been read
413 * @return 0 on success, -1 on error
416 * - Depends heavily on squid
417 * - Sequence file format will be guessed
418 * - If supported by squid, gzipped files can be read as well.
421 ReadSequences(mseq_t *prMSeq, char *seqfile, int seqtype,
422 int iMaxNumSeq, int iMaxSeqLen)
424 int fmt = SQFILE_UNKNOWN; /* file format: auto detect if unknown */
425 SQFILE *dbfp; /* sequence file descriptor */
428 int iSeqIdx; /* sequence counter */
429 int iSeqPos; /* sequence string position counter */
431 assert(NULL!=seqfile);
434 /* Try to work around inability to autodetect from a pipe or .gz:
435 * assume FASTA format
437 if (SQFILE_UNKNOWN == fmt &&
438 (Strparse("^.*\\.gz$", seqfile, 0) || strcmp(seqfile, "-") == 0)) {
442 /* Using squid routines to read input. taken from seqstat_main.c. we don't
443 * know if input is aligned, so we use SeqfileOpen instead of MSAFileOpen
444 * etc. NOTE this also means we discard some information, e.g. when
445 * reading from and writing to a stockholm file, all extra MSA
446 * info/annotation will be lost.
450 if (NULL == (dbfp = SeqfileOpen(seqfile, fmt, NULL))) {
451 Log(&rLog, LOG_ERROR, "Failed to open sequence file %s for reading", seqfile);
456 /* FIXME squid's ReadSeq() will exit with fatal error if format is
457 * unknown. This will be a problem for a GUI. Same is true for many squid
460 * The original squid:ReadSeq() dealigns sequences on input. We
461 * use a patched version.
464 while (ReadSeq(dbfp, dbfp->format,
468 if (prMSeq->nseqs+1>iMaxNumSeq) {
469 Log(&rLog, LOG_ERROR, "Maximum number of sequences (=%d) exceeded after reading sequence '%s' from '%s'",
470 iMaxNumSeq, cur_sqinfo.name, seqfile);
473 if ((int)strlen(cur_seq)>iMaxSeqLen) {
474 Log(&rLog, LOG_ERROR, "Sequence '%s' has %d residues and is therefore longer than allowed (max. sequence length is %d)",
475 cur_sqinfo.name, strlen(cur_seq), iMaxSeqLen);
479 /* FIXME: use modified version of AddSeq() that allows handing down SqInfo
482 prMSeq->seq = (char **)
483 CKREALLOC(prMSeq->seq, (prMSeq->nseqs+1) * sizeof(char *));
484 prMSeq->seq[prMSeq->nseqs] = CkStrdup(cur_seq);
487 prMSeq->sqinfo = (SQINFO *)
488 CKREALLOC(prMSeq->sqinfo, (prMSeq->nseqs+1) * sizeof(SQINFO));
489 SeqinfoCopy(&prMSeq->sqinfo[prMSeq->nseqs], &cur_sqinfo);
492 Log(&rLog, LOG_FORCED_DEBUG, "seq no %d: seq = %s", prMSeq->nseqs, prMSeq->seq[prMSeq->nseqs]);
493 LogSqInfo(&prMSeq->sqinfo[prMSeq->nseqs]);
495 /* always guess type from first seq. use squid function and
498 if (0 == prMSeq->nseqs) {
499 int type = Seqtype(prMSeq->seq[prMSeq->nseqs]);
502 prMSeq->seqtype = SEQTYPE_DNA;
505 prMSeq->seqtype = SEQTYPE_RNA;
508 prMSeq->seqtype = SEQTYPE_PROTEIN;
511 prMSeq->seqtype = SEQTYPE_UNKNOWN;
514 Log(&rLog, LOG_FATAL, "Internal error in %s", __FUNCTION__);
517 /* override with given sequence type but check with
518 * automatically detected type and warn if necessary
520 if (SEQTYPE_UNKNOWN != seqtype) {
521 if (prMSeq->seqtype != seqtype) {
522 Log(&rLog, LOG_WARN, "Overriding automatically determined seq-type %s to %s as requested",
523 SeqTypeToStr(prMSeq->seqtype), SeqTypeToStr(seqtype));
524 prMSeq->seqtype = seqtype;
527 /* if type could not be determined and was not set return error */
528 if (SEQTYPE_UNKNOWN == seqtype && SEQTYPE_UNKNOWN == prMSeq->seqtype) {
529 Log(&rLog, LOG_ERROR, "Couldn't guess sequence type from first sequence");
530 FreeSequence(cur_seq, &cur_sqinfo);
536 Log(&rLog, LOG_DEBUG, "seq-no %d: type=%s name=%s len=%d seq=%s",
537 prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype),
538 prMSeq->sqinfo[prMSeq->nseqs].name, prMSeq->sqinfo[prMSeq->nseqs].len,
539 prMSeq->seq[prMSeq->nseqs]);
541 /* FIXME IPUAC and/or case conversion? If yes see
542 * corresponding squid functions. Special treatment of
543 * Stockholm tilde-gaps for ktuple code?
548 FreeSequence(cur_seq, &cur_sqinfo);
552 #if ALLOW_ONLY_PROTEIN
553 if (SEQTYPE_PROTEIN != prMSeq->seqtype) {
554 Log(&rLog, LOG_FATAL, "Sequence type is %s. %s only works on protein.",
555 SeqTypeToStr(prMSeq->seqtype), PACKAGE_NAME);
559 /* Check if sequences are aligned */
560 prMSeq->aligned = SeqsAreAligned(prMSeq);
563 /* keep original sequence as copy and convert "working" sequence
566 prMSeq->orig_seq = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char *));
567 for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
569 prMSeq->orig_seq[iSeqIdx] = CkStrdup(prMSeq->seq[iSeqIdx]);
572 /* convert unknown characters according to set seqtype
573 * be conservative, i.e. don't allow any fancy ambiguity
574 * characters to make sure that ktuple code etc. works.
577 /* first on the fly conversion between DNA and RNA
579 if (prMSeq->seqtype==SEQTYPE_DNA)
580 ToDNA(prMSeq->seq[iSeqIdx]);
581 if (prMSeq->seqtype==SEQTYPE_RNA)
582 ToRNA(prMSeq->seq[iSeqIdx]);
584 /* then check of each character
586 for (iSeqPos=0; iSeqPos<(int)strlen(prMSeq->seq[iSeqIdx]); iSeqPos++) {
587 char *res = &(prMSeq->seq[iSeqIdx][iSeqPos]);
591 if (prMSeq->seqtype==SEQTYPE_PROTEIN) {
592 if (NULL == strchr(AMINO_ALPHABET, toupper(*res))) {
593 *res = AMINOACID_ANY;
595 } else if (prMSeq->seqtype==SEQTYPE_DNA) {
596 if (NULL == strchr(DNA_ALPHABET, toupper(*res))) {
597 *res = NUCLEOTIDE_ANY;
599 } else if (prMSeq->seqtype==SEQTYPE_RNA) {
600 if (NULL == strchr(RNA_ALPHABET, toupper(*res))) {
601 *res = NUCLEOTIDE_ANY;
608 prMSeq->filename = CkStrdup(seqfile);
609 Log(&rLog, LOG_INFO, "Read %d sequences (type: %s) from %s",
610 prMSeq->nseqs, SeqTypeToStr(prMSeq->seqtype), prMSeq->filename);
614 /*** end: ReadSequences ***/
618 * @brief allocate and initialise new mseq_t
621 * newly allocated and initialised mseq_t
623 * @note caller has to free by calling FreeMSeq()
629 NewMSeq(mseq_t **prMSeq)
631 *prMSeq = (mseq_t *) CKMALLOC(1 * sizeof(mseq_t));
633 (*prMSeq)->nseqs = 0;
634 (*prMSeq)->seq = NULL;
635 (*prMSeq)->orig_seq = NULL;
636 (*prMSeq)->seqtype = SEQTYPE_UNKNOWN;
637 (*prMSeq)->sqinfo = NULL;
638 (*prMSeq)->filename = NULL;
640 /*** end: NewMSeq ***/
645 * @brief copies an mseq structure
647 * @param[out] prMSeqDest_p
648 * Copy of mseq structure
649 * @param[in] prMSeqSrc
650 * Source mseq structure to copy
652 * @note caller has to free copy by calling FreeMSeq()
656 CopyMSeq(mseq_t **prMSeqDest_p, mseq_t *prMSeqSrc)
659 assert(prMSeqSrc != NULL && prMSeqDest_p != NULL);
661 NewMSeq(prMSeqDest_p);
663 (*prMSeqDest_p)->nseqs = prMSeqSrc->nseqs;
664 (*prMSeqDest_p)->seqtype = prMSeqSrc->seqtype;
665 if (prMSeqSrc->filename!=NULL) {
666 (*prMSeqDest_p)->filename = CkStrdup(prMSeqSrc->filename);
669 (*prMSeqDest_p)->seq = (char **)
670 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
671 (*prMSeqDest_p)->orig_seq = (char **)
672 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(char *));
673 (*prMSeqDest_p)->sqinfo = (SQINFO *)
674 CKMALLOC((*prMSeqDest_p)->nseqs * sizeof(SQINFO));
678 for (i=0; i<(*prMSeqDest_p)->nseqs; i++) {
679 (*prMSeqDest_p)->seq[i] = CkStrdup(prMSeqSrc->seq[i]);
680 (*prMSeqDest_p)->orig_seq[i] = CkStrdup(prMSeqSrc->orig_seq[i]);
681 SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[i], &prMSeqSrc->sqinfo[i]);
684 /*** end: CopyMSeq ***/
692 * The sequence name to search for
694 * The multiple sequence structure to search in
696 * @return -1 on failure, sequence index of matching name otherwise
698 * @warning If sequence name happens to be used twice, only the first
699 * one will be reported back
703 FindSeqName(char *seqname, mseq_t *mseq)
709 for (i=0; i<mseq->nseqs; i++) {
710 if (STR_EQ(mseq->sqinfo[i].name, seqname)) {
717 /*** end: FindSeqName() ***/
721 * @brief Frees an mseq_t and it's members and zeros all members
723 * @param[in] mseq mseq_to to free
725 * @note use in conjunction with NewMSeq()
729 FreeMSeq(mseq_t **mseq)
737 if ((*mseq)->filename) {
738 (*mseq)->filename = CKFREE((*mseq)->filename);
741 for (i=0; i<(*mseq)->nseqs; i++) {
742 FreeSequence((*mseq)->seq[i], &(*mseq)->sqinfo[i]);
743 CKFREE((*mseq)->orig_seq[i]);
746 CKFREE((*mseq)->seq);
748 if ((*mseq)->orig_seq) { /* FIXME (FS): only ptr to ptr freed, actual sequences NOT freed*/
749 CKFREE((*mseq)->orig_seq);
751 if ((*mseq)->sqinfo) {
752 CKFREE((*mseq)->sqinfo);
755 (*mseq)->seqtype = SEQTYPE_UNKNOWN;
760 /*** end: FreeMSeq ***/
764 * @brief Write alignment to file.
767 * The mseq_t struct containing the aligned sequences
768 * @param[in] pcAlnOutfile
769 * The name of the output file
771 * The alignment output format (defined in squid.h)
773 * @return Non-zero on error
775 * @note We create a temporary squid MSA struct in here because we never
776 * use it within clustal. We might be better of using the old clustal
777 * output routines instead.
781 WriteAlignment(mseq_t *mseq, const char *pcAlnOutfile, int outfmt)
784 MSA *msa; /* squid's alignment structure */
786 int key; /* MSA struct internal index for sequence */
787 int alen; /* alignment length */
792 if (MSAFILE_UNKNOWN == outfmt) {
793 Log(&rLog, LOG_ERROR, "Unknown output format chosen");
797 if (NULL == pcAlnOutfile) {
802 if (NULL == (pfOut = fopen(pcAlnOutfile, "w"))) {
803 Log(&rLog, LOG_ERROR, "Could not open file %s for writing", pcAlnOutfile);
809 /* derive alignment length from first seq */
810 alen = strlen(mseq->seq[0]);
812 msa = MSAAlloc(mseq->nseqs, alen);
814 /* basic structure borrowed code from squid-1.9g/a2m.c:ReadA2M()
815 * we actually create a copy of mseq. keeping the pointers becomes
816 * messy when calling MSAFree()
818 for (i=0; i<mseq->nseqs; i++) {
819 char *this_name = mseq->sqinfo[i].name; /* mseq sequence name */
820 char *this_seq = mseq->seq[i]; /* mseq sequence */
821 SQINFO *this_sqinfo = &mseq->sqinfo[i]; /* mseq sequence name */
823 key = GKIStoreKey(msa->index, this_name);
824 msa->sqname[key] = sre_strdup(this_name, strlen(this_name));
826 /* setting msa->sqlen[idx] and msa->aseq[idx] */
827 msa->sqlen[key] = sre_strcat(&(msa->aseq[key]), msa->sqlen[key],
828 this_seq, strlen(this_seq));
830 if (this_sqinfo->flags & SQINFO_DESC) {
831 /* FIXME never get here ... */
832 MSASetSeqDescription(msa, key, this_sqinfo->desc);
834 /* FIXME extend this by copying more stuff according to flags.
835 * See MSAFileRead() in msa.c and used functions there
837 * Problem is that we never parse MSA information as we use squid'sSeqFile
844 /* FIXME Would like to, but can't use MSAVerifyParse(msa) here, as it
845 * will die on error. Need to implement our own version
851 /* The below is copy of MSAFileWrite() which originally only writes to stdout.
854 /* Be sloppy and make a2m and fasta the same. same for vienna (which is
855 the same). same same. can can. boleh boleh */
856 if (outfmt==SQFILE_FASTA)
857 outfmt = MSAFILE_A2M;
858 if (outfmt==SQFILE_VIENNA)
859 outfmt = MSAFILE_VIENNA;
863 WriteA2M(pfOut, msa, 0);
866 WriteA2M(pfOut, msa, 1);
868 case MSAFILE_CLUSTAL:
869 WriteClustal(pfOut, msa);
872 WriteMSF(pfOut, msa);
875 WritePhylip(pfOut, msa);
878 WriteSELEX(pfOut, msa);
880 case MSAFILE_STOCKHOLM:
881 WriteStockholm(pfOut, msa);
884 Log(&rLog, LOG_FATAL, "internal error: %s",
885 "invalid output format should have been detected before");
888 if (use_stdout == FALSE) {
889 (void) fclose(pfOut);
891 "Alignment written to %s", pcAlnOutfile);
897 /*** end of WriteAlignment() ***/
901 * @brief Removes all gap-characters from a sequence.
904 * Sequence to dealign
906 * @note seq will not be reallocated
909 DealignSeq(char *seq)
917 for (aln_pos=0; aln_pos<(int)strlen(seq); aln_pos++) {
918 if (! isgap(seq[aln_pos])) {
919 seq[dealn_pos++] = seq[aln_pos];
922 seq[dealn_pos] = '\0';
926 /*** end: DealignSeq() ***/
931 * @brief Sort sequences by length
934 * mseq to sort by length
936 * Sorting order. 'd' for descending, 'a' for ascending.
941 SortMSeqByLength(mseq_t *prMSeq, const char cOrder)
946 mseq_t *prMSeqCopy = NULL;
948 assert('a'==cOrder || 'd'==cOrder);
951 "FIXME: This modifies sequence ordering. Might not be what user wants. Will change output order as well");
953 piSeqLen = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
954 piOrder = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
955 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
956 piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
958 QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, cOrder, FALSE);
960 CopyMSeq(&prMSeqCopy, prMSeq);
961 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
964 CKFREE(prMSeq->seq[iSeqIndex]);
965 prMSeq->seq[iSeqIndex] = CkStrdup(prMSeqCopy->seq[piOrder[iSeqIndex]]);
967 CKFREE(prMSeq->orig_seq[iSeqIndex]);
968 prMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeqCopy->orig_seq[piOrder[iSeqIndex]]);
970 SeqinfoCopy(&prMSeq->sqinfo[iSeqIndex], &prMSeqCopy->sqinfo[piOrder[iSeqIndex]]);
975 FreeMSeq(&prMSeqCopy);
979 /*** end: SortMSeqByLength() ***/
984 * @brief Checks if sequences in given mseq structure are aligned. By
985 * definition this is only true, if sequences are of the same length
986 * and at least one gap was found
991 * @return TRUE if sequences are aligned, FALSE if not
996 SeqsAreAligned(mseq_t *prMSeq)
998 bool bGapFound, bSameLength;
999 int iSeqIdx; /* sequence counter */
1000 int iSeqPos; /* sequence string position counter */
1002 /* Special case of just one sequence:
1003 * it is arguable that a single sequence qualifies as a profile,
1004 * however, this is what we do at the first stage of MSA anyway.
1005 * So, if there is only 1 sequence it is a 1-profile
1006 * and it is (defined to be) aligned (with itself). FS, r240 -> 241
1008 if (1 == prMSeq->nseqs) {
1013 /* Check if sequences are aligned. For being aligned, the
1014 * sequences have to be of same length (bSameLength) and at least
1015 * one of them has to contain at least one gap (bGapFound)
1019 for (iSeqIdx=0; iSeqIdx<prMSeq->nseqs; iSeqIdx++) {
1020 if (FALSE == bGapFound) {
1022 iSeqPos<prMSeq->sqinfo[iSeqIdx].len && false==bGapFound;
1024 if (isgap(prMSeq->seq[iSeqIdx][iSeqPos])) {
1026 /* skip rest of sequence */
1033 if (prMSeq->sqinfo[iSeqIdx].len != prMSeq->sqinfo[iSeqIdx-1].len) {
1034 bSameLength = FALSE;
1035 /* no need to continue search, bSameLength==FALSE is
1036 * sufficient condition */
1042 Log(&rLog, LOG_FORCED_DEBUG, "bSameLength=%d bGapFound=%d", bSameLength, bGapFound);
1044 if (TRUE == bSameLength && TRUE == bGapFound) {
1051 /*** end: SeqsAreAligned() ***/
1056 * @brief Creates a new sequence entry and appends it to an existing mseq
1059 * @param[out] prMSeqDest_p
1060 * Already existing and initialised mseq structure
1061 * @param[in] pcSeqName
1062 * sequence name of the sequence to add
1063 * @param[in] pcSeqRes
1064 * the actual sequence (residues) to add
1066 * @note Don't forget to update the align and type flag if necessary!
1068 * FIXME allow adding of more features
1072 AddSeq(mseq_t **prMSeqDest_p, char *pcSeqName, char *pcSeqRes)
1077 assert(NULL != prMSeqDest_p);
1078 assert(NULL != pcSeqName);
1079 assert(NULL != pcSeqRes);
1081 iSeqIdx = (*prMSeqDest_p)->nseqs;
1083 (*prMSeqDest_p)->seq = (char **)
1084 CKREALLOC((*prMSeqDest_p)->seq, (iSeqIdx+1) * sizeof(char *));
1085 (*prMSeqDest_p)->orig_seq = (char **)
1086 CKREALLOC((*prMSeqDest_p)->orig_seq, (iSeqIdx+1) * sizeof(char *));
1087 (*prMSeqDest_p)->sqinfo = (SQINFO *)
1088 CKREALLOC((*prMSeqDest_p)->sqinfo, (iSeqIdx+1) * sizeof(SQINFO));
1091 (*prMSeqDest_p)->seq[iSeqIdx] = CkStrdup(pcSeqRes);
1092 (*prMSeqDest_p)->orig_seq[iSeqIdx] = CkStrdup(pcSeqRes);
1094 /* should probably get ri of SqInfo altogether in the long run and just
1095 transfer the intersting members into our own struct
1097 sqinfo.flags = 0; /* init */
1099 sqinfo.len = strlen(pcSeqRes);
1100 sqinfo.flags |= SQINFO_LEN;
1102 /* name is an array of SQINFO_NAMELEN length */
1103 strncpy(sqinfo.name, pcSeqName, SQINFO_NAMELEN-1);
1104 sqinfo.name[SQINFO_NAMELEN-1] = '\0';
1105 sqinfo.flags |= SQINFO_NAME;
1107 SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iSeqIdx],
1110 (*prMSeqDest_p)->nseqs++;
1114 /* end of AddSeq() */
1120 * @brief Appends an mseq structure to an already existing one.
1121 * filename will be left untouched.
1123 * @param[in] prMSeqDest_p
1124 * MSeq structure to which to append to
1125 * @param[out] prMSeqToAdd
1126 * MSeq structure which is to append
1131 JoinMSeqs(mseq_t **prMSeqDest_p, mseq_t *prMSeqToAdd)
1136 assert(NULL != prMSeqDest_p && NULL != (*prMSeqDest_p));
1137 assert(NULL != prMSeqToAdd);
1139 if (0 == prMSeqToAdd->nseqs) {
1140 Log(&rLog, LOG_WARN, "Was asked to add 0 sequences");
1144 /* warn on seqtype mismatch and keep original seqtype */
1145 if ((*prMSeqDest_p)->seqtype != prMSeqToAdd->seqtype) {
1146 Log(&rLog, LOG_WARN, "Joining sequences of different type");
1149 /* leave filename as it is */
1152 * copy new seq/s, orig_seq/s, sqinfo/s
1154 iNewNSeq = (*prMSeqDest_p)->nseqs + prMSeqToAdd->nseqs;
1156 (*prMSeqDest_p)->seq = (char **)
1157 CKREALLOC((*prMSeqDest_p)->seq, iNewNSeq * sizeof(char *));
1159 (*prMSeqDest_p)->orig_seq = (char **)
1160 CKREALLOC((*prMSeqDest_p)->orig_seq, iNewNSeq * sizeof(char *));
1162 (*prMSeqDest_p)->sqinfo = (SQINFO *)
1163 CKREALLOC((*prMSeqDest_p)->sqinfo, iNewNSeq * sizeof(SQINFO));
1166 for (iSrcSeqIndex=0; iSrcSeqIndex < prMSeqToAdd->nseqs; iSrcSeqIndex++) {
1167 int iDstSeqIndex = (*prMSeqDest_p)->nseqs++;
1169 (*prMSeqDest_p)->seq[iDstSeqIndex] =
1170 CkStrdup(prMSeqToAdd->seq[iSrcSeqIndex]);
1172 (*prMSeqDest_p)->orig_seq[iDstSeqIndex] =
1173 CkStrdup(prMSeqToAdd->orig_seq[iSrcSeqIndex]);
1175 SeqinfoCopy(&(*prMSeqDest_p)->sqinfo[iDstSeqIndex],
1176 & prMSeqToAdd->sqinfo[iSrcSeqIndex]);
1179 (*prMSeqDest_p)->nseqs = iNewNSeq;
1181 (*prMSeqDest_p)->aligned = SeqsAreAligned(*prMSeqDest_p);
1185 /*** end: JoinMSeqs() ***/