1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
13 * Compalign -- a program to compare two sequence alignments
14 * SRE, Tue Nov 3 07:38:03 1992
15 * RCS $Id: compalign_main.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
17 * incorporated into SQUID, Thu Jan 26 16:52:41 1995
19 * Usage: compalign <trusted-alignment> <test-alignment>
21 * Calculate the fractional "identity" between the trusted alignment
22 * and the test alignment. The two files must contain exactly the same
23 * sequences, in exactly the same order.
25 * The identity of the multiple sequence alignments is defined as
26 * the averaged identity over all N(N-1)/2 pairwise alignments.
28 * The fractional identity of two sets of pairwise alignments
29 * is in turn defined as follows (for aligned known sequences k1 and k2,
30 * and aligned test sequences t1 and t2):
32 * matched columns / total columns,
34 * where total columns = the total number of columns in
35 * which there is a valid (nongap) symbol in k1 or k2;
37 * matched columns = the number of columns in which one of the
40 * k1 and k2 both have valid symbols at a given column; t1 and t2
41 * have the same symbols aligned in a column of the t1/t2
44 * k1 has a symbol aligned to a gap in k2; that symbol in t1
45 * is also aligned to a gap;
47 * k2 has a symbol aligned to a gap in k1; that symbol in t2
48 * is also aligned to a gap.
50 * Because scores for all possible pairs are calculated, the
51 * algorithm is of order (N^2)L for N sequences of length L;
52 * large sequence sets will take a while.
54 * Sean Eddy, Tue Nov 3 07:46:59 1992
63 static char banner[] = "compalign - compare two multiple alignments";
65 static char usage[] = "\
66 Usage: compalign [-options] <trusted.ali> <test.ali>\n\
68 -c : only compare under marked #=CS consensus structure\n\
69 -h : print short help and usage info\n\
72 static char experts[] = "\
73 --informat <s> : specify that both alignments are in format <s> (MSF, for instance)\n\
74 --quiet : suppress verbose header (used in regression testing)\n\
77 struct opt_s OPTIONS[] = {
78 { "-c", TRUE, sqdARG_NONE },
79 { "-h", TRUE, sqdARG_NONE },
80 { "--informat", FALSE, sqdARG_STRING },
81 { "--quiet", FALSE, sqdARG_NONE },
83 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
87 main(int argc, char **argv)
89 char *kfile; /* name of file of trusted (known) alignment */
90 char *tfile; /* name of file of test alignment */
91 MSAFILE *kfp; /* open ptr into trusted (known) alignfile */
92 MSAFILE *tfp; /* open ptr into test alignment file */
93 int format; /* expected format of alignment files */
94 MSA *kmsa; /* a trusted (known) alignment */
95 MSA *tmsa; /* a test alignment */
96 char **kraw; /* dealigned trusted seqs */
97 char **traw; /* dealigned test sequences */
98 int idx; /* counter for sequences */
99 int apos; /* position in alignment */
100 float score; /* RESULT: score for the comparison */
102 int cs_only; /* TRUE to compare under #=CS annotation only */
103 int *ref = NULL; /* init only to silence gcc warning */
104 int be_quiet; /* TRUE to suppress verbose header */
110 /***********************************************
112 ***********************************************/
114 format = MSAFILE_UNKNOWN;
118 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
119 &optind, &optname, &optarg))
121 if (strcmp(optname, "-c") == 0) cs_only = TRUE;
122 else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE;
123 else if (strcmp(optname, "--informat") == 0) {
124 format = String2SeqfileFormat(optarg);
125 if (format == MSAFILE_UNKNOWN)
126 Die("unrecognized sequence file format \"%s\"", optarg);
127 if (! IsAlignmentFormat(format))
128 Die("%s is an unaligned format, can't read as an alignment", optarg);
130 else if (strcmp(optname, "-h") == 0) {
131 Banner(stdout, banner);
138 if (argc - optind != 2)
139 Die("Incorrect number of command line arguments.\n%s\n", usage);
141 kfile = argv[optind++];
142 tfile = argv[optind];
144 if (! be_quiet) Banner(stdout, banner);
146 /***********************************************
147 * Read in the alignments
148 * Capable of handling full Stockholm: >1 alignment/file
149 ***********************************************/
151 if ((kfp = MSAFileOpen(kfile, format, NULL)) == NULL)
152 Die("Trusted alignment file %s could not be opened for reading", kfile);
153 if ((tfp = MSAFileOpen(tfile, format, NULL)) == NULL)
154 Die("Test alignment file %s could not be opened for reading", tfile);
156 while ((kmsa = MSAFileRead(kfp)) != NULL)
158 if ((tmsa = MSAFileRead(tfp)) == NULL)
159 Die("Failed to get a test alignment to match with the trusted alignment");
161 /* test that they're the same! */
162 if (kmsa->nseq != tmsa->nseq)
163 Die("files %s and %s do not contain same number of seqs!\n", kfile, tfile);
165 for (idx = 0; idx < kmsa->nseq; idx++)
167 s2upper(kmsa->aseq[idx]);
168 s2upper(tmsa->aseq[idx]);
170 /* another sanity check */
171 for (idx = 0; idx < kmsa->nseq; idx++)
172 if (strcmp(kmsa->sqname[idx], tmsa->sqname[idx]) != 0)
173 Die("seqs in %s and %s don't seem to be in the same order\n (%s != %s)",
174 kfile, tfile, kmsa->sqname[idx], tmsa->sqname[idx]);
176 /* and *another* sanity check */
177 DealignAseqs(kmsa->aseq, kmsa->nseq, &kraw);
178 DealignAseqs(tmsa->aseq, tmsa->nseq, &traw);
179 for (idx = 0; idx < kmsa->nseq; idx++)
180 if (strcmp(kraw[idx], traw[idx]) != 0)
181 Die("raw seqs in %s and %s are not the same (died at %s, number %d)\n",
182 kfile, tfile, kmsa->sqname[idx], idx);
183 Free2DArray((void **) kraw, kmsa->nseq);
184 Free2DArray((void **) traw, tmsa->nseq);
188 if (kmsa->ss_cons == NULL)
189 Die("Trusted alignment %s has no consensus structure annotation\n -- can't use -c!\n",
191 ref = (int *) MallocOrDie (sizeof(int) * kmsa->alen);
192 for (apos = 0; apos < kmsa->alen; apos++)
193 ref[apos] = (isgap(kmsa->ss_cons[apos])) ? FALSE : TRUE;
196 /***********************************************
197 * Compare the alignments, print results
198 ***********************************************/
201 score = CompareRefMultAlignments(ref, kmsa->aseq, tmsa->aseq, kmsa->nseq);
203 score = CompareMultAlignments(kmsa->aseq, tmsa->aseq, kmsa->nseq);
205 printf("Trusted alignment: %s\n", kmsa->name != NULL ? kmsa->name : kfile);
206 printf("Test alignment: %s\n", tmsa->name != NULL ? tmsa->name : tfile);
207 printf("Total sequences: %d\n", kmsa->nseq);
208 printf("Alignment identity: %.4f\n", score);
211 if (cs_only) free(ref);