1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * Fri Jan 27 10:41:41 1995
13 * CVS $Id: alistat_main.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
15 * Look at an alignment file, determine some simple statistics.
24 static char banner[] = "alistat - show some simple statistics on an alignment file";
26 static char usage[] = "\
27 Usage: alistat [-options] <alignment file>\n\
29 -a : report per-sequence info, not just a summary\n\
30 -f : fast: estimate average %id by sampling (not compatible with -a)\n\
31 -h : help: display usage and version\n\
32 -q : quiet: suppress verbose header\n\
35 static char experts[] = "\
37 --consensus <f>: write majority rule consensus sequence(s) in FASTA\n\
39 --identmx <f> : save a report on all NxN pairwise identities to file <f>\n\
40 --informat <s> : specify alignment file format <s>\n\
41 allowed formats: SELEX, MSF, Clustal, a2m, PHYLIP\n\
44 struct opt_s OPTIONS[] = {
45 { "-a", TRUE, sqdARG_NONE },
46 { "-f", TRUE, sqdARG_NONE },
47 { "-h", TRUE, sqdARG_NONE },
48 { "-q", TRUE, sqdARG_NONE },
49 { "--consensus", FALSE, sqdARG_STRING },
50 { "--identmx", FALSE, sqdARG_STRING },
51 { "--informat", FALSE, sqdARG_STRING },
53 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
56 main(int argc, char **argv)
58 char *afile; /* name of aligned sequence file */
59 MSAFILE *afp; /* pointer to open alignment file*/
60 MSA *msa; /* multiple sequence alignment */
61 int fmt; /* format of afile */
62 int rlen; /* raw sequence length */
63 int nres; /* number of residues */
64 float **imx; /* identity matrix */
68 float sum, best, worst;
69 float worst_worst, worst_best, best_best;
78 char *identmx_report; /* file to save identity matrix info to */
79 FILE *identmx_fp = NULL;
85 /* These inits are solely to silence gcc warnings about
86 * uninitialized variables
88 worst_worst = worst_best = best_best = 0.0;
91 /***********************************************
93 ***********************************************/
95 fmt = MSAFILE_UNKNOWN; /* by default, we autodetect file format */
100 identmx_report = NULL;
102 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
103 &optind, &optname, &optarg))
105 if (strcmp(optname, "-a") == 0) { allreport = TRUE; }
106 else if (strcmp(optname, "-f") == 0) { do_fast = TRUE; }
107 else if (strcmp(optname, "-q") == 0) { be_quiet = TRUE; }
108 else if (strcmp(optname, "--consensus") == 0) { consfile = optarg; }
109 else if (strcmp(optname, "--identmx") == 0) { identmx_report = optarg; }
110 else if (strcmp(optname, "--informat") == 0) {
111 fmt = String2SeqfileFormat(optarg);
112 if (fmt == MSAFILE_UNKNOWN)
113 Die("unrecognized sequence file format \"%s\"", optarg);
114 if (! IsAlignmentFormat(fmt))
115 Die("%s is an unaligned format, can't read as an alignment", optarg);
117 else if (strcmp(optname, "-h") == 0) {
118 Banner(stdout, banner);
125 if (argc - optind != 1) Die("Incorrect number of arguments.\n%s\n", usage);
126 afile = argv[optind];
128 if (do_fast && allreport)
129 Die("Verbose reports (-a, --identmx) are incompatible with fast sampling (-f)");
130 if (do_fast && identmx_report != NULL)
131 Die("Verbose reports (-a, --identmx) are incompatible with fast sampling (-f)");
134 Banner(stdout, banner);
136 /***********************************************
137 * Loop over every alignment in the file.
138 ***********************************************/
140 if ((afp = MSAFileOpen(afile, fmt, NULL)) == NULL)
141 Die("Alignment file %s could not be opened for reading", afile);
143 if (consfile != NULL && (consfp = fopen(consfile, "w")) == NULL)
144 Die("Failed to open consensus sequence file %s for writing", consfile);
146 if (identmx_report != NULL && (identmx_fp = fopen(identmx_report, "w")) == NULL)
147 Die("Failed to open identity matrix report file %s for writing", identmx_report);
149 while ((msa = MSAFileRead(afp)) != NULL)
151 for (i = 0; i < msa->nseq; i++) s2upper(msa->aseq[i]);
153 /* Statistics we always collect:
154 * unaligned sequence lengths; mean and range
158 for (i = 0; i < msa->nseq; i++)
160 rlen = DealignedLength(msa->aseq[i]);
162 if (small == -1 || rlen < small) small = rlen;
163 if (large == -1 || rlen > large) large = rlen;
166 /* Statistics we have to be careful about
167 * collecting, because of time constraints on NxN operations
172 avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen, msa->nseq,
177 /* In a full report, for each sequence, find the best relative,
178 * and the worst relative. For overall statistics, save the
179 * worst best (most distant single seq) and the best best
180 * (most closely related pair) and the worst worst (most
181 * distantly related pair) and yes, I know it's confusing.
184 MakeIdentityMx(msa->aseq, msa->nseq, &imx);
186 printf(" %-15s %5s %7s %-15s %7s %-15s\n",
187 "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)");
188 printf(" --------------- ----- ------- --------------- ------- ---------------\n");
191 /* Print the identity matrix report: one line per pair of sequences.
193 if (identmx_report != NULL)
195 for (i = 0; i < msa->nseq; i++)
196 for (j = i+1; j < msa->nseq; j++)
197 fprintf(identmx_fp, "%-4d %-4d %-15s %-15s %.3f\n",
198 i, j, msa->sqname[i], msa->sqname[j], imx[i][j]);
205 for (i = 0; i < msa->nseq; i++)
209 for (j = 0; j < msa->nseq; j++)
210 { /* closest seq to this one = best */
211 if (i != j && imx[i][j] > best)
212 { best = imx[i][j]; bestj = j; }
213 if (imx[i][j] < worst)
214 { worst = imx[i][j]; worstj = j; }
218 printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n",
219 msa->sqname[i], DealignedLength(msa->aseq[i]),
220 best * 100., msa->sqname[bestj],
221 worst * 100., msa->sqname[worstj]);
223 if (best > best_best) best_best = best;
224 if (best < worst_best) worst_best = best;
225 if (worst < worst_worst) worst_worst = worst;
226 for (j = 0; j < i; j++)
230 avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0);
231 if (allreport) puts("");
236 * Some fields aren't available if -f (fast) was chosen.
238 if (msa->name != NULL)
239 printf("Alignment name: %s\n", msa->name);
240 printf("Format: %s\n", SeqfileFormat2String(afp->format));
241 printf("Number of sequences: %d\n", msa->nseq);
242 printf("Total # residues: %d\n", nres);
243 printf("Smallest: %d\n", small);
244 printf("Largest: %d\n", large);
245 printf("Average length: %.1f\n", (float) nres / (float) msa->nseq);
246 printf("Alignment length: %d\n", msa->alen);
247 printf("Average identity: %.0f%%\n", 100.*avgid);
249 printf("Most related pair: %.0f%%\n", 100.*best_best);
250 printf("Most unrelated pair: %.0f%%\n", 100.*worst_worst);
251 printf("Most distant seq: %.0f%%\n", 100.*worst_best);
254 /* Save majority rule consensus sequence if we were asked
256 if (consfile != NULL) {
258 cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen);
259 WriteSimpleFASTA(consfp, cs,
260 msa->name != NULL? msa->name : "consensus",
263 printf("Consensus: written to %s\n", consfile);
271 if (consfile != NULL) fclose(consfp);