1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * Wed Aug 10 15:47:14 1994
14 * Look at a sequence file, determine some simple statistics.
15 * CVS $Id: seqstat_main.c,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $
25 static char banner[] = "seqstat - show some simple statistics on a sequence file";
27 static char usage[] = "\
28 Usage: seqstat [-options] <seqfile>\n\
30 -a : report per-sequence info, not just a summary\n\
31 -h : help; display usage and version\n\
34 static char experts[] = "\
35 --gccomp : with -a, include GC composition in report (DNA/RNA only)\n\
36 --informat <s> : specify sequence file format <s>\n\
37 --quiet : suppress verbose header (used in regression testing)\n\
40 struct opt_s OPTIONS[] = {
41 { "-a", TRUE, sqdARG_NONE },
42 { "-h", TRUE, sqdARG_NONE },
43 { "--gccomp", FALSE, sqdARG_NONE },
44 { "--informat", FALSE, sqdARG_STRING },
45 { "--quiet", FALSE, sqdARG_NONE },
47 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
49 static float gc_composition(char *seq);
52 main(int argc, char **argv)
54 char *seqfile; /* name of sequence file */
55 SQFILE *dbfp; /* open sequence file */
56 int fmt; /* format of seqfile */
57 char *seq; /* sequence */
58 SQINFO sqinfo; /* extra info about sequence */
60 long long small; /* smallest length */
61 long long large; /* largest length */
62 long long total; /* total length */
63 int type; /* kAmino, kDNA, kRNA, or kOtherSeq */
65 int allreport; /* TRUE to do a short table for each sequence */
66 int be_quiet; /* TRUE to suppress header */
67 int do_gccomp; /* TRUE to include GC composition in per-seq report */
68 float gc; /* fractional gc composition, 0..1 */
74 /***********************************************
76 ***********************************************/
78 fmt = SQFILE_UNKNOWN; /* default: autodetect format */
79 allreport = FALSE; /* default: file summary only */
80 be_quiet = FALSE; /* show header info by default */
81 type = kOtherSeq; /* just to silence gcc uninit warning */
84 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
85 &optind, &optname, &optarg))
87 if (strcmp(optname, "-a") == 0) allreport = TRUE;
88 else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE;
89 else if (strcmp(optname, "--gccomp") == 0) do_gccomp = TRUE;
91 else if (strcmp(optname, "--informat") == 0) {
92 fmt = String2SeqfileFormat(optarg);
93 if (fmt == SQFILE_UNKNOWN)
94 Die("unrecognized sequence file format \"%s\"", optarg);
96 else if (strcmp(optname, "-h") == 0) {
97 Banner(stdout, banner);
104 if (argc - optind != 1) Die("%s\n", usage);
105 seqfile = argv[argc-1];
107 if (! be_quiet) Banner(stdout, banner);
109 /***********************************************
111 ***********************************************/
113 if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
114 Die("Failed to open sequence file %s for reading", seqfile);
117 printf(" %-15s %-5s %s%s\n", " NAME", "LEN",
118 do_gccomp? " f_GC " : "",
120 printf(" --------------- ----- %s-----------\n",
121 do_gccomp ? "----- " : "");
128 while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
130 if (nseqs == 0) type = Seqtype(seq);
131 if (do_gccomp) gc = gc_composition(seq);
135 printf("* %-15s %5d %.3f %-50.50s\n", sqinfo.name, sqinfo.len,
137 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");
139 printf("* %-15s %5d %-50.50s\n", sqinfo.name, sqinfo.len,
140 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");
144 if (small == -1 || sqinfo.len < small) small = (long long) sqinfo.len;
145 if (large == -1 || sqinfo.len > large) large = (long long) sqinfo.len;
146 total += (long long) sqinfo.len;
148 FreeSequence(seq, &sqinfo);
150 if (allreport) puts("");
152 printf("Format: %s\n", SeqfileFormat2String(dbfp->format));
153 printf("Type (of 1st seq): ");
156 case kDNA: puts("DNA"); break;
157 case kRNA: puts("RNA"); break;
158 case kAmino: puts("Protein"); break;
159 case kOtherSeq: puts("Unknown"); break;
160 default: Die("oops.");
162 printf("Number of sequences: %d\n", nseqs);
163 printf("Total # residues: %lld\n", total);
164 printf("Smallest: %lld\n", small);
165 printf("Largest: %lld\n", large);
166 printf("Average length: %.1f\n", (float) total / (float) nseqs);
174 /* Function: gc_composition()
175 * Date: SRE, Mon Apr 23 10:01:48 2001 [St. Louis]
177 * Purpose: Calculate the fractional GC composition of
178 * an input RNA or DNA sequence. Deals appropriately
179 * with IUPAC degeneracy. Case-insensitive.
180 * Ignores gap symbols. Other unexpected characters
181 * make it die with an error (protein, for instance).
183 * Args: seq - the DNA or RNA sequence
185 * Returns: fractional GC composition, 0-1
188 gc_composition(char *seq)
195 for (; *seq != '\0'; seq++)
197 if (isgap(c)) continue;
199 c = toupper((int) *seq);
205 case 'S': gc += 1.0; break;
210 case 'W': gc += 0.0; break;
216 case 'K': gc += 0.5; break;
219 case 'D': gc += 0.3333; break;
222 case 'V': gc += 0.6667; break;
225 Die("unrecognized nucleic acid character %c in sequence", c);