1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * Mon Sep 13 13:06:51 1993
14 * sreformat - reformat sequence files.
15 * renamed sreformat from reformat, Tue Jun 30 10:53:38 1998
17 * CVS $Id: sreformat_main.c,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $
27 static char banner[] = "sreformat - convert between sequence formats";
29 static char usage[] = "\
30 Usage: sreformat [-options] <format> <seqfile>\n\
31 Output format choices: Unaligned Aligned\n\
32 ----------- -------\n\
40 Available options are:\n\
41 -h : help; print brief help on version and usage\n\
42 -d : force DNA alphabet for nucleic acid sequence\n\
43 -r : force RNA alphabet for nucleic acid sequence\n\
44 -l : force lower case\n\
45 -u : force upper case\n\
46 -x : convert non-IUPAC chars in DNA to N's for IUPAC/BLAST compatibility\n\
49 static char experts[] = "\
51 --informat <s>: input sequence file is in format <s>\n\
52 --mingap : remove columns containing all gaps (seqfile=alignment)\n\
53 --nogap : remove columns containing any gaps (seqfile=alignment)\n\
54 --pfam : modify Stockholm format output to be in PFAM style (1 line/seq)\n\
55 --sam : try to convert gaps to SAM style (seqfile=alignment)\n\
56 --samfrac <x> : convert to SAM convention; cols w/ gapfrac > x are inserts\n\
57 --gapsym <c> : convert all gaps to character '<c>'\n\
60 static struct opt_s OPTIONS[] = {
61 { "-d", TRUE, sqdARG_NONE },
62 { "-h", TRUE, sqdARG_NONE },
63 { "-l", TRUE, sqdARG_NONE },
64 { "-r", TRUE, sqdARG_NONE },
65 { "-u", TRUE, sqdARG_NONE },
66 { "-x", TRUE, sqdARG_NONE },
67 { "--gapsym", FALSE, sqdARG_CHAR },
68 { "--informat",FALSE, sqdARG_STRING },
69 { "--mingap", FALSE, sqdARG_NONE },
70 { "--nogap", FALSE, sqdARG_NONE },
71 { "--pfam", FALSE, sqdARG_NONE },
72 { "--sam", FALSE, sqdARG_NONE },
73 { "--samfrac", FALSE, sqdARG_FLOAT },
75 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
78 main(int argc, char **argv)
80 char *seqfile; /* name of sequence file */
82 SQFILE *dbfp; /* open sequence file */
83 int fmt; /* format of seqfile */
84 int outfmt; /* output format */
85 char *seq; /* sequence */
89 int force_rna; /* TRUE to force RNA alphabet */
90 int force_dna; /* TRUE to force DNA alphabet */
91 int force_lower; /* TRUE to force lower case */
92 int force_upper; /* TRUE to force upper case */
93 int x_is_bad; /* TRUE to convert X to N */
94 int do_mingap; /* TRUE to remove columns containing all gaps */
95 int do_nogap; /* TRUE to remove columns containing any gaps */
96 int do_pfam; /* TRUE to make SELEX -> PFAM */
97 int samize; /* TRUE to SAMize an A2M conversion */
98 float samfrac; /* -1, or gap fraction for a SAM conversion */
99 int expect_alignment; /* TRUE to expect an input alignment to convert */
100 char gapsym; /* 0 if unset; else = character to use for gaps */
102 char *optname; /* name of option found by Getopt() */
103 char *optarg; /* argument found by Getopt() */
104 int optind; /* index in argv[] */
106 /***********************************************
108 ***********************************************/
120 fmt = SQFILE_UNKNOWN;
121 expect_alignment = FALSE;
124 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
125 &optind, &optname, &optarg)) {
126 if (strcmp(optname, "-a") == 0) expect_alignment= TRUE;
127 else if (strcmp(optname, "-d") == 0) force_dna = TRUE;
128 else if (strcmp(optname, "-l") == 0) force_lower = TRUE;
129 else if (strcmp(optname, "-r") == 0) force_rna = TRUE;
130 else if (strcmp(optname, "-u") == 0) force_upper = TRUE;
131 else if (strcmp(optname, "-x") == 0) x_is_bad = TRUE;
132 else if (strcmp(optname, "--gapsym") == 0) gapsym = *optarg;
133 else if (strcmp(optname, "--mingap") == 0) do_mingap = TRUE;
134 else if (strcmp(optname, "--nogap") == 0) do_nogap = TRUE;
135 else if (strcmp(optname, "--pfam") == 0) do_pfam = TRUE;
136 else if (strcmp(optname, "--sam") == 0) samize = TRUE;
137 else if (strcmp(optname, "--samfrac") == 0) samfrac = atof(optarg);
138 else if (strcmp(optname, "--informat") == 0) {
139 fmt = String2SeqfileFormat(optarg);
140 if (fmt == SQFILE_UNKNOWN)
141 Die("unrecognized sequence file format \"%s\"", optarg);
143 else if (strcmp(optname, "-h") == 0) {
144 Banner(stdout, banner);
151 if (argc - optind != 2)
153 if (force_lower && force_upper)
154 Die("Can't force both upper case and lower case. Stop trying to confuse me.\n%s",
156 if (force_rna && force_dna)
157 Die("Can't force both RNA and DNA. Stop trying to find bugs. You'll be sorry.\n%s",
160 format = argv[optind]; optind++;
161 seqfile = argv[optind]; optind++;
163 /***********************************************
164 * Figure out what format we're supposed to write
165 ***********************************************/
167 if ((outfmt = String2SeqfileFormat(format)) == SQFILE_UNKNOWN)
168 Die("Unknown output format %s\n%s", format, usage);
170 /***********************************************
171 * Reformat the file, printing to stdout.
172 ***********************************************/
174 /* If the output format is an alignment, then the input format
175 * has to be an alignment.
177 if (IsAlignmentFormat(outfmt))
182 if ((afp = MSAFileOpen(seqfile, fmt, NULL)) == NULL)
183 Die("Alignment file %s could not be opened for reading", seqfile);
185 while ((msa = MSAFileRead(afp)) != NULL)
187 /* If asked, convert upper/lower convention and
188 * gap character conventions now
190 if (do_mingap) MSAMingap(msa);
191 if (do_nogap) MSANogap(msa);
192 if (gapsym) AlignmentHomogenousGapsym(msa->aseq, msa->nseq, msa->alen, gapsym);
193 if (samize) SAMizeAlignment(msa->aseq, msa->nseq, msa->alen);
194 if (samfrac >= 0) SAMizeAlignmentByGapFrac(msa->aseq, msa->nseq, msa->alen, samfrac);
196 for (i = 0; i < msa->nseq; i++)
198 if (force_dna) ToDNA(msa->aseq[i]);
199 if (force_rna) ToRNA(msa->aseq[i]);
200 if (x_is_bad) ToIUPAC(msa->aseq[i]);
201 if (force_lower) s2lower(msa->aseq[i]);
202 if (force_upper) s2upper(msa->aseq[i]);
205 /* This code block can be replaced with a
206 * MSAFileWrite() call someday... SRE Sun Apr 22 19:17:19 2001
209 case MSAFILE_A2M: WriteA2M(stdout, msa); break;
210 case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break;
211 case MSAFILE_MSF: WriteMSF(stdout, msa); break;
212 case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break;
214 if (do_pfam) WriteSELEXOneBlock(stdout, msa);
215 else WriteSELEX(stdout, msa);
217 case MSAFILE_EPS: EPSWriteSmallMSA(stdout, msa); break;
218 case MSAFILE_STOCKHOLM:
219 if (do_pfam) WriteStockholmOneBlock(stdout, msa);
220 else WriteStockholm(stdout, msa);
223 Die("can't write. no such alignment format %d\n", outfmt);
232 if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
233 Die("Failed to open sequence file %s for reading", seqfile);
235 while (ReadSeq(dbfp, fmt, &seq, &sqinfo))
237 if (force_dna) ToDNA(seq);
238 if (force_rna) ToRNA(seq);
239 if (x_is_bad) ToIUPAC(seq);
240 if (force_lower) s2lower(seq);
241 if (force_upper) s2upper(seq);
243 WriteSeq(stdout, outfmt, seq, &sqinfo);
244 FreeSequence(seq, &sqinfo);