1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
11 /* sfetch_main.c, Fri Dec 25 14:22:17 1992, SRE
13 * sfetch -- a program to extract subsequences from a sequence database
14 * Renamed from "getseq" SRE, Tue Jan 19 10:47:42 1999 (GCG clash)
16 * CVS $Id: sfetch_main.c,v 1.1.1.1 2005/03/22 08:34:20 cmzmasek Exp $
25 static char banner[] = "sfetch - retrieve a specified sequence from a file";
27 static char usage[] = "\
28 Usage: sfetch [-options] <seqname>\n\
29 or: sfetch [-options] .\n\
30 (The second version fetches the first seq in the file.)\n\
31 Get a sequence from a database.\n\
33 -a : name is an accession number, not a key\n\
34 -d <seqfile> : get sequence from <seqfile>\n\
35 -D <database> : instead, get sequence from main database\n\
36 -h : help; print version and usage info\n\
37 -r <newname> : rename the fragment <newname>\n\
38 -f <from> : from which residue (1..N)\n\
39 -t <to> : to which residue (1..N)\n\
40 -o <outfile> : direct output to <outfile>\n\
41 -F <format> : use output format of <format>; see below for\n\
42 list. Default is original format of database.\n\
44 Available output formats include:\n\
51 Available databases are: (if $env variables are set correctly)\n\
52 -Dsw $SWDIR SwissProt\n\
55 -Dgb $GBDIR GenBank\n\
56 -Dwp $WORMDIR WormPep\n\
59 static char experts[] = "\
60 --informat <s> : specify input sequence file format <s>\n\
63 struct opt_s OPTIONS[] = {
64 { "-a", TRUE, sqdARG_NONE },
65 { "-d", TRUE, sqdARG_STRING },
66 { "-f", TRUE, sqdARG_INT },
67 { "-h", TRUE, sqdARG_NONE },
68 { "-o", TRUE, sqdARG_STRING },
69 { "-r", TRUE, sqdARG_STRING },
70 { "-t", TRUE, sqdARG_INT },
71 { "-D", TRUE, sqdARG_STRING },
72 { "-F", TRUE, sqdARG_STRING },
73 { "--informat", FALSE, sqdARG_STRING },
75 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
77 /* dbenv maps command line database selection to an environment
78 * variable, from which the database directory is obtained.
81 char *dbname; /* name of database, as used on command line */
82 char *ssiname; /* name of GSI index file to look for */
83 char *envname; /* environment var to get directory path from*/
84 char *entryend; /* string signifying end of entry */
85 int addend; /* TRUE if entryend line is part of entry */
88 { "sw", "swiss.ssi", "SWDIR", "//", TRUE},
89 { "pir", "pir.ssi", "PIRDIR", "///", TRUE},
90 { "em", "embl.ssi", "EMBLDIR", "//", TRUE},
91 { "gb", "genbank.ssi","GBDIR", "//", TRUE},
92 { "wp", "wormpep.ssi","WORMDIR", ">", FALSE},
93 { "owl", "owl.ssi", "OWLDIR", ">", FALSE}, /* use FASTA OWL version */
95 #define NUMDBS (sizeof(dbenv) / sizeof(struct dbenv_s))
98 main(int argc, char **argv)
100 char *dbname; /* master database to search */
101 char *seqfile; /* name of sequence file to read */
102 char *ssifile; /* name of SSI index file (if one exists) */
103 SQFILE *seqfp; /* pointer to open sequence file */
104 char *getname; /* name of sequence to get from */
105 int from; /* starting residue, 1..N */
106 int to; /* ending residue, 1..N */
107 char *outfile; /* name of file to put output to */
108 FILE *outfp; /* file pointer to put output to */
109 int format; /* format of seqfile */
110 int outfmt; /* output format */
111 char *seq; /* current working sequence */
113 char *frag; /* extracted subsequence */
114 int source_start; /* start of seq on original source 1..N */
115 int source_stop; /* end of seq on original source 1..N */
116 int source_orient; /* sign of parent: -1 revcomp, +1 normal*/
117 char *ss; /* secondary structure representation */
119 SSIFILE *ssi; /* open SSI index file */
120 SSIOFFSET ssi_offset; /* disk offset for locating sequence */
121 int used_ssi; /* TRUE if SSI file was used (don't scan) */
122 int status; /* status returned by an SSI call */
124 char *rename; /* new name to give fragment */
125 int reverse_complement; /* do we have to reverse complement? */
127 int getfirst; /* TRUE to extract from the first seq, w/o looking at name */
128 char *outformat; /* output format string */
129 int by_accession; /* TRUE if name is accession number not key */
137 /***********************************************
138 * Parse the command line
139 ***********************************************/
141 /* initializations and defaults */
142 format = SQFILE_UNKNOWN; /* autodetect default, overridden by --informat or SSI files */
143 reverse_complement = 0;
150 to = -1; /* flag that says do the whole thing */
155 by_accession = FALSE;
158 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
159 &optind, &optname, &optarg))
161 if (strcmp(optname, "-a") == 0) { by_accession = TRUE; }
162 else if (strcmp(optname, "-d") == 0) { seqfile = optarg; }
163 else if (strcmp(optname, "-f") == 0) {
164 from = atoi(optarg); getall = FALSE;
166 else if (strcmp(optname, "-t") == 0) {
167 to = atoi(optarg); getall = FALSE;
169 else if (strcmp(optname, "-r") == 0) { rename = optarg; }
170 else if (strcmp(optname, "-o") == 0) { outfile = optarg; }
171 else if (strcmp(optname, "-D") == 0) { dbname = optarg; }
172 else if (strcmp(optname, "-F") == 0) { outformat = optarg; }
173 else if (strcmp(optname, "--informat") == 0) {
174 format = String2SeqfileFormat(optarg);
175 if (format == SQFILE_UNKNOWN)
176 Die("unrecognized input sequence file format \"%s\"", optarg);
178 else if (strcmp(optname, "-h") == 0) {
179 Banner(stdout, banner);
186 if (argc - optind != 1)
187 Die("Incorrect number of command line arguments.\n%s\n", usage);
189 getname = argv[optind];
190 if (strcmp(getname, ".") == 0) getfirst = TRUE;
192 if (getfirst && seqfile == NULL)
193 Die("You need to specify -d <seqfile> to retrieve a first sequence.\n%s",
196 /***********************************************
197 * Get name of file to look through, and disk offset,
198 * using SSI file if one exists. Three possibilities:
199 * 1) Look in main DB, which has SSI index in the directory
200 * 2) Look in a file, which has associated SSI index
201 * 3) Look in an unindexed file
202 ***********************************************/
204 if (dbname != NULL && seqfile != NULL)
205 Die("Can't fetch from *both* a database %s and a file %s\n%s",
206 dbname, seqfile, usage);
207 if (dbname == NULL && seqfile == NULL)
208 { /* try to guess SwissProt, stupidly, but usually works */
209 if (strchr(getname, '_') != NULL) dbname = Strdup("sw");
210 else Die("You have to specify either a database or a seqfile\n%s", usage);
213 if (dbname != NULL) /* Main database. GSI index mandatory. */
218 /* find which db this is */
219 for (dbidx = 0; dbidx < NUMDBS; dbidx++)
220 if (strcmp(dbenv[dbidx].dbname, dbname) == 0)
223 Die("No such main database %s\n%s", dbname, usage);
225 /* get directory name */
226 if ((dbdir = getenv(dbenv[dbidx].envname)) == NULL)
227 Die("Environment variable %s is not set.\n%s",
228 dbenv[dbidx].envname, usage);
230 ssifile = (char *) MallocOrDie
231 ((strlen(dbdir) + strlen(dbenv[dbidx].ssiname) + 2) * sizeof(char));
232 sprintf(ssifile, "%s/%s", dbdir, dbenv[dbidx].ssiname);
233 if ((status = SSIOpen(ssifile, &ssi)) != 0)
234 Die("Failed to open SSI index file %s in directory %s\n%s",
235 dbenv[dbidx].ssiname, dbdir, usage);
236 /* get seqfile name, file format, and offset */
237 if ((status = SSIGetOffsetByName(ssi, getname, &fh, &ssi_offset)) != 0)
238 Die("Failed to find key %s in SSI file %s", getname, ssifile);
239 if ((status = SSIFileInfo(ssi, fh, &dbfile, &format)) != 0)
240 Die("SSI error: %s", SSIErrorString(status));
242 /* set up proper seqfile, with path */
243 seqfile = (char *) MallocOrDie
244 ((strlen(dbdir) + strlen(dbfile) + 2) * sizeof(char));
245 sprintf(seqfile, "%s/%s", dbdir, dbfile);
249 else if (! getfirst) /* Sequence file. SSI index optional. */
254 ssifile = (char *) MallocOrDie ((strlen(seqfile) + 5) * sizeof(char));
255 sprintf(ssifile, "%s.ssi", seqfile);
256 if ((status = SSIOpen(ssifile, &ssi)) == 0)
258 SQD_DPRINTF1(("Opened SSI index %s...\n", ssifile));
259 if ((status = SSIGetOffsetByName(ssi, getname, &fh, &ssi_offset)) != 0)
260 Die("Failed to find key %s in SSI file %s", getname, ssifile);
261 if ((status = SSIFileInfo(ssi, fh, &dbfile, &format)) != 0)
262 Die("SSI error: %s", SSIErrorString(status));
269 /***********************************************
271 ***********************************************/
273 if ((seqfp = SeqfileOpen(seqfile, format, NULL)) == NULL)
274 Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
276 SeqfilePosition(seqfp, &ssi_offset);
278 /***********************************************
280 ***********************************************/
282 /* Determine output format. Default: use same as input. Override: -F option.
284 outfmt = seqfp->format;
285 if (outformat != NULL)
287 outfmt = String2SeqfileFormat(outformat);
288 if (outfmt == SQFILE_UNKNOWN)
289 Die("Unknown output format %s\n%s", outformat, usage);
290 if (IsAlignmentFormat(outfmt))
291 Die("Can't output a single sequence in an alignment format (%s)\n", outformat);
293 /* open output file for writing;
294 use stdout by default */
295 if (outfile == NULL) outfp = stdout;
296 else if ((outfp = fopen(outfile, "w")) == NULL)
297 Die("cannot open %s for output\n", outfile);
300 /***********************************************
302 ***********************************************/
304 /* If this is a simple fetch of the complete sequence
305 * in native format, and we've been positioned in the file
306 * by an SSI index file, we can just read right from the file,
307 * partially bypassing the ReadSeq() API, and probably
308 * putting our fingers a little too deep into the seqfp object.
310 if (getall && used_ssi && outfmt == format && dbname != NULL)
316 if (dbidx == -1) Die("That's weird. No database index available.");
317 endlen = strlen(dbenv[dbidx].entryend);
318 fputs(seqfp->buf, outfp); /* always do first line */
319 /* fputs("\n", outfp); */ /* buf has its /n */
320 while (sre_fgets(&buf, &buflen, seqfp->f) != NULL)
322 if (strncmp(buf, dbenv[dbidx].entryend, endlen) == 0)
324 if (dbenv[dbidx].addend) fputs(buf, outfp);
329 if (buf != NULL) free(buf);
331 else /* else, the hard way with ReadSeq */
336 while (ReadSeq(seqfp, format, &seq, &sqinfo))
338 if (used_ssi) /* GSI file puts us right on our seq. */
340 else if (getfirst) /* Use the first seq in the file. */
342 else if (by_accession &&
343 (sqinfo.flags & SQINFO_ACC) &&
344 strcmp(sqinfo.acc, getname) == 0)
346 else if (strcmp(sqinfo.name, getname) == 0)
349 FreeSequence(seq, &sqinfo);
354 Die("failed to extract the subsequence %s\n%s", getname, usage);
361 else if (from == -1) from = 1;
362 else if (to == -1) to = sqinfo.len;
364 if (to > sqinfo.len || from > sqinfo.len)
365 Warn("Extracting beyond the length of the sequence");
366 if (to < 1 || from < 1)
367 Warn("Extracting beyond the beginning of the sequence");
369 /* check for reverse complement */
370 if (to != -1 && from > to)
372 int swapfoo; /* temp variable for swapping coords */
374 reverse_complement = TRUE;
375 swapfoo = from; from = to; to = swapfoo;
377 if (to > sqinfo.len) to = sqinfo.len;
378 if (from < 1) from = 1;
380 if ((frag = (char *) calloc (to-from+2, sizeof(char))) == NULL)
381 Die("memory error\n");
383 if (strncpy(frag, seq+from-1, to-from+1) == NULL)
384 Die("strncpy() failed\n");
386 if (sqinfo.flags & SQINFO_SS)
388 if ((ss = (char *) calloc (to-from+2, sizeof(char))) == NULL)
389 Die("memory error\n");
390 if (strncpy(ss, sqinfo.ss+from-1, to-from+1) == NULL)
391 Die("strncpy() failed\n");
396 if (reverse_complement)
398 char *revfrag; /* temp variable for reverse complement */
399 int swapfoo; /* temp variable for swapping coords back */
401 if ((revfrag = calloc ( to-from+2, sizeof(char))) == NULL)
402 Die("memory failure\n");
403 revcomp(revfrag, frag);
406 swapfoo = from; from = to; to = swapfoo;
408 /* reverse complement nullifies secondary structure */
409 if (sqinfo.flags & SQINFO_SS)
410 { free(sqinfo.ss); sqinfo.flags &= ~SQINFO_SS; }
413 if (! (sqinfo.flags & SQINFO_ID))
414 SetSeqinfoString(&sqinfo, sqinfo.name, SQINFO_ID);
416 if (! (sqinfo.flags & SQINFO_OLEN))
417 { sqinfo.olen = sqinfo.len; sqinfo.flags |= SQINFO_OLEN; }
419 sqinfo.len = (to > from) ? to-from+1 : from-to+1;
420 sqinfo.flags |= SQINFO_LEN;
423 SetSeqinfoString(&sqinfo, rename, SQINFO_NAME);
425 source_start = (sqinfo.flags & SQINFO_START) ? sqinfo.start : 1;
426 source_stop = (sqinfo.flags & SQINFO_STOP) ? sqinfo.stop : sqinfo.len;
427 source_orient= (source_stop > source_start) ? 1 : -1;
429 sqinfo.start = source_start + (from- 1) * source_orient;
430 sqinfo.stop = source_start + (to - 1) * source_orient;
431 sqinfo.flags |= SQINFO_START | SQINFO_STOP;
433 WriteSeq(outfp, outfmt, frag, &sqinfo);
435 FreeSequence(seq, &sqinfo);
439 printf("Fragment written to file %s\n", outfile);