1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
13 * SRE, Mon Sep 25 11:43:58 2000
15 * Split sequences into smaller chunks of defined size and overlap;
16 * output a FASTA file.
19 * still working in 32 bits -- no sequence can be more than 2 GB
21 * CVS $Id: seqsplit_main.c,v 1.1.1.1 2005/03/22 08:34:26 cmzmasek Exp $
29 static char banner[] = "seqsplit - split seqs into chunks of defined size and overlap";
31 static char usage[] = "\
32 Usage: seqsplit [-options] <seqfile>\n\
34 -h : help; display usage and version\n\
35 -o <file> : output the new FASTA file to <file>\n\
38 static char experts[] = "\
39 --informat <s> : specify sequence file format <s>\n\
40 --length <n> : set max length of each unique seq frag to <n>\n\
41 --overlap <n> : set overlap length to <n> (total frag size = length+overlap)\n\
44 struct opt_s OPTIONS[] = {
45 { "-h", TRUE, sqdARG_NONE },
46 { "-o", TRUE, sqdARG_STRING },
47 { "--informat", FALSE, sqdARG_STRING },
48 { "--length", FALSE, sqdARG_INT },
49 { "--overlap", FALSE, sqdARG_INT },
51 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
55 main(int argc, char **argv)
57 char *seqfile; /* name of sequence file */
58 char *outfile; /* name of output file */
59 SQFILE *dbfp; /* open sequence file */
60 FILE *ofp; /* open output file */
61 int fmt; /* format of seqfile */
62 char *seq; /* sequence */
63 SQINFO sqinfo; /* extra info about sequence */
64 char *seqfrag; /* space for a seq fragment */
65 int fraglength; /* length of unique seq per frag */
66 int overlap; /* length of overlap. frags are fraglength+overlap*/
67 char seqname[256]; /* renamed fragment, w/ coord info */
68 int num; /* number of this fragment */
69 int pos; /* position in a sequence */
70 int len; /* length of a fragment */
73 int nseqs; /* total number of sequences */
74 int nsplit; /* number of seqs that get split */
75 int nnewfrags; /* total number of new fragments */
81 /***********************************************
83 ***********************************************/
85 fmt = SQFILE_UNKNOWN; /* default: autodetect */
90 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
91 &optind, &optname, &optarg))
93 if (strcmp(optname, "-o") == 0) outfile = optarg;
94 else if (strcmp(optname, "--length") == 0) fraglength = atoi(optarg);
95 else if (strcmp(optname, "--overlap") == 0) overlap = atoi(optarg);
96 else if (strcmp(optname, "--informat") == 0) {
97 fmt = String2SeqfileFormat(optarg);
98 if (fmt == SQFILE_UNKNOWN)
99 Die("unrecognized sequence file format \"%s\"", optarg);
101 else if (strcmp(optname, "-h") == 0) {
102 Banner(stdout, banner);
109 if (argc - optind != 1) Die("%s\n", usage);
110 seqfile = argv[argc-1];
112 seqfrag = MallocOrDie(sizeof(char) * (fraglength+overlap));
113 seqfrag[fraglength+overlap] = '\0';
115 /***********************************************
117 ***********************************************/
119 if (outfile == NULL) ofp = stdout;
121 if ((ofp = fopen(outfile, "w")) == NULL)
122 Die("Failed to open output sequence file %s for writing", outfile);
125 if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
126 Die("Failed to open sequence file %s for reading", seqfile);
128 nseqs = nsplit = nnewfrags = 0;
129 while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
132 if (sqinfo.flags & SQINFO_DESC) desc = sqinfo.desc;
135 if (sqinfo.len <= fraglength+overlap) {
136 WriteSimpleFASTA(ofp, seq, sqinfo.name, desc);
142 for (pos = 0; pos < sqinfo.len; pos += fraglength)
144 if (sqinfo.len - pos <= overlap) continue;
145 strncpy(seqfrag, seq+pos, fraglength+overlap);
146 len = strlen(seqfrag);
147 sprintf(seqname, "%s/frag%d/%d-%d",
148 sqinfo.name, num, pos+1, pos+len);
149 WriteSimpleFASTA(ofp, seqfrag, seqname, desc);
153 FreeSequence(seq, &sqinfo);
156 if (outfile != NULL) fclose(ofp);
158 printf("Total # of seqs: %d\n", nseqs);
159 printf("Affected by splitting: %d\n", nsplit);
160 printf("New # of seqs: %d\n", nseqs-nsplit + nnewfrags);