2 /*****************************************************************************/
4 /*** #include precomputed ln(fact(n)) array in "lnfac.h" ***/
5 /*****************************************************************************/
7 /*--------------------------------------------------------------(includes)---*/
12 /*---------------------------------------------------------------(defines)---*/
14 #define LENCORRLIM 120
15 #define MIN(a,b) ((a) <= (b) ? (a) : (b))
17 /*---------------------------------------------------------------(structs)---*/
26 /*---------------------------------------------------------------(globals)---*/
38 int singleseq = FALSE;
39 int prettyseq = FALSE;
40 int prettytree = TRUE;
44 double getK1(), getprob(), lnperm(), lnass();
46 /*------------------------------------------------------------------(main)---*/
58 /* readlencorr(); */ /* #include lencorr file */
59 getparams(argc, argv);
62 if ((db=opendbase(argv[1]))==NULL)
64 fprintf(stderr, "Error opening file %s\n", argv[1]);
68 for (seq=firstseq(db); seq!=NULL; seq=nextseq(db))
70 segs = (struct Segment *) NULL;
71 segseq(seq, &segs, 0);
74 if (singleseq) singreport(seq, segs);
75 else if (prettyseq) prettyreport(seq, segs);
76 else if (prettytree) pretreereport(seq, segs);
77 else report(seq, segs);
87 /*-------------------------------------------------------------(getparams)---*/
106 for (i=2; argc>i && argv[i][0]!='-'; i++)
110 window = atoi(argv[2]);
114 hicut = atof(argv[3]); /*- trigger is now 'hicut' -*/
118 locut = atof(argv[4]); /*- extension is now 'locut' -*/
124 fprintf(stderr, "Warning: trigger complexity less than extension\n");
128 downset = (window+1)/2 - 1;
129 upset = window - downset;
135 while ((c=getopt(nargc, nargv, "m:olhaxpqc:nt:"))!=-1)
140 hilenmin = atoi(optarg);
187 charline = atoi(optarg);
193 maxtrim = atoi(optarg);
196 fprintf(stderr, "Unknown option.\n");
206 /*---------------------------------------------------------------(segment)---*/
208 segseq(seq, segs, offset)
209 struct Sequence *seq;
210 struct Segment **segs;
213 {struct Segment *seg, *leftsegs;
214 struct Sequence *leftseq;
215 int first, last, lowlim;
217 int leftend, rightend, lend, rend;
218 double *H, *seqent();
224 last = seq->length - upset;
227 for (i=first; i<=last; i++)
229 if (H[i]>=hicut && H[i]!=-1)
231 loi = findlo(i, lowlim, H);
232 hii = findhi(i, last, H);
234 leftend = loi - downset;
235 rightend = hii + upset - 1;
237 trim(openwin(seq, leftend, rightend-leftend+1), &leftend, &rightend);
239 if (i+upset-1<leftend) /* check for trigger window in left trim */
241 lend = loi - downset;
244 leftseq = openwin(seq, lend, rend-lend+1);
245 leftsegs = (struct Segment *) NULL;
246 segseq(leftseq, &leftsegs, offset+lend);
249 if (*segs==NULL) *segs = leftsegs;
250 else appendseg(*segs, leftsegs);
254 /* trim(openwin(seq, lend, rend-lend+1), &lend, &rend);
255 seg = (struct Segment *) malloc(sizeof(struct Segment));
258 seg->next = (struct Segment *) NULL;
259 if (segs==NULL) segs = seg;
260 else appendseg(segs, seg); */
263 seg = (struct Segment *) malloc(sizeof(struct Segment));
264 seg->begin = leftend + offset;
265 seg->end = rightend + offset;
266 seg->next = (struct Segment *) NULL;
268 if (*segs==NULL) *segs = seg;
269 else appendseg(*segs, seg);
271 i = min(hii, rightend+downset);
273 /* i = hii; this ignores the trimmed residues... */
281 /*----------------------------------------------------------------(seqent)---*/
284 struct Sequence *seq;
286 {struct Sequence *win;
290 if (window>seq->length)
292 return((double *) NULL);
295 H = (double *) malloc(seq->length*sizeof(double));
297 for (i=0; i<seq->length; i++)
302 win = openwin(seq, 0, window);
306 last = seq->length - upset;
308 for (i=first; i<=last; i++)
310 if (seq->punctuation && hasdash(win))
322 /*---------------------------------------------------------------(hasdash)---*/
325 struct Sequence *win;
327 register char *seq, *seqmax;
330 seqmax = seq + win->length;
332 while (seq < seqmax) {
339 /*----------------------------------------------------------------(findlo)---*/
347 for (j=i; j>=limit; j--)
350 if (H[j]<locut) break;
356 /*----------------------------------------------------------------(findhi)---*/
364 for (j=i; j<=limit; j++)
367 if (H[j]<locut) break;
373 /*------------------------------------------------------------------(trim)---*/
375 trim(seq, leftend, rightend)
376 struct Sequence *seq;
377 int *leftend, *rightend;
379 {struct Sequence *win;
385 /* fprintf(stderr, "%d %d\n", *leftend, *rightend); */
388 rend = seq->length - 1;
390 if ((seq->length-maxtrim)>minlen) minlen = seq->length-maxtrim;
393 for (len=seq->length; len>minlen; len--)
395 win = openwin(seq, 0, len);
402 K1 = getK1(win->state, len);
409 shift = shiftwin1(win);
415 /* fprintf(stderr, "%d-%d ", *leftend, *rightend); */
417 *leftend = *leftend + lend;
418 *rightend = *rightend - (seq->length - rend - 1);
420 /* fprintf(stderr, "%d-%d\n", *leftend, *rightend); */
426 /*---------------------------------------------------------------(getprob)---*/
428 double getprob(sv, total)
434 #define LN20 2.9957322735539909
435 totseq = ((double) total) * LN20;
437 ans = lnass(sv) + lnperm(sv, total) - totseq;
442 /*-----------------------------------------------------------------(getK1)---*/
444 double getK1(sv, total)
450 ans = lnperm(sv, total) / ((double) total);
455 /*----------------------------------------------------------------(lnperm)---*/
457 double lnperm(sv, tot)
466 for (i=0; sv[i]!=0; i++)
474 /*-----------------------------------------------------------------(lnass)---*/
480 register int svi, svim1;
481 register int class, total;
491 for (i=0;; svim1 = svi) {
496 else if ((svi = *++sv) == svim1) {
517 /*-------------------------------------------------------------(mergesegs)---*/
520 struct Sequence *seq;
521 struct Segment *segs;
523 {struct Segment *seg, *nextseg;
526 if (overlaps) return;
527 if (segs==NULL) return;
529 if (segs->begin<hilenmin) segs->begin = 0;
534 while (nextseg!=NULL)
536 if (seg->end>=nextseg->begin) /* overlapping segments */
538 seg->end = nextseg->end;
539 seg->next = nextseg->next;
544 len = nextseg->begin - seg->end - 1;
545 if (len<hilenmin) /* short hient segment */
547 seg->end = nextseg->end;
548 seg->next = nextseg->next;
557 len = seq->length - seg->end - 1;
558 if (len<hilenmin) seg->end = seq->length - 1;
563 /*----------------------------------------------------------------(report)---*/
566 struct Sequence *seq;
567 struct Segment *segs;
569 {struct Sequence *subseq;
570 struct Segment *seg, *nextseg;
577 seqout(seq, hi, 1, seq->length);
578 /* fputc('\n', stdout); -- for spacing after each sequence */
584 subseq = openwin(seq, 0, segs->begin);
586 seqout(subseq, hi, 1, segs->begin);
590 for (seg=segs; seg!=NULL; seg=seg->next)
592 subseq = openwin(seq, seg->begin, seg->end-seg->begin+1);
594 seqout(subseq, lo, seg->begin+1, seg->end+1);
604 if (nextseg->begin<=seg->end)
606 fprintf(stderr, "Overlapping segments: %s\n", seq->id);
610 if (nextseg->begin==seg->end+1)
615 subseq = openwin(seq, seg->end+1, nextseg->begin-seg->end-1);
617 seqout(subseq, hi, seg->end+2, nextseg->begin);
621 if (seg->end+1==seq->length)
623 /* fputc('\n', stdout); -- for spacing after each sequence */
627 subseq = openwin(seq, seg->end+1, seq->length-seg->end-1);
629 seqout(subseq, hi, seg->end+2, seq->length);
632 /* fputc('\n', stdout); -- for spacing after each sequence */
636 /*------------------------------------------------------------(singreport)---*/
638 singreport(seq, segs)
639 struct Sequence *seq;
640 struct Segment *segs;
642 char *proseq, *proseqmax;
644 int begin, end, i, ctr;
647 proseqmax = proseq + seq->length;
648 upper(proseq, seq->length);
650 for (seg=segs; seg!=NULL; seg=seg->next) {
653 memset(proseq + begin, 'x', end - begin +1);
656 fprintf(stdout, "%s\n", seq->header);
658 for (i=0, ctr=0; proseq < proseqmax; ++i, ++ctr, ++proseq) {
663 putc(*proseq, stdout);
667 if (putc('\n', stdout) == EOF) {
668 fprintf(stderr, "premature EOF on write\n");
673 /*----------------------------------------------------------(prettyreport)---*/
675 prettyreport(seq, segs)
676 struct Sequence *seq;
677 struct Segment *segs;
680 char *proseq, *proseqmax;
683 int begin, end, i, ctr;
686 leftspace = (int) ceil(log10((double)seq->length));
687 sprintf(format, "%%%dd ", leftspace);
689 upper(proseq = seq->seq, seq->length);
691 for (seg=segs; seg!=NULL; seg=seg->next) {
694 lower(proseq + begin, end - begin + 1);
697 fprintf(stdout, "%s\n", seq->header);
700 for (i=0, ctr=1; i<charline; ++i, ++ctr) {
709 fprintf(stdout, format, 1);
711 proseqmax = proseq + seq->length;
712 for (i=0, ctr=0; proseq < proseqmax; ++i, ++ctr, ++proseq) {
715 fprintf(stdout, format, i+1);
718 putc(*proseq, stdout);
721 fprintf(stdout, " %d\n", seq->length);
722 if (putc('\n', stdout) == EOF) {
723 fprintf(stderr, "premature EOF on write\n");
728 /*---------------------------------------------------------(pretreereport)---*/
730 pretreereport(seq, segs)
731 struct Sequence *seq;
732 struct Segment *segs;
735 struct Sequence *win;
737 char buffer[100], leftfmt[20], rightfmt[20];
739 int i, left, right, len;
740 int current, nextloent;
743 cline = charline / 2;
745 fprintf(stdout, "%s\n\n", seq->header);
746 sprintf(leftfmt, "%%%ds", cline);
747 sprintf(rightfmt, "%%-%ds", cline);
751 for (seg=segs; ; seg=seg->next) {
753 nextloent = seq->length;
755 nextloent = seg->begin;
757 if (current < nextloent) {
759 right = nextloent - 1;
760 len = right - left + 1;
761 win = openwin(seq, left, len);
762 upper(curseq = win->seq, win->length);
765 fprintf(stdout, " %4d-%-4d ", left+1, right+1);
769 fwrite(curseq, len, 1, stdout);
774 fwrite(curseq, cline, 1, stdout);
784 if (seg==NULL) break;
788 len = right - left + 1;
789 win = openwin(seq, left, len);
790 lower(curseq = win->seq, win->length);
794 fprintf(stdout, "%*s", cline-i, "");
795 fwrite(curseq, i, 1, stdout);
796 fprintf(stdout, " %4d-%-4d ", left+1, right+1);
807 fwrite(curseq, i, 1, stdout);
821 /*-----------------------------------------------------------------(space)---*/
826 static char spaces[] =
827 {' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
828 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
829 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
830 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
831 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' '
836 i = MIN(len, sizeof(spaces)/sizeof(spaces[0]));
837 fwrite(spaces, i, 1, stdout);
842 /*----------------------------------------------------------------(seqout)---*/
844 seqout(seq, hilo, begin, end)
845 struct Sequence *seq;
852 char *proseq, *proseqmax, *id, *header;
853 char outbuf[HDRLEN+1];
858 if (hionly && hilo==lo) return;
859 if (loonly && hilo==hi) return;
862 proseqmax = proseq + seq->length;
864 if (id==NULL) id = seq->parent->id;
865 header = seq->header;
866 if (header==NULL) header = seq->parent->header;
868 iend = findchar(header, ' ');
869 if (iend!=-1) header = header+iend;
873 fprintf(stdout, ">%s(%d-%d)", id, begin, end);
874 /* if (iend!=-1 && strlen(header)<=HDRLEN) fprintf(stdout, "%s", header);
875 else if (iend!=-1) for (i=0; i<HDRLEN; i++) putc(header[i], stdout); */
876 fprintf(stdout, " complexity=%4.2f (%d/%4.2f/%4.2f)\n",
877 seq->entropy, window, locut, hicut);
881 fprintf(stdout, ">%s(%d-%d)", id, begin, end);
882 if (iend!=-1) /* fprintf(stdout, "%s\n", header); */
884 i = MIN(HDRLEN, strlen(header));
885 fwrite(header, i, 1, stdout);
888 else putc('\n', stdout);
893 lower(proseq, seq->length);
895 else if (hilo==hi && seq->length>=hilenmin)
897 upper(proseq, seq->length);
901 lower(proseq, seq->length);
904 for (; proseq < proseqmax; proseq+=i) {
905 i = MIN(charline, proseqmax - proseq);
906 fwrite(proseq, i, 1, stdout);
910 if (putc('\n', stdout) == EOF) {
911 fprintf(stderr, "premature EOF on write\n");
916 /*-------------------------------------------------------------(appendseg)---*/
919 struct Segment *segs, *seg;
921 {struct Segment *temp;
926 if (temp->next==NULL)
940 /*--------------------------------------------------------------(freesegs)---*/
943 struct Segment *segs;
945 {struct Segment *temp;
955 /*-----------------------------------------------------------------(usage)---*/
961 Usage: hiseg <file> <window> <hicut> <locut> <options>\n\
962 <file> - filename containing fasta-formatted sequence(s) \n\
963 <window> - OPTIONAL window size (default 12) \n\
964 <hicut> - OPTIONAL high (trigger) complexity (default 2.5) \n\
965 <locut> - OPTIONAL low (extension) complexity (default 2.2) \n\
967 -x each input sequence is represented by a single output \n\
968 sequence with low-complexity regions replaced by \n\
969 strings of 'x' characters \n\
970 -c <chars> number of sequence characters/line (default 60)\n\
971 -m <size> minimum length for a high-complexity segment \n\
972 (default 0). Shorter segments are merged with adjacent \n\
973 low-complexity segments \n\
974 -l show only low-complexity segments (fasta format) \n\
975 -h show only high-complexity segments (fasta format) \n\
976 -a show all segments (fasta format) \n\
977 -n do not add complexity information to the header line \n\
978 -o show overlapping low-complexity segments (default merge) \n\
979 -t <maxtrim> maximum trimming of raw segment (default 100) \n\
980 -p prettyprint each segmented sequence (tree format) \n\
981 -q prettyprint each segmented sequence (block format) \n");