2 /*****************************************************************************/
4 /*** #include precomputed ln(fact(n)) array in "lnfac.h" ***/
5 /*****************************************************************************/
7 /*--------------------------------------------------------------(includes)---*/
12 /*---------------------------------------------------------------(defines)---*/
14 #define LENCORRLIM 120
15 #define MIN(a,b) ((a) <= (b) ? (a) : (b))
17 /*---------------------------------------------------------------(structs)---*/
26 /*---------------------------------------------------------------(globals)---*/
38 int singleseq = FALSE;
39 int prettyseq = FALSE;
40 int prettytree = TRUE;
44 double getprob(), lnperm(), lnass();
46 /*------------------------------------------------------------------(main)---*/
58 /* readlencorr(); */ /* #include lencorr file */
59 getparams(argc, argv);
62 if ((db=opendbase(argv[1]))==NULL)
64 fprintf(stderr, "Error opening file %s\n", argv[1]);
68 for (seq=firstseq(db); seq!=NULL; seq=nextseq(db))
70 segs = (struct Segment *) NULL;
71 segseq(seq, &segs, 0);
74 if (singleseq) singreport(seq, segs);
75 else if (prettyseq) prettyreport(seq, segs);
76 else if (prettytree) pretreereport(seq, segs);
77 else report(seq, segs);
87 /*-------------------------------------------------------------(getparams)---*/
106 for (i=2; argc>i && argv[i][0]!='-'; i++)
110 window = atoi(argv[2]);
114 locut = atof(argv[3]);
119 hicut = atof(argv[4]);
125 fprintf(stderr, "Warning: trigger complexity greater than extension\n");
129 downset = (window+1)/2 - 1;
130 upset = window - downset;
136 while ((c=getopt(nargc, nargv, "m:olhaxpqc:nt:"))!=-1)
141 hilenmin = atoi(optarg);
188 charline = atoi(optarg);
194 maxtrim = atoi(optarg);
197 fprintf(stderr, "Unknown option.\n");
207 /*---------------------------------------------------------------(segment)---*/
209 segseq(seq, segs, offset)
210 struct Sequence *seq;
211 struct Segment **segs;
214 {struct Segment *seg, *leftsegs;
215 struct Sequence *leftseq;
216 int first, last, lowlim;
218 int leftend, rightend, lend, rend;
219 double *H, *seqent();
225 last = seq->length - upset;
228 for (i=first; i<=last; i++)
230 if (H[i]<=locut && H[i]!=-1)
232 loi = findlo(i, lowlim, H);
233 hii = findhi(i, last, H);
235 leftend = loi - downset;
236 rightend = hii + upset - 1;
238 trim(openwin(seq, leftend, rightend-leftend+1), &leftend, &rightend);
240 if (i+upset-1<leftend) /* check for trigger window in left trim */
242 lend = loi - downset;
245 leftseq = openwin(seq, lend, rend-lend+1);
246 leftsegs = (struct Segment *) NULL;
247 segseq(leftseq, &leftsegs, offset+lend);
250 if (*segs==NULL) *segs = leftsegs;
251 else appendseg(*segs, leftsegs);
255 /* trim(openwin(seq, lend, rend-lend+1), &lend, &rend);
256 seg = (struct Segment *) malloc(sizeof(struct Segment));
259 seg->next = (struct Segment *) NULL;
260 if (segs==NULL) segs = seg;
261 else appendseg(segs, seg); */
264 seg = (struct Segment *) malloc(sizeof(struct Segment));
265 seg->begin = leftend + offset;
266 seg->end = rightend + offset;
267 seg->next = (struct Segment *) NULL;
269 if (*segs==NULL) *segs = seg;
270 else appendseg(*segs, seg);
272 i = min(hii, rightend+downset);
274 /* i = hii; this ignores the trimmed residues... */
282 /*----------------------------------------------------------------(seqent)---*/
285 struct Sequence *seq;
287 {struct Sequence *win;
291 if (window>seq->length)
293 return((double *) NULL);
296 H = (double *) malloc(seq->length*sizeof(double));
298 for (i=0; i<seq->length; i++)
303 win = openwin(seq, 0, window);
307 last = seq->length - upset;
309 for (i=first; i<=last; i++)
311 if (seq->punctuation && hasdash(win))
323 /*---------------------------------------------------------------(hasdash)---*/
326 struct Sequence *win;
328 register char *seq, *seqmax;
331 seqmax = seq + win->length;
333 while (seq < seqmax) {
340 /*----------------------------------------------------------------(findlo)---*/
348 for (j=i; j>=limit; j--)
351 if (H[j]>hicut) break;
357 /*----------------------------------------------------------------(findhi)---*/
365 for (j=i; j<=limit; j++)
368 if (H[j]>hicut) break;
374 /*------------------------------------------------------------------(trim)---*/
376 trim(seq, leftend, rightend)
377 struct Sequence *seq;
378 int *leftend, *rightend;
380 {struct Sequence *win;
381 double prob, minprob;
386 /* fprintf(stderr, "%d %d\n", *leftend, *rightend); */
389 rend = seq->length - 1;
391 if ((seq->length-maxtrim)>minlen) minlen = seq->length-maxtrim;
394 for (len=seq->length; len>minlen; len--)
396 win = openwin(seq, 0, len);
402 prob = getprob(win->state, len);
409 shift = shiftwin1(win);
415 /* fprintf(stderr, "%d-%d ", *leftend, *rightend); */
417 *leftend = *leftend + lend;
418 *rightend = *rightend - (seq->length - rend - 1);
420 /* fprintf(stderr, "%d-%d\n", *leftend, *rightend); */
426 /*---------------------------------------------------------------(getprob)---*/
428 double getprob(sv, total)
434 #define LN20 2.9957322735539909
435 totseq = ((double) total) * LN20;
437 ans = lnass(sv) + lnperm(sv, total) - totseq;
442 /*----------------------------------------------------------------(lnperm)---*/
444 double lnperm(sv, tot)
453 for (i=0; sv[i]!=0; i++)
461 /*-----------------------------------------------------------------(lnass)---*/
467 register int svi, svim1;
468 register int class, total;
478 for (i=0;; svim1 = svi) {
483 else if ((svi = *++sv) == svim1) {
504 /*-------------------------------------------------------------(mergesegs)---*/
507 struct Sequence *seq;
508 struct Segment *segs;
510 {struct Segment *seg, *nextseg;
513 if (overlaps) return;
514 if (segs==NULL) return;
516 if (segs->begin<hilenmin) segs->begin = 0;
521 while (nextseg!=NULL)
523 if (seg->end>=nextseg->begin) /* overlapping segments */
525 seg->end = nextseg->end;
526 seg->next = nextseg->next;
531 len = nextseg->begin - seg->end - 1;
532 if (len<hilenmin) /* short hient segment */
534 seg->end = nextseg->end;
535 seg->next = nextseg->next;
544 len = seq->length - seg->end - 1;
545 if (len<hilenmin) seg->end = seq->length - 1;
550 /*----------------------------------------------------------------(report)---*/
553 struct Sequence *seq;
554 struct Segment *segs;
556 {struct Sequence *subseq;
557 struct Segment *seg, *nextseg;
564 seqout(seq, hi, 1, seq->length);
565 /* fputc('\n', stdout); -- for spacing after each sequence */
571 subseq = openwin(seq, 0, segs->begin);
573 seqout(subseq, hi, 1, segs->begin);
577 for (seg=segs; seg!=NULL; seg=seg->next)
579 subseq = openwin(seq, seg->begin, seg->end-seg->begin+1);
581 seqout(subseq, lo, seg->begin+1, seg->end+1);
591 if (nextseg->begin<=seg->end)
593 fprintf(stderr, "Overlapping segments: %s\n", seq->id);
597 if (nextseg->begin==seg->end+1)
602 subseq = openwin(seq, seg->end+1, nextseg->begin-seg->end-1);
604 seqout(subseq, hi, seg->end+2, nextseg->begin);
608 if (seg->end+1==seq->length)
610 /* fputc('\n', stdout); -- for spacing after each sequence */
614 subseq = openwin(seq, seg->end+1, seq->length-seg->end-1);
616 seqout(subseq, hi, seg->end+2, seq->length);
619 /* fputc('\n', stdout); -- for spacing after each sequence */
623 /*------------------------------------------------------------(singreport)---*/
625 singreport(seq, segs)
626 struct Sequence *seq;
627 struct Segment *segs;
629 char *proseq, *proseqmax;
631 int begin, end, i, ctr;
634 proseqmax = proseq + seq->length;
635 upper(proseq, seq->length);
637 for (seg=segs; seg!=NULL; seg=seg->next) {
640 memset(proseq + begin, 'x', end - begin +1);
643 fprintf(stdout, "%s\n", seq->header);
645 for (i=0, ctr=0; proseq < proseqmax; ++i, ++ctr, ++proseq) {
650 putc(*proseq, stdout);
654 if (putc('\n', stdout) == EOF) {
655 fprintf(stderr, "premature EOF on write\n");
660 /*----------------------------------------------------------(prettyreport)---*/
662 prettyreport(seq, segs)
663 struct Sequence *seq;
664 struct Segment *segs;
667 char *proseq, *proseqmax;
670 int begin, end, i, ctr;
673 leftspace = (int) ceil(log10((double)seq->length));
674 sprintf(format, "%%%dd ", leftspace);
676 upper(proseq = seq->seq, seq->length);
678 for (seg=segs; seg!=NULL; seg=seg->next) {
681 lower(proseq + begin, end - begin + 1);
684 fprintf(stdout, "%s\n", seq->header);
687 for (i=0, ctr=1; i<charline; ++i, ++ctr) {
696 fprintf(stdout, format, 1);
698 proseqmax = proseq + seq->length;
699 for (i=0, ctr=0; proseq < proseqmax; ++i, ++ctr, ++proseq) {
702 fprintf(stdout, format, i+1);
705 putc(*proseq, stdout);
708 fprintf(stdout, " %d\n", seq->length);
709 if (putc('\n', stdout) == EOF) {
710 fprintf(stderr, "premature EOF on write\n");
715 /*---------------------------------------------------------(pretreereport)---*/
717 pretreereport(seq, segs)
718 struct Sequence *seq;
719 struct Segment *segs;
722 struct Sequence *win;
724 char buffer[100], leftfmt[20], rightfmt[20];
726 int i, left, right, len;
727 int current, nextloent;
730 cline = charline / 2;
732 fprintf(stdout, "%s\n\n", seq->header);
733 sprintf(leftfmt, "%%%ds", cline);
734 sprintf(rightfmt, "%%-%ds", cline);
738 for (seg=segs; ; seg=seg->next) {
740 nextloent = seq->length;
742 nextloent = seg->begin;
744 if (current < nextloent) {
746 right = nextloent - 1;
747 len = right - left + 1;
748 win = openwin(seq, left, len);
749 upper(curseq = win->seq, win->length);
752 fprintf(stdout, " %4d-%-4d ", left+1, right+1);
756 fwrite(curseq, len, 1, stdout);
761 fwrite(curseq, cline, 1, stdout);
771 if (seg==NULL) break;
775 len = right - left + 1;
776 win = openwin(seq, left, len);
777 lower(curseq = win->seq, win->length);
781 fprintf(stdout, "%*s", cline-i, "");
782 fwrite(curseq, i, 1, stdout);
783 fprintf(stdout, " %4d-%-4d ", left+1, right+1);
794 fwrite(curseq, i, 1, stdout);
808 /*-----------------------------------------------------------------(space)---*/
813 static char spaces[] =
814 {' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
815 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
816 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
817 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
818 ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' '
823 i = MIN(len, sizeof(spaces)/sizeof(spaces[0]));
824 fwrite(spaces, i, 1, stdout);
829 /*----------------------------------------------------------------(seqout)---*/
831 seqout(seq, hilo, begin, end)
832 struct Sequence *seq;
839 char *proseq, *proseqmax, *id, *header;
840 char outbuf[HDRLEN+1];
845 if (hionly && hilo==lo) return;
846 if (loonly && hilo==hi) return;
849 proseqmax = proseq + seq->length;
851 if (id==NULL) id = seq->parent->id;
852 header = seq->header;
853 if (header==NULL) header = seq->parent->header;
855 iend = findchar(header, ' ');
856 if (iend!=-1) header = header+iend;
860 fprintf(stdout, ">%s(%d-%d)", id, begin, end);
861 /* if (iend!=-1 && strlen(header)<=HDRLEN) fprintf(stdout, "%s", header);
862 else if (iend!=-1) for (i=0; i<HDRLEN; i++) putc(header[i], stdout); */
863 fprintf(stdout, " complexity=%4.2f (%d/%4.2f/%4.2f)\n",
864 seq->entropy, window, locut, hicut);
868 fprintf(stdout, ">%s(%d-%d)", id, begin, end);
869 if (iend!=-1) /* fprintf(stdout, "%s\n", header); */
871 i = MIN(HDRLEN, strlen(header));
872 fwrite(header, i, 1, stdout);
875 else putc('\n', stdout);
880 lower(proseq, seq->length);
882 else if (hilo==hi && seq->length>=hilenmin)
884 upper(proseq, seq->length);
888 lower(proseq, seq->length);
891 for (; proseq < proseqmax; proseq+=i) {
892 i = MIN(charline, proseqmax - proseq);
893 fwrite(proseq, i, 1, stdout);
897 if (putc('\n', stdout) == EOF) {
898 fprintf(stderr, "premature EOF on write\n");
903 /*-------------------------------------------------------------(appendseg)---*/
906 struct Segment *segs, *seg;
908 {struct Segment *temp;
913 if (temp->next==NULL)
927 /*--------------------------------------------------------------(freesegs)---*/
930 struct Segment *segs;
932 {struct Segment *temp;
942 /*-----------------------------------------------------------------(usage)---*/
948 Usage: seg <file> <window> <locut> <hicut> <options>\n\
949 <file> - filename containing fasta-formatted sequence(s) \n\
950 <window> - OPTIONAL window size (default 12) \n\
951 <locut> - OPTIONAL low (trigger) complexity (default 2.2) \n\
952 <hicut> - OPTIONAL high (extension) complexity (default locut + 0.3) \n\
954 -x each input sequence is represented by a single output \n\
955 sequence with low-complexity regions replaced by \n\
956 strings of 'x' characters \n\
957 -c <chars> number of sequence characters/line (default 60)\n\
958 -m <size> minimum length for a high-complexity segment \n\
959 (default 0). Shorter segments are merged with adjacent \n\
960 low-complexity segments \n\
961 -l show only low-complexity segments (fasta format) \n\
962 -h show only high-complexity segments (fasta format) \n\
963 -a show all segments (fasta format) \n\
964 -n do not add complexity information to the header line \n\
965 -o show overlapping low-complexity segments (default merge) \n\
966 -t <maxtrim> maximum trimming of raw segment (default 100) \n\
967 -p prettyprint each segmented sequence (tree format) \n\
968 -q prettyprint each segmented sequence (block format) \n");