JPRED-2 Add sources of all binaries (except alscript) to Git
[jpred.git] / sources / seg / genwin.c
diff --git a/sources/seg/genwin.c b/sources/seg/genwin.c
new file mode 100644 (file)
index 0000000..fbf4d0e
--- /dev/null
@@ -0,0 +1,878 @@
+
+/*****************************************************************************/
+/***   (genwin.c)                                                          ***/
+/*****************************************************************************/
+
+/*--------------------------------------------------------------(includes)---*/
+
+#include "genwin.h"
+
+/*---------------------------------------------------------------(defines)---*/
+
+#define STRSIZE 100
+
+/*----------------------------------------------------------------(protos)---*/
+
+struct Sequence *readentry();
+
+/*---------------------------------------------------------------(globals)---*/
+
+char *blastdbs[] =
+  {"bba", "bbn", "embl", "gbupdate", "genbank", "genpept", "gpupdate",
+   "nr", "nrdb", "nrdb.shuf", "pir", "pseq", "swissprot", "tfdaa"};
+
+int nblastdbs = 14;
+
+#ifndef MIN
+#define MIN(a,b) ((a) <= (b) ? (a) : (b))
+#endif
+
+char *blastdir = "/net/cruncher/usr/ncbi/db/fasta/";
+char *indexdir = "/net/cruncher/usr/ncbi/db/index/";
+
+int nabets;
+struct Alphabet **abets;
+int ntvecs;
+struct TransVector **tvecs;
+int nsvecs;
+struct ScoreVector **svecs;
+int nsmats;
+struct ScoreMatrix **smats;
+
+int aaindex[128];
+unsigned char  aaflag[128];
+char aachar[20];
+
+struct strlist
+  {
+   char string[STRSIZE];
+   struct strlist *next;
+  } *str, *curstr;
+
+/*---------------------------------------------------------------(tmalloc)---*/
+
+#define TESTMAX 1000
+void *tmalloc();
+int record_ptrs[TESTMAX] = {0,0,0,0};
+int rptr = 0;
+
+/*------------------------------------------------------------(genwininit)---*/
+
+genwininit()
+{
+       char    *cp, *cp0;
+       int             i;
+       char    c;
+
+       for (i = 0; i < sizeof(aaindex)/sizeof(aaindex[0]); ++i) {
+               aaindex[i] = 20;
+               aaflag[i] = TRUE;
+       }
+               
+       for (cp = cp0 = "ACDEFGHIKLMNPQRSTVWY"; (c = *cp) != '\0'; ++cp) {
+               i = cp - cp0;
+               aaindex[c] = i;
+               aaindex[tolower(c)] = i;
+               aachar[i] = tolower(c);
+               aaflag[c] = FALSE;
+               aaflag[tolower(c)] = FALSE;
+       }
+       return;
+}
+        
+/*-------------------------------------------------------------(opendbase)---*/
+
+extern struct Database *opendbase(name)
+  char *name;
+
+  {struct Database *dbase;
+
+   dbase = (struct Database *) malloc(sizeof(struct Database));
+
+   if (blastdb(name))
+     {
+      dbase->filename = (char *) malloc(strlen(blastdir)+strlen(name)+1);
+      dbase->indexname = (char *) malloc(strlen(indexdir)+strlen(name)+1);
+      strcpy(dbase->filename, blastdir);
+      strcat(dbase->filename, name);
+      strcpy(dbase->indexname, indexdir);
+      strcat(dbase->indexname, name);
+     }
+   else
+     {
+      dbase->filename = (char *) malloc(strlen(name)+1);
+      dbase->indexname = (char *) malloc(strlen(name)+1);
+      strcpy(dbase->filename, name);
+      strcpy(dbase->indexname, name);
+     }
+
+   if (strcmp(dbase->filename, "-")==0)
+     {
+      dbase->fp = stdin;
+     }
+   else if ((dbase->fp=fopen(dbase->filename, "r"))==NULL)
+     {
+      free(dbase->filename);
+      free(dbase->indexname);
+      free(dbase);
+      return((struct Database *) NULL);
+     }
+
+   dbase->filepos = 0L;
+
+   return(dbase);
+  }
+
+/*---------------------------------------------------------------(blastdb)---*/
+
+int blastdb(name)
+  char *name;
+
+  {int i;
+
+   for (i=0; i<nblastdbs; i++)
+     {
+      if (strcmp(name, blastdbs[i])==0) {return(TRUE);}
+     }
+
+   return(FALSE);
+  }
+
+/*------------------------------------------------------------(closedbase)---*/
+
+extern closedbase(dbase)
+  struct Database *dbase;
+
+  {
+   fclose(dbase->fp);
+   free(dbase->filename);
+   free(dbase->indexname);
+   free(dbase);
+
+   return;
+  }
+
+/*--------------------------------------------------------------(firstseq)---*/
+
+extern struct Sequence *firstseq(dbase)
+  struct Database *dbase;
+
+  {
+   if (dbase->filepos!=0L)
+     {
+      dbase->filepos = 0L;
+      if (fseek(dbase->fp, dbase->filepos, 0)!=0)
+        {fprintf(stderr, "Error positioning file %s for firstseq.\n",
+                           dbase->filename);
+         exit(1);}
+     }
+
+   return(readentry(dbase));
+  }
+
+/*---------------------------------------------------------------(nextseq)---*/
+
+extern struct Sequence *nextseq(dbase)
+  struct Database *dbase;
+
+  {
+   return(readentry(dbase));
+  }
+
+/*--------------------------------------------------------------(closeseq)---*/
+
+extern closeseq(seq)
+  struct Sequence *seq;
+
+  {
+   if (seq==NULL) return;
+
+   if (seq->id!=NULL)          free(seq->id);
+   if (seq->name!=NULL)        free(seq->name);
+   if (seq->organism!=NULL)    free(seq->organism);
+   if (seq->header!=NULL)      free(seq->header);
+   if (seq->state!=NULL)       free(seq->state);
+   if (seq->composition!=NULL) free(seq->composition);
+   free(seq->seq);
+   free(seq);
+   return;
+  }
+
+/*---------------------------------------------------------------(openwin)---*/
+
+extern struct Sequence *openwin(parent, start, length)
+  struct Sequence *parent;
+  int start, length;
+
+  {struct Sequence *win;
+   int i;
+
+   if (start<0 || length<0 || start+length>parent->length)
+     {
+      return((struct Sequence *) NULL);
+     }
+
+   win = (struct Sequence *) malloc(sizeof(struct Sequence));
+
+/*---                                          ---[set links, up and down]---*/
+
+   win->parent = parent;
+   if (parent->root==NULL)
+     {win->root = parent;}
+   else
+     {win->root = parent->root;}
+   win->children = (struct Sequence **) NULL;
+
+/* parent->children = ***foo***                   ---[not yet implemented]---*/
+
+   win->id = (char *) NULL;
+   win->name = (char *) NULL;
+   win->organism = (char *) NULL;
+   win->header = (char *) NULL;
+
+/*---                          ---[install the local copy of the sequence]---*/
+
+   win->start = start;
+   win->length = length;
+#if 0
+   win->seq = (char *) malloc(sizeof(char)*length + 1);
+   memcpy(win->seq, (parent->seq)+start, length);
+   win->seq[length] = '\0';
+#else
+       win->seq = parent->seq + start;
+#endif
+
+/*---                          ---[setup window implementation parameters]---*/
+
+/*---                                                 ---[set local flags]---*/
+
+       win->rubberwin = FALSE;
+       win->floatwin = FALSE;
+       win->punctuation = FALSE;
+
+/*---                                   ---[initially unconfiguerd window]---*/
+
+       win->entropy = -2.;
+       win->state = (int *) NULL;
+       win->composition = (int *) NULL;
+       win->classvec = (char *) NULL;
+       win->scorevec = (double *) NULL;
+
+       stateon(win);
+
+       return win;
+}
+
+/*---------------------------------------------------------------(nextwin)---*/
+
+extern struct Sequence *nextwin(win, shift)
+  struct Sequence *win;
+  int shift;
+
+  {
+   if ((win->start+shift)<0 ||
+       (win->start+win->length+shift)>win->parent->length)
+     {
+      return((struct Sequence *) NULL);
+     }
+   else
+     {
+      return(openwin(win->parent, win->start+shift, win->length));
+     }
+  }
+
+/*--------------------------------------------------------------(shiftwin1)---*/
+static void    decrementsv(), incrementsv();
+
+extern int shiftwin1(win)
+       struct Sequence *win;
+{
+       register int    j, length;
+       register int    *comp;
+
+       length = win->length;
+       comp = win->composition;
+
+       if ((++win->start + length) > win->parent->length) {
+               --win->start;
+               return FALSE;
+       }
+
+       if (!aaflag[j = win->seq[0]])
+               decrementsv(win->state, comp[aaindex[j]]--);
+
+       j = win->seq[length];
+       ++win->seq;
+
+       if (!aaflag[j])
+               incrementsv(win->state, comp[aaindex[j]]++);
+
+       if (win->entropy > -2.)
+               win->entropy = entropy(win->state);
+
+       return TRUE;
+}
+
+/*--------------------------------------------------------------(closewin)---*/
+
+extern closewin(win)
+  struct Sequence *win;
+
+  {
+   if (win==NULL) return;
+
+   if (win->state!=NULL)       free(win->state);
+   if (win->composition!=NULL) free(win->composition);
+   if (win->classvec!=NULL)    free(win->classvec);
+   if (win->scorevec!=NULL)    free(win->scorevec);
+
+   free(win);
+   return;
+  }
+
+/*----------------------------------------------------------------(compon)---*/
+
+extern compon(win)
+       struct Sequence *win;
+{
+       register int    *comp;
+       register int    aa;
+       register char   *seq, *seqmax;
+
+       win->composition = comp = (int *) calloc(20*sizeof(*comp), 1);
+       seq = win->seq;
+       seqmax = seq + win->length;
+
+       while (seq < seqmax) {
+               aa = *seq++;
+               if (!aaflag[aa])
+                       comp[aaindex[aa]]++;
+       }
+
+       return;
+}
+
+/*---------------------------------------------------------------(stateon)---*/
+
+static int state_cmp(s1, s2)
+       int     *s1, *s2;
+{
+       return *s2 - *s1;
+}
+
+extern stateon(win)
+       struct Sequence *win;
+{
+       register int    aa, nel, c;
+
+       if (win->composition == NULL)
+               compon(win);
+
+       win->state = (int *) malloc(21*sizeof(win->state[0]));
+
+       for (aa = nel = 0; aa < 20; ++aa) {
+               if ((c = win->composition[aa]) == 0)
+                       continue;
+               win->state[nel++] = c;
+       }
+       for (aa = nel; aa < 21; ++aa)
+               win->state[aa] = 0;
+
+       qsort(win->state, nel, sizeof(win->state[0]), state_cmp);
+
+       return;
+}
+
+/*-----------------------------------------------------------------(enton)---*/
+
+extern enton(win)
+  struct Sequence *win;
+
+  {
+   if (win->state==NULL) {stateon(win);}
+
+   win->entropy = entropy(win->state);
+
+   return;
+  }
+
+/*---------------------------------------------------------------(entropy)---*/
+static int             thewindow;
+static double  *entray;
+
+#define LN2    0.69314718055994530941723212145818
+
+void
+entropy_init(window)
+       int     window;
+{
+       int             i;
+       double  x, xw;
+
+       entray = (double *)malloc((window+1) * sizeof(*entray));
+       xw = window;
+       for (i = 1; i <= window; ++i) {
+               x = i / xw;
+               entray[i] = -x * log(x) / LN2;
+       }
+
+       thewindow = window;
+}
+
+extern double entropy(sv)
+       register int    *sv;
+{
+       int     *sv0 = sv;
+       register double ent;
+       register int    i, total;
+       register int    *svmax;
+       register double xtotrecip, xsv;
+
+       for (total = 0; (i = *sv) != 0; ++sv)
+               total += i;
+       svmax = sv;
+       ent = 0.0;
+       if (total == thewindow) {
+               for (sv = sv0; sv < svmax; ) {
+                       ent += entray[*sv++];
+               }
+               return ent;
+       }
+       if (total == 0)
+               return 0.;
+
+       xtotrecip = 1./(double)total;
+       for (sv = sv0; sv < svmax; ) {
+               xsv = *sv++;
+               ent += xsv * log(xsv * xtotrecip);
+       }
+       return -ent * xtotrecip / LN2;
+}
+
+/*-----------------------------------------------------------(decrementsv)---*/
+
+static void
+decrementsv(sv, class)
+       register int    *sv;
+       register int    class;
+{
+       register int    svi;
+
+       while ((svi = *sv++) != 0) {
+               if (svi == class && *sv < class) {
+                       sv[-1] = svi - 1;
+                       break;
+               }
+       }
+}
+
+/*-----------------------------------------------------------(incrementsv)---*/
+
+static void
+incrementsv(sv, class)
+       register int    *sv;
+       int     class;
+{
+       for (;;) {
+               if (*sv++ == class) {
+                       sv[-1]++;
+                       break;
+               }
+       }
+}
+
+/*-------------------------------------------------------------(readentry)---*/
+
+struct Sequence *readentry(dbase)
+  struct Database *dbase;
+
+  {struct Sequence *seq;
+   int c;
+
+   seq = (struct Sequence *) malloc(sizeof(struct Sequence));
+
+   seq->db = dbase;
+
+/*---                                    ---[backpointers null at the top]---*/
+
+   seq->parent = (struct Sequence *) NULL;
+   seq->root = (struct Sequence *) NULL;
+   seq->children = (struct Sequence **) NULL;
+
+/*---                                                       ---[set flags]---*/
+
+   seq->rubberwin = FALSE;
+   seq->floatwin = FALSE;
+
+/*---                                                  ---[read from file]---*/
+
+   if (!readhdr(seq))
+     {
+      return((struct Sequence *) NULL);
+     }
+   while (1)  /*---[skip multiple headers]---*/
+     {
+      c = getc(dbase->fp);
+         if (c == EOF)
+               break;
+      if (c != '>') {
+         ungetc(c, dbase->fp);
+         break;
+               }
+      while ((c=getc(dbase->fp)) != EOF && c !='\n')
+               ;
+               if (c == EOF)
+                       break;
+     }
+   readseq(seq);
+
+/*---                                   ---[set implementation parameters]---*/
+
+/*---                                          ---[initially unconfigured]---*/
+
+   seq->entropy = -2.;
+   seq->state = (int *) NULL;
+   seq->composition = (int *) NULL;
+   seq->classvec = (char *) NULL;
+   seq->scorevec = (double *) NULL;
+
+   return(seq);
+  }
+
+/*---------------------------------------------------------------(readhdr)---*/
+
+readhdr(seq)
+  struct Sequence *seq;
+
+  {FILE *fp;
+   char *bptr, *curpos;
+   int c, i, itotal;
+   int idend, namend, orgend;
+
+   fp = seq->db->fp;
+
+   if ((c=getc(fp)) == EOF)
+     {
+      free(seq);
+      return(FALSE);
+     }
+   
+   while (c != EOF && isspace(c))
+     {
+      c = getc(fp);
+     }
+
+   if (c!='>')
+     {fprintf(stderr, "Error reading fasta format - '>' not found.\n");
+      exit(1);}
+   ungetc(c, fp);
+/*                                               ---[read the header line]---*/
+   str = (struct strlist *) malloc (sizeof(struct strlist));
+   str->next = NULL;
+   curstr = str;
+
+   for (i=0,itotal=0,c=getc(fp); c != EOF; c=getc(fp))
+     {
+      if (c=='\n') break;
+
+      if (i==STRSIZE-1)
+        {curstr->string[i] = '\0';
+         curstr->next = (struct strlist *) malloc (sizeof(struct strlist));
+         curstr = curstr->next;
+         curstr->next = NULL;
+         i = 0;}
+
+      curstr->string[i] = c;
+      itotal++;
+      i++;
+     }
+
+   curstr->string[i] = '\0';
+   seq->header = (char *) malloc (itotal+2);
+   seq->header[0] = '\0';
+
+   for (curstr=str, curpos=seq->header; curstr!=NULL;)
+     {
+      if (curstr->next==NULL)
+        {memccpy(curpos, curstr->string, '\0', STRSIZE);}
+      else
+        {memccpy(curpos, curstr->string, '\0', STRSIZE-1);}
+
+      str = curstr;
+      curstr = curstr->next;
+      free (str);
+
+      if (curstr!=NULL) {curpos = curpos+STRSIZE-1;}
+     }
+
+   bptr = (seq->header)+1;
+   seq->name = (char *) NULL;
+   seq->organism = (char *) NULL;
+/*                                                   ---[parse out the id]---*/
+   idend = findchar(bptr, ' ');
+   if (idend==-1) {idend = findchar(bptr, '\n');}
+   if (idend==-1) {idend = findchar(bptr, '\0');}
+   if (idend==-1)
+     {fprintf(stderr, "Error parsing header line - id.\n");
+      fputs(seq->header, fp);
+      exit(1);}   
+
+   seq->id = (char *) malloc((idend+1)*sizeof(char));
+   memcpy(seq->id, bptr, idend);
+   seq->id[idend] = '\0';
+
+   if (bptr[idend]=='\n' || bptr[idend]=='\0') {return(TRUE);}
+
+/*                                         ---[parse out the protein name]---*/
+   bptr = bptr + idend + 1;
+   while (bptr[0]==' ') {bptr++;}
+
+   namend = findchar(bptr, '-');
+   if (namend==-1) {namend = findchar(bptr, '\n');}
+   if (namend==-1) {namend = findchar(bptr, '\0');}
+   if (namend==-1)
+     {fprintf(stderr, "Error parsing header line - name.\n");
+      fputs(seq->header, fp);
+      return(TRUE);}
+
+   seq->name = (char *) malloc((namend+1)*sizeof(char));
+   memcpy(seq->name, bptr, namend);
+   seq->name[namend] = '\0';
+
+   if (bptr[namend]=='\n' || bptr[namend]=='\0') {return(TRUE);}
+
+/*                                                 ---[parse out organism]---*/
+   bptr = bptr + namend + 1;
+   while (bptr[0]==' ') {bptr++;}
+
+   orgend = findchar(bptr, '|');
+   if (orgend==-1) {orgend = findchar(bptr, '#');}
+   if (orgend==-1) {orgend = findchar(bptr, '\n');}
+   if (orgend==-1) {orgend = findchar(bptr, '\0');}
+   if (orgend==-1)
+     {fprintf(stderr, "Error parsing header line - organism.\n");
+      fputs(seq->header, fp);
+      return(TRUE);}
+
+   seq->organism = (char *) malloc((orgend+1)*sizeof(char));
+   memcpy(seq->organism, bptr, orgend);
+   seq->organism[orgend] = '\0';
+
+/*                                    ---[skip over multiple header lines]---*/
+   while (TRUE)
+     {
+      c = getc(fp);
+         if (c == EOF)
+               return(TRUE);
+      if (c=='>')
+        {
+         skipline(fp);
+        }
+      else
+        {
+         ungetc(c,fp);
+         break;
+        }
+     }
+
+   return(TRUE);
+  }
+
+/*--------------------------------------------------------------(skipline)---*/
+
+skipline(fp)
+  FILE *fp;
+
+  {int c;
+
+   while ((c=getc(fp))!='\n' && c!=EOF)
+     ;
+
+   return;
+  }
+
+/*--------------------------------------------------------------(findchar)---*/
+
+extern int findchar(str, chr)
+  char *str;
+  char chr;
+
+  {int i;
+
+   for (i=0; ; i++)
+     {
+      if (str[i]==chr)
+        {
+         return(i);
+        }
+      if (str[i]=='\0')
+        {
+         return(-1);
+        }
+     }
+   }
+
+/*---------------------------------------------------------------(readseq)---*/
+
+readseq(seq)
+  struct Sequence *seq;
+
+{FILE *fp;
+   int i, itotal;
+   int c;
+   char *curpos;
+
+   fp = seq->db->fp;
+
+   seq->punctuation = FALSE;   
+
+   str = (struct strlist *) malloc (sizeof(struct strlist));
+   str->next = NULL;
+   curstr = str;
+
+       for (i = 0, itotal = 0, c = getc(fp); c != EOF; c = getc(fp)) {
+               if (!aaflag[c]) {
+Keep:
+                       if (i < STRSIZE-1) {
+                               curstr->string[i++] = c;
+                               continue;
+                       }
+                       itotal += STRSIZE-1;
+                       curstr->string[STRSIZE-1] = '\0';
+                       curstr->next = (struct strlist *) malloc(sizeof(*curstr));
+                       curstr = curstr->next;
+                       curstr->next = NULL;
+                       curstr->string[0] = c;
+                       i = 1;
+                       continue;
+        }
+
+               switch (c) {
+               case '>':
+                       ungetc(c, fp);
+                       goto EndLoop;
+               case '*': case '-':
+                       seq->punctuation = TRUE;
+                       goto Keep;
+               case 'b': case 'B':
+               case 'u': case 'U': /* selenocysteine */
+               case 'x': case 'X':
+               case 'z': case 'Z':
+                       goto Keep;
+               default:
+                       continue;
+               }
+       }
+EndLoop:
+       itotal += i;
+
+       curstr->string[i] = '\0';
+       seq->seq = (char *) malloc (itotal+2);
+       seq->seq[0] = '\0';
+
+       for (curstr = str, curpos = seq->seq; curstr != NULL;) {
+               if (curstr->next == NULL)
+                       memccpy(curpos, curstr->string, '\0', STRSIZE);
+               else
+               memccpy(curpos, curstr->string, '\0', STRSIZE-1);
+
+               str = curstr;
+               curstr = curstr->next;
+               free(str);
+
+               if (curstr != NULL)
+                       curpos = curpos+STRSIZE-1;
+       }
+
+       seq->length = strlen(seq->seq);
+
+       return;
+}
+/*-----------------------------------------------------------------(upper)---*/
+
+extern upper(string, len)
+       register char   *string;
+       size_t  len;
+{
+       register char   *stringmax, c;
+
+       for (stringmax = string + len; string < stringmax; ++string)
+               if (islower(c = *string))
+                       *string = toupper(c);
+}
+
+/*-----------------------------------------------------------------(lower)---*/
+
+extern lower(string, len)
+       char    *string;
+       size_t  len;
+{
+       register char   *stringmax, c;
+
+       for (stringmax = string + len; string < stringmax; ++string)
+               if (isupper(c = *string))
+                       *string = tolower(c);
+}
+
+/*-------------------------------------------------------------------(min)---*/
+
+int min(a, b)
+  int a, b;
+
+  {
+   if (a<b) {return(a);}
+   else {return(b);}
+  }
+
+/*-------------------------------------------------------------------(max)---*/
+
+int max(a, b)
+  int a, b;
+
+  {
+   if (a<b) {return(b);}
+   else {return(a);}
+  }
+
+/*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------(tmalloc)---*/
+
+void *tmalloc(size)
+  size_t size;
+
+  {void *ptr;
+
+   ptr = (void *) malloc(size);
+
+   if (rptr>TESTMAX)
+     {
+      exit(2);
+     }
+
+   record_ptrs[rptr] = (int) ptr;
+   rptr++;
+
+   return(ptr);
+  }
+
+/*-----------------------------------------------------------------(tfree)---*/
+
+tfree(ptr)
+  void *ptr;
+
+  {int i;
+
+   for (i=0; i<rptr; i++)
+     {
+      if (record_ptrs[i]==(int)ptr)
+        {
+         record_ptrs[i] = 0;
+         break;
+        }
+      }
+
+   free(ptr);
+  }
+
+/*---------------------------------------------------------------------------*/