3 Helper functions frelated to alignments
5 /* Last changed Time-stamp: <2006-01-16 11:42:38 ivo> */
15 #include "fold_vars.h"
18 static char rcsid[] = "$Id: aln_util.c,v 1.4 2007/02/02 15:18:13 ivo Exp $";
20 #define MAX_NUM_NAMES 500
21 int read_clustal(FILE *clust, char *AlignedSeqs[], char *names[]) {
22 char *line, name[100]="", *seq;
23 int n, nn=0, num_seq = 0, i;
25 if ((line=get_line(clust)) == NULL) {
26 fprintf(stderr, "Empty CLUSTAL file\n"); return 0;
29 if ((strncmp(line,"CLUSTAL", 7) !=0) && (!strstr(line,"STOCKHOLM"))) {
30 fprintf(stderr, "This doesn't look like a CLUSTAL/STOCKHOLM file, sorry\n");
34 line = get_line(clust);
37 if(strncmp(line, "//", 2) == 0){
42 if (((n=strlen(line))<4) || isspace((int)line[0])) {
43 /* skip non-sequence line */
44 free(line); line = get_line(clust);
45 nn=0; /* reset seqence number */
51 line = get_line(clust);
55 seq = (char *) space( (n+1)*sizeof(char) );
56 sscanf(line,"%99s %s", name, seq);
58 for(i=0;i<strlen(seq);i++){
59 if(seq[i] == '.') seq[i] = '-'; /* replace '.' gaps by '-' */
60 /* comment the next line and think about something more difficult to deal with
61 lowercase sequence letters if you really want to */
62 seq[i] = toupper(seq[i]);
65 if (nn == num_seq) { /* first time */
66 names[nn] = strdup(name);
67 AlignedSeqs[nn] = strdup(seq);
70 if (strcmp(name, names[nn])!=0) {
71 /* name doesn't match */
73 "Sorry, your file is fucked up (inconsitent seq-names)\n");
74 free(line); free(seq);
77 AlignedSeqs[nn] = (char *)
78 xrealloc(AlignedSeqs[nn], strlen(seq)+strlen(AlignedSeqs[nn])+1);
79 strcat(AlignedSeqs[nn], seq);
82 if (nn>num_seq) num_seq = nn;
85 if (num_seq>=MAX_NUM_NAMES) {
86 fprintf(stderr, "Too many sequences in CLUSTAL/STOCKHOLM file");
90 line = get_line(clust);
93 AlignedSeqs[num_seq] = NULL;
94 names[num_seq] = NULL;
96 fprintf(stderr, "No sequences found in CLUSTAL/STOCKHOLM file\n");
99 n = strlen(AlignedSeqs[0]);
100 for (nn=1; nn<num_seq; nn++) {
101 if (strlen(AlignedSeqs[nn])!=n) {
102 fprintf(stderr, "Sorry, your file is fucked up.\n"
103 "Unequal lengths!\n\n");
108 fprintf(stderr, "%d sequences; length of alignment %d.\n", nn, n);
112 char *consensus(const char *AS[]) {
113 /* simple consensus sequence (most frequent character) */
117 string = (char *) space((n+1)*sizeof(char));
118 for (i=0; i<n; i++) {
119 int s,c,fm, freq[8] = {0,0,0,0,0,0,0,0};
120 for (s=0; AS[s]!=NULL; s++)
121 freq[encode_char(AS[s][i])]++;
122 for (s=c=fm=0; s<8; s++) /* find the most frequent char */
123 if (freq[s]>fm) {c=s, fm=freq[c];}
124 if (s>4) s++; /* skip T */
125 string[i]=Law_and_Order[c];
130 /* IUP nucleotide classes indexed by a bit string of the present bases */
131 /* A C AC G AG CG ACG U AU CU ACU GU AGU CGU ACGU */
132 static char IUP[17] = "-ACMGRSVUWYHKDBN";
133 char *consens_mis(const char*AS[]) {
134 /* MIS displays the 'most informative sequence' (Freyhult et al 2004),
135 elements in columns with frequency greater than the background
136 frequency are projected into iupac notation. Columns where gaps are
137 over-represented are in lower case. */
141 int bgfreq[8] = {0,0,0,0,0,0,0,0};
144 for (N=0; AS[N]!=NULL; N++);
145 cons = (char *) space((n+1)*sizeof(char));
148 for (s=0; s<N; s++) {
149 c = encode_char(AS[s][i]);
154 for (i=0; i<n; i++) {
155 int freq[8] = {0,0,0,0,0,0,0,0};
157 for (s=0; s<N; s++) {
158 c = encode_char(AS[s][i]);
162 for (c=4; c>0; c--) {
164 if (freq[c]*n>=bgfreq[c]) code++;
167 if (freq[0]*n>bgfreq[0])
168 cons[i] = tolower(IUP[code]);