--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "dp_lib_header.h"
+#include "define_header.h"
+
+int aln_has_stockholm_structure (Alignment *A)
+{
+ return name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100);
+}
+
+int get_aln_stockholm_structure (Alignment *A)
+{
+ int i;
+ if ((i=aln_has_stockholm_structure(A))==-1)
+ A=add_alifold2aln (A, NULL);
+ return aln_has_stockholm_structure(A);
+}
+int ** update_RNAfold_list (Alignment *A, int **pos, int s, int **l)
+{
+ int a=0;
+ while (l[a])
+ {
+ if (!is_gap(A->seq_al[s][l[a][0]]) && !is_gap (A->seq_al[s][l[a][1]]))
+ {
+ l[a][2]=pos[s][l[a][0]];
+ l[a][3]=pos[s][l[a][1]];
+ }
+ else
+ {
+ l[a][2]=l[a][3]=-1;
+ }
+ a++;
+ }
+ return l;
+}
+
+Alignment *compare_RNA_fold ( Alignment *A, Alignment *B)
+{
+ int i1, i2, i;
+ int **l1, **l2;
+ int **pos1, **pos2;
+ int a, b;
+ int tot_ol=0, tot_l=0;
+
+ i1=get_aln_stockholm_structure (A);
+ i2=get_aln_stockholm_structure (B);
+
+ l1=vienna2list (A->seq_al[i1]);
+ l2=vienna2list (B->seq_al[i2]);
+
+ pos1=aln2pos_simple(A, A->nseq);
+ pos2=aln2pos_simple(B, B->nseq);
+
+
+
+ for (a=0; a< A->nseq; a++)
+ {
+ char **lu;
+ int ol=0, ll1=0, ll2=0;
+ if ( A->name[a][0]=='#')continue;
+ i=name_is_in_list (A->name[a], B->name, B->nseq, 100);
+ if (i!=-1)
+ {
+ l1=update_RNAfold_list (A,pos1,a, l1);
+ l2=update_RNAfold_list (B,pos2,i, l2);
+ lu=declare_char (A->len_aln, B->len_aln);
+
+ b=0;
+ while (l2[b])
+ {
+
+ if (l2[b][2]==-1 || l2[b][3]==-1);
+ else
+ {
+ ll2++;
+ lu[l2[b][2]][l2[b][3]]=1;
+
+ }
+ b++;
+ }
+ b=0;
+
+ while (l1[b])
+ {
+
+ if (l1[b][2]==-1 || l1[b][3]==-1);
+ else
+ {
+ ll1++;
+ if (lu[l1[b][2]][l1[b][3]]==1)
+ {
+ A->seq_al[a][l1[b][0]]='6';
+ A->seq_al[a][l1[b][1]]='6';
+ ol++;
+ }
+ else
+ {
+ A->seq_al[a][l1[b][0]]='0';
+ A->seq_al[a][l1[b][1]]='0';
+ }
+ }
+ b++;
+ }
+
+ free_char (lu, -1);
+ }
+ tot_ol+=ol;
+ tot_l+=ll1;
+ tot_l+=ll2;
+ fprintf ( stdout, "@@ Seq: %s Overalp: %.2f Al1: %.2f Al2: %.2f \n", A->name[a], (float)(ol*200)/(ll1+ll2), (float)(ol*100)/ll1,(float)(ol*100)/ll2);
+ }
+
+ fprintf ( stdout, "@@ Seq: Tot Overalp: %.2f \n", (float)(tot_ol*200)/(tot_l));
+
+ return A;
+}
+int is_neutral(char c1, char c2);
+int is_watson (char c1, char c2);
+int is_watson2 (char c1, char c2);
+int is_watson (char c1, char c2)
+{
+ c1=tolower (c1);
+ c2=tolower (c2);
+ if ( is_watson2 (c1, c2)) return 1;
+ else return is_watson2 (c2, c1);
+}
+int is_watson2 (char c1, char c2)
+{
+
+ if ( c1=='g' && c2=='c')return 1;
+ else if (c1=='a' && (c2=='t' || c2=='u'))return 1;
+ return 0;
+}
+int is_neutral (char c1, char c2)
+{
+
+ c1=tolower (c1);
+ c2=tolower (c2);
+ if (is_watson (c1, c2)) return 1;
+ else if (c1=='g' && (c2=='t' || c2=='u'))return 1;
+ else if ((c1=='t' || c1=='u') && c2=='g')return 1;
+ return 0;
+}
+
+int ** vienna2list ( char *seq)
+{
+ int a, b, i, i2,l;
+ int **list;
+ l=strlen (seq);
+ list=declare_int (l+1, 8);
+ for (i=0,a=0; a<l; a++)
+ {
+ if ( seq[a]=='(')
+ {
+ list[i][0]=a;
+ for (i2=0,b=a+1; b<l && i2>=0; b++)
+ {
+ if (seq[b]=='(')i2++;
+ else if (seq[b]==')')i2--;
+ }
+ list[i][1]=b-1;
+ i++;
+ }
+ }
+
+ list[i]=NULL;
+ return list;
+}
+Alignment *aln2alifold(Alignment *A)
+{
+ char *tmp1;
+ char *tmp2;
+
+ print_aln (A);
+ tmp1=vtmpnam (NULL);
+ tmp2=vtmpnam (NULL);
+ output_clustal_aln (tmp1,A);
+ printf_system ("RNAalifold %s >%s 2>/dev/null", tmp1, tmp2);
+ return alifold2aln (tmp2);
+}
+
+Alignment *add_alifold2aln (Alignment *A, Alignment *ST)
+{
+ int a,b,c,d,p1,p2;
+ int r1, rr1, r2, rr2;
+ int watson, comp,tot;
+ int **compmat;
+ int max, p,k;
+ int minseq=3;
+ int **list;
+ int ncomp=0, nwatson=0;
+ int cons_l, fold_l;
+ int i,l;
+
+ if (!ST)
+ {
+ char *tmp1, *tmp2;
+ int f;
+ Alignment *T;
+ T=copy_aln (A, NULL);
+ tmp1=vtmpnam (NULL);
+ tmp2=vtmpnam (NULL);
+ cons_l=A->len_aln;
+ for (a=0; a<A->len_aln; a++)
+ {
+ for (f=0,b=0; b<A->nseq && f==0; b++)
+ {
+ if (is_gap (A->seq_al[b][a]))f=1;
+
+ }
+ if (f)
+ {
+ cons_l--;
+ for (b=0; b<A->nseq; b++)T->seq_al[b][a]='-';
+ }
+ }
+ ST=aln2alifold (T);
+ }
+ //add or Replace the structure
+ l=strlen (ST->seq_al[1]);
+ for (a=0; a< l; a++)if (ST->seq_al[1][a]==STOCKHOLM_CHAR)ST->seq_al[1][a]='.';
+ if ((i=name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100))!=-1)
+ {
+ sprintf (A->seq_al[i], "%s", ST->seq_al[1]);
+ }
+ else
+ {
+ A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
+ sprintf (A->name[A->nseq], "#=GC SS_cons");
+ sprintf (A->seq_al[A->nseq], "%s", ST->seq_al[1]);
+ A->nseq++;
+ }
+ return A;
+}
+Alignment * alifold2analyze (Alignment *A, Alignment *ST, char *mode)
+{
+ int s;
+ int **list;
+ int usegap;
+
+ s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100);
+
+ if (s==-1)
+ {
+ A=add_alifold2aln (A,ST);
+ s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100);
+ }
+
+ list=vienna2list (A->seq_al[s]);
+ list=alifold_list2cov_list (A, list);
+
+ usegap=0; //do not use gaped positions by default
+ if (mode && strstr (mode, "usegap"))usegap=1;//count positions with gaps
+
+ if (!mode)
+ {
+ A=alifold2cov_stat (A, list,usegap);
+ }
+ else
+ {
+ if ( strstr (mode, "stat")) A=alifold2cov_stat (A, list, usegap);
+ if ( strstr (mode, "list")) A=alifold2cov_list (A, list, usegap);
+ if ( strstr (mode, "aln")) A=alifold2cov_aln (A, list, usegap);
+ if ( strstr (mode, "color") )
+ {
+ Alignment *C;
+ C=copy_aln (A, NULL);
+ C=alifold2cov_cache (C, list, usegap);
+ A=alifold2cov_aln (A, list, usegap);
+ if ( strstr ( mode, "ps"))
+ output_color_ps (A, C, "stdout");
+ else
+ output_color_html (A, C, "stdout");
+ myexit (EXIT_SUCCESS);
+ }
+ }
+ return A;
+}
+
+
+int ** alifold_list2cov_list (Alignment *A, int **list)
+{
+ int a,b,c,d,p1,p2,s;
+ int r1, rr1, r2, rr2;
+ int neutral,watson, comp,tot, occupancy;
+ int **compmat;
+ int max, p,k;
+ int minseq=3;
+
+ int ncomp=0, nwatson=0, nneutral=0, ncomp_wc=0;
+ int cons_l, fold_l;
+ int nseq;
+
+
+ for (nseq=0,a=0; a< A->nseq; a++)if ( A->name[a][0]!='#')nseq++;
+ max=((nseq*(nseq-1))/2);
+ a=0;
+ while (list[a])
+ {
+ p1=list[a][0];
+ p2=list[a][1];
+ watson=0;
+ comp=0;
+ neutral=0;
+ tot=0;
+ occupancy=0;
+ for (c=0; c<A->nseq-1; c++)
+ {
+ if (A->name[c][0]=='#')continue;
+ r1=tolower(A->seq_al[c][p1]);
+ r2=tolower(A->seq_al[c][p2]);
+ if (is_gap(r1) || is_gap(r2))continue;
+ for (d=c+1; d<A->nseq; d++)
+ {
+ if (A->name[d][0]=='#')continue;
+ rr1=tolower(A->seq_al[d][p1]);
+ rr2=tolower(A->seq_al[d][p2]);
+ if (is_gap(rr1) || is_gap(rr2))continue;
+ if (is_watson (r1, r2))watson++;
+ if (is_watson (rr1, rr2))watson++;
+ if (is_neutral (r1, r2))neutral++;
+ if (is_neutral (rr1, rr2))neutral++;
+ if (r1!=rr1 && r2!=rr2)comp++;
+ occupancy++;
+ }
+ }
+ if (occupancy==0)
+ {
+ a++;
+ continue;
+ }
+ watson=(watson*100)/(occupancy*2);
+ comp=(comp*100)/occupancy;
+ neutral=(neutral*100)/(occupancy*2);
+ occupancy=(occupancy*100)/max;
+ list[a][3]=neutral;
+ list[a][4]=watson;
+ list[a][5]=comp;
+ list[a][6]=occupancy;
+
+ if (list[a][3]<100)list[a][7]='I';//incompatible pair
+ else
+ {
+ list[a][7]='N';//Neutral pair
+ if (list[a][4]==100)
+ {
+ list[a][7]='W';//Watson and Crick
+ if ( list[a][5]>0)list[a][7]='C'; //Watson and crick compensated
+ }
+ else if ( list[a][5]>0)
+ {
+ list[a][7]='c';//compensated
+ }
+ }
+ a++;
+ }
+
+ return list;
+}
+Alignment *alifold2cov_aln (Alignment *inA,int **list, int ug)
+{
+ int a=0;
+ a=0;
+ Alignment *A;
+
+ A=copy_aln (inA, NULL);
+ A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
+ sprintf (A->name[A->nseq], "#=GC SS_analyze");
+ sprintf (A->seq_al[A->nseq], "%s", A->seq_al[A->nseq-1]);
+ A->nseq++;
+ while (list[a])
+ {
+ char s;
+ if (list[a][6]<100 && !ug);
+ else
+ {
+ s=list[a][7];
+ A->seq_al[A->nseq-1][list[a][0]]=s;
+ A->seq_al[A->nseq-1][list[a][1]]=s;
+ }
+ a++;
+ }
+ return A;
+}
+Alignment *alifold2cov_stat (Alignment *A,int **list, int ug)
+{
+ int fold=0,watson=0, comp=0, compwc=0, incomp=0, neutral=0;
+ int a;
+
+ a=0;
+ while (list[a])
+ {
+ int s;
+ fold++;
+ if (list[a][6]<100 && !ug);
+ else
+ {
+ s=list[a][7];
+ watson +=(s=='W')?1:0;
+ compwc +=(s=='C')?1:0;
+ comp +=(s=='c')?1:0;
+ neutral+=(s=='N')?1:0;
+ incomp +=(s=='I')?1:0;
+ }
+ a++;
+ }
+ fprintf ( stdout, "@@ TOT Nseq:%d tot_len: %d fold: %d neutral: %d watson: %d CorWC: %d cor: %d Incompatible: %d\n",A->nseq-1, A->len_aln,fold, neutral,watson, compwc,comp,incomp);
+ return A;
+}
+Alignment *alifold2cov_cache (Alignment *inA, int **list, int ug)
+{
+ int a,b, c;
+ Alignment *A;
+
+ A=copy_aln (inA, NULL);
+ a=0;
+ while (list[a])
+ {
+ int v, s;
+ if (list[a][6]<100 && !ug);
+ else
+ {
+ s=list[a][7];
+ if (s=='C')v=9; //red
+ else if ( s=='c')v=7; //orange
+ else if ( s=='W')v=5; //Yellow
+ else if ( s=='N')v=2; //green
+ else if ( s=='I')v=0; //blue;
+ for (b=0;b<A->nseq; b++)
+ {
+ if (A->name[b][0]=='#');
+ else
+ {
+ for (c=0; c<2; c++)
+ {
+ A->seq_al[b][list[a][c]]='0'+v;
+ }
+ }
+ }
+ }
+ a++;
+ }
+ return A;
+}
+
+Alignment *alifold2cov_list (Alignment *A,int **list, int ug)
+{
+ int a,b, s;
+
+ a=0;
+ while (list[a])
+ {
+ s=list[a][7];
+ if (list[a][6]<100 && !ug);
+ else if (s=='C')
+ {
+ fprintf ( stdout, "@@ WC Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
+ for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
+ fprintf (stdout,"\n");
+ }
+ else if (s=='c')
+ {
+ fprintf ( stdout, "@@ Neural Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
+ for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
+ fprintf (stdout,"\n");
+ }
+ else if (s=='W')
+ {
+ fprintf ( stdout, "@@ WC pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
+ for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
+ fprintf (stdout,"\n");
+ }
+ else if (s=='N')
+ {
+ fprintf ( stdout, "@@ Neutral pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
+ for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
+ fprintf (stdout,"\n");
+ }
+ else if (s=='I')
+ {
+ fprintf ( stdout, "@@ incompatible pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
+ for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
+ fprintf (stdout,"\n");
+ }
+ a++;
+ }
+
+ return A;
+}
+
+
+Alignment *aln2sample (Alignment *A, int n)
+{
+ Alignment *B;
+ int a, b, p;
+ int **pos;
+
+ B=copy_aln (A, NULL);
+
+ vsrand(0);
+
+ pos=declare_int (A->len_aln, 2);
+ for (a=0; a<A->len_aln; a++){pos[a][0]=a;pos[a][1]=rand()%(1000*A->len_aln);}
+
+ sort_int (pos, 2, 1, 0, A->len_aln-1);
+
+ n=(n==0)?A->len_aln:(MIN (n, (A->len_aln)));
+ for (a=0; a<n; a++)
+ for (b=0; b<A->nseq; b++)
+ A->seq_al[b][a]=B->seq_al[b][pos[a][0]];
+ for (b=0; b<A->nseq; b++)
+ A->seq_al[b][n]='\0';
+ A->len_aln=n;
+
+ free_aln (B);
+ free_int (pos, -1);
+ return A;
+}
+Alignment *aln2bootstrap (Alignment *A, int n)
+{
+ Alignment *B;
+ int a, b, p;
+
+ if (n==0)n=A->len_aln;
+ else A=realloc_aln (A, n+1);
+ vsrand(0);
+ B=copy_aln (A, NULL);
+ for (a=0; a<n; a++)
+ {
+ p=rand ()%A->len_aln;
+ for (b=0; b<A->nseq; b++)
+ A->seq_al[b][a]=B->seq_al[b][p];
+ }
+ for ( b=0; b<A->nseq; b++)A->seq_al[b][n]='\0';
+ A->len_aln=n;
+
+ free_aln (B);
+ return A;
+
+}
+
+
+Alignment * aln2random_aln (Alignment *A, char *smode)
+
+{
+ int a, b, n, **res;
+ int max;
+
+
+
+ if ( smode==NULL)
+ {
+ smode=vcalloc (4, sizeof (char));
+ sprintf ( smode, "SCR");//Sequences, Column Residues
+ }
+ else if ( strm (smode, "NO"))return A;
+
+ vsrand(0);
+ max=A->nseq*1000;
+
+ if ( strstr ( smode, "S"))
+ {
+ A=aln2scramble_seq (A);
+ }
+ if ( strstr ( smode, "C"))
+ {
+
+ res=declare_int (A->nseq, 2);
+ for (a=0; a< A->len_aln; a++)
+ {
+ for (n=0,b=0;b<A->nseq; b++)
+ {
+ if ( !is_gap(A->seq_al[b][a]))
+ {
+ res[n][0]=A->seq_al[b][a];
+ res[n][1]=rand()%max;
+ n++;
+ }
+ sort_int (res, 2, 1, 0, n-1);
+ }
+ for (n=0,b=0;b<A->nseq; b++)
+ {
+ if ( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]=res[n++][0];
+ }
+ }
+ free_int (res, -a);
+ }
+
+
+ //Redistributes the residues randomly without changing the gap pattern
+ if ( strstr ( smode, "R"))
+ {
+ max=A->len_aln*A->nseq;
+ res=declare_int (max, 2);
+
+ for (n=0,a=0; a< A->len_aln; a++)
+ {
+ for (b=0;b<A->nseq; b++)
+ {
+ if ( !is_gap(A->seq_al[b][a]))
+ {
+ res[n][0]=A->seq_al[b][a];
+ res[n][1]=rand()%max;
+ n++;
+ }
+
+ }
+ }
+ sort_int (res, 2, 1, 0, n-1);
+ for (n=0,a=0; a< A->len_aln; a++)
+ {
+ for (b=0;b<A->nseq; b++)
+ {
+ if ( !is_gap(A->seq_al[b][a]))
+ {
+ A->seq_al[b][a]=res[n++][0];
+ }
+
+ }
+ }
+
+ free_int (res, -1);
+ }
+
+ return A;
+}
+Alignment *score_aln2score_ascii_aln (Alignment *A, Alignment *C)
+{
+ //Convert the output of T-Coffee evaluate into a printable score_ascii alignment*/
+ //A and C must be sorted
+ //sets to 0 lone residues
+ int a, b;
+
+ for (a=0; a<A->nseq; a++)
+ for (b=0; b<A->len_aln; b++)
+ {
+
+ int rC=C->seq_al[a][b];
+ int rA=A->seq_al[a][b];
+ if ( !strm (A->name[a], C->name[a])){HERE ("Unsorted aln in score_aln2score_ascii"); myexit (EXIT_FAILURE);}
+
+ if ( rA=='x' || rA=='X')C->seq_al[a][b]='9';
+ else if ( rC >='0' && rC<='9');
+ else if ( rC<10)C->seq_al[a][b]='0'+rC;
+ else if ( rC==NO_COLOR_RESIDUE && !is_gap(rA)) C->seq_al[a][b]='0';
+ else if ( rC==NO_COLOR_RESIDUE && is_gap(rA))C->seq_al[a][b]='-';
+ }
+ return C;
+}
+Alignment*aln2gap_cache (Alignment *A, int val)
+{
+ Alignment *B;
+ int a, b, c, nr;
+
+ B=copy_aln (A, NULL);
+ for (b=0; b<A->len_aln; b++)
+ {
+ for (nr=0,a=0; a<A->nseq; a++)nr+=!is_gap (A->seq_al[a][b]);
+ for (a=0; a<A->nseq; a++)if (!is_gap(A->seq_al[a][b]))B->seq_al[a][b]=(nr==1)?'0'+val:'1';
+ }
+ return B;
+}
+
+Alignment* aln2case_aln (Alignment *B, char *upper, char *lower)
+{
+ int a, b, c, up, lo;
+ Alignment *A;
+
+ A=copy_aln (B, NULL);
+
+ up=(upper)?upper[0]:'u';
+ lo=(lower)?lower[0]:'l';
+
+ for (a=0; a<A->nseq; a++)
+ for (b=0; b<A->len_aln; b++)
+ {
+ c=A->seq_al[a][b];
+
+ if ( is_gap(c));
+ else A->seq_al[a][b]=(isupper (c))?up:lo;
+ }
+ return A;
+}
+Alignment *aln2scale (Alignment *A, char *coffset)
+{
+ int a, b, t, v, n;
+ char *s1, *s2;
+ char s[1000];
+ int offset;
+
+ if (coffset)offset=atoi(coffset);
+ else offset=0;
+
+ sprintf (s, "%d", A->len_aln+offset);
+ n=strlen (s);
+
+ A=realloc_aln2 (A, A->nseq+n, A->len_aln+1);
+ s1=vcalloc ( n+1, sizeof (char));
+ s2=vcalloc ( n+1, sizeof (char));
+
+ for (a=0; a<n; a++)
+ {
+ if (a==0)s2[a]='1';
+ else strcat (s2, "0");
+ sprintf (A->name[A->nseq+a], "%s", s2);
+ }
+
+ for (a=0; a<A->len_aln; a++)
+ {
+ sprintf (s1, "%d", a+1+offset);
+ s2=invert_string (s1);
+ t=strlen (s2);
+
+ for (b=0; b<=n; b++)
+ {
+ if (b>=t) v='0';
+ else v=s2[b];
+
+ A->seq_al[A->nseq+b][a]=v;
+ }
+ }
+
+ A->nseq+=n;
+ return A;
+}
+
+
+
+
+int * pos2list (int * pos, int len, int *nl)
+{
+ int *list;
+ int a;
+ nl[0]=0;
+ list=vcalloc (len, sizeof (int));
+ for (a=0; a<len; a++)if (pos[a])list[nl[0]++]=a;
+ return list;
+}
+int *list2pos (int *list, int nl, int len)
+{
+ int *pos, a;
+ pos=vcalloc (len, sizeof (int));
+ for (a=0; a<nl; a++)pos[list[a]]=1;
+ return pos;
+}
+
+int **aln2resindex ( Alignment *A, Alignment *B, FILE *fp)
+{
+ int *list, **pos;
+ int a, b, n, s;
+
+
+ list=vcalloc (A->nseq+((B)?B->nseq:0), sizeof (int));
+ pos=aln2pos_simple_2 (A);
+ if (B)
+ {
+ n=B->nseq;
+ for ( a=0; a<B->nseq; a++)
+ {
+ list[a]=name_is_in_list(B->name[a], A->name, A->nseq, 100);
+ }
+ }
+ else
+ {
+ for ( a=0; a<A->nseq; a++)
+ list[a]=a;
+ n=A->nseq;
+ }
+
+
+ fprintf ( fp, "#");
+ for ( b=0; b<n; b++)
+ {
+ s=list[b];
+ if ( s!=-1)fprintf (fp, " %s",A->name[s]);
+ }
+ fprintf (fp, "\n");
+
+ for ( a=0; a<A->len_aln; a++)
+ {
+ for ( b=0; b<n; b++)
+ {
+ s=list[b];
+ if ( s==-1);
+ else if (pos[s][a]<0)
+ fprintf (fp, "%4d", -1);
+ else
+ fprintf (fp, "%4d", pos[s][a]);
+ }
+ fprintf (fp, "\n");
+ }
+ return pos;
+}
+
+int **index_seq_res ( Sequence *S1, Sequence *S2, int **name_index)
+{
+ /*Index the residues of S1 according to S2
+ index[seq1 of S1][z]->x, where x is the position of residue z of seq1/S1 in S2->seq[index[Seq1/S1]]
+ */
+ int a;
+ int **index;
+ char *seq1=NULL, *seq2=NULL;
+ Alignment *Profile;
+
+ index=vcalloc ( S1->nseq, sizeof (int*));
+
+ for (a=0; a< S1->nseq; a++)
+ {
+ int len1, len2, b, c;
+
+ seq1=S1->seq[a];
+
+ if (name_index[a][0]==-1)
+ seq2=NULL;
+ else if (name_index[a][1]==-1)
+ {
+ seq2=S2->seq[name_index[a][0]];
+ }
+ else if ((Profile=seq2R_template_profile (S2, name_index[a][0])) !=NULL)
+ {
+ seq2=Profile->seq_al[name_index[a][1]];
+ }
+
+ len1=(seq1)?strlen (seq1):0;
+ len2=(seq2)?strlen (seq2):0;
+ index[a]=vcalloc (len2, sizeof(int));
+
+
+ for (c=0,b=0; b<len2; b++)if( !is_gap(seq2[b]))index[a][c++]=b;
+ //index[a]=get_res_index ( seq1, seq2);
+ }
+ return index;
+}
+
+int **index_seq_name ( Sequence *S1, Sequence *S2)
+{
+ /*Index the names of S1 according to S2
+ index[seq1 of S1][0]->x if seq1 is the xth sequence of S2
+ ->-1 if seq1 is nowhere to be found
+ index[seq1 of S1][1]->z if seq1 is the zth sequence within the xth profile of S2
+ */
+ int **index;
+ int a, b, x, z;
+ Alignment *Profile;
+ index=declare_int (S1->nseq, 2);
+
+
+ for ( a=0; a<S1->nseq; a++)
+ {
+ index[a][0]=index[a][1]=-1;
+ x=name_is_in_list (S1->name[a],S2->name,S2->nseq,100);
+ if ( x!=-1){index[a][0]=x;index[a][1]=-1;}
+ for ( b=0; b<S2->nseq; b++)
+ {
+ if ((Profile=seq2R_template_profile (S2,b)))
+ {
+ z=name_is_in_list (S1->name[a],Profile->name,Profile->nseq,100);
+ if ( z!=-1){index[a][0]=b;index[a][1]=z;b=S2->nseq;}
+ }
+ }
+ }
+ return index;
+}
+
+
+
+
+int *get_name_index (char **l1, int n1, char **l2, int n2)
+{
+ int *r;
+ int a;
+ /*return Array[Index_L1]=Index_L2 */
+ r=vcalloc ( n1, sizeof (int));
+ for ( a=0; a< n1; a++)
+ r[a]=name_is_in_list (l1[a],l2,n2,100);
+ return r;
+}
+
+int* get_res_index (char *seq0, char *seq1)
+{
+ int *coor, a;
+
+ if ( !seq0 || !seq1) return NULL;
+
+
+ coor=vcalloc ( strlen (seq0)+1, sizeof (int));
+ if (!strm (seq0, seq1))
+ {
+ int r0, r1 , isr0, isr1;
+ int l0=0, l1=0;
+ Alignment *A;
+ A=align_two_sequences (seq0,seq1,"pam250mt",-5,-1, "myers_miller_pair_wise");
+
+ for ( a=0; a< A->len_aln; a++)
+ {
+ r0=A->seq_al[0][a];r1=A->seq_al[1][a];
+ isr0=!is_gap(r0);
+ isr1=!is_gap(r1);
+ l0+= isr0;
+ l1+= isr1;
+ if (isr0 && isr1)coor[l0-1]=l1-1;
+ else if (isr0) coor[l0-1]=-1;
+ }
+ free_aln (A);
+ }
+ else
+ {
+ int l0;
+
+ l0=strlen (seq0);
+ for ( a=0;a< l0; a++)
+ coor[a]=a;
+ }
+
+ return coor;
+}
+
+int change_residue_coordinate ( char *in_seq1, char *in_seq2, int v)
+{
+ /*Expresses the coordinate of a residue in seq1, in the coordinate system of seq2*/
+
+
+ static char *seq1, *seq2;
+ static int *coor;
+
+
+ if ( seq1 !=in_seq1 || seq2 !=in_seq2)
+ {
+ int r0, r1 , isr0, isr1;
+ int l0=0, l1=0;
+ Alignment *A;
+ int a;
+
+ vfree (coor);
+ seq1=in_seq1, seq2=in_seq2;
+ A=align_two_sequences (seq1,seq2,"pam250mt", -14, -2, "myers_miller_pair_wise");
+
+ coor=vcalloc ( A->len_aln, sizeof (int));
+ for ( a=0; a< A->len_aln; a++)
+ {
+ r0=A->seq_al[0][a];r1=A->seq_al[1][a];
+
+ isr0=!is_gap(r0);
+ isr1=!is_gap(r1);
+ l0+= isr0;
+ l1+= isr1;
+
+ if (isr0 && isr1)coor[l0-1]=l1-1;
+ else if (isr0) coor[l0-1]=-1;
+ }
+ free_aln (A);
+ }
+ return coor[v];
+}
+
+
+int ** minimise_repeat_coor (int **coor, int nseq, Sequence *S)
+ {
+ int **new_coor;
+ int a, min;
+ new_coor=declare_int ( nseq, 3);
+ min=return_min_int (coor, nseq, 2);
+ for ( a=0; a< nseq; a++)
+ {
+ new_coor[a][0]=coor[a][0];
+ new_coor[a][1]=coor[a][1];
+ new_coor[a][2]=min;
+ }
+ return new_coor;
+ }
+int ** get_nol_seq ( Constraint_list *CL, int **coor, int nseq, Sequence *S)
+ {
+ int a, s, p, l, nl;
+ int **buf;
+ int **new_coor;
+
+ new_coor=declare_int ( nseq+1, 3);
+
+
+ buf=get_undefined_list ( CL);
+
+
+
+ for ( a=0; a< nseq; a++)buf[coor[a][0]][coor[a][1]]=1;
+
+
+ for ( a=0; a< nseq; a++)
+ {
+ s=coor[a][0];
+ p=coor[a][1]+1;
+ l=strlen(S->seq[s]);
+ nl=0;
+ while ( p<=l && !buf[s][p++])nl++;
+ new_coor[a][0]=s;
+ new_coor[a][1]=coor[a][1];
+ new_coor[a][2]=nl;
+ }
+ free_int ( buf, -1);
+ return new_coor;
+ }
+
+
+
+int compare_pos_column( int **pos1,int p1, int **pos2,int p2, int nseq)
+ {
+ int a,v1, v2;
+ int identical=0;
+
+ for ( a=0; a< nseq; a++)
+ {
+
+ v1=pos1[a][p1];
+ v2=pos2[a][p2];
+
+ if (v1>0 || v2>0)
+ {
+ if ( v1!=v2)return 0;
+ else identical=1;
+ }
+ }
+
+ return identical;
+ }
+
+char *seq2alphabet (Sequence *S)
+{
+ return array2alphabet (S->seq, S->nseq, "");
+}
+
+char *aln2alphabet (Alignment *A)
+{
+ return array2alphabet (A->seq_al, A->nseq, "");
+}
+
+char *array2alphabet (char **array, int n, char *forbiden)
+{
+ int a, b, l;
+ int *hasch;
+ char *alphabet;
+
+ hasch=vcalloc (256, sizeof (int));
+ alphabet=vcalloc ( 257, sizeof (char));
+
+
+ for ( a=0; a<n; a++)
+ {
+ l=strlen (array[a]);
+ for ( b=0; b<l; b++)
+ hasch[tolower(array[a][b])]++;
+ }
+
+ for ( a=0, b=0; a< 256; a++)
+ {
+ if (hasch[a] && !strrchr(forbiden,a))alphabet[b++]=a;
+ }
+
+ alphabet[b]='\0';
+ vfree (hasch);
+ return alphabet;
+}
+
+
+//***************************************************************
+//
+ // TM PRED
+//***************************************************************
+
+char* alnpos2hmmtop_pred (Alignment *A,Alignment *Pred, int pos, int mode)
+{
+ static char *result;
+ static Alignment *Cache;
+ static int *score;
+ int a, tot, cons;
+
+ if (!score)
+ {
+ score=vcalloc (256, sizeof (int));
+ result=vcalloc (100, sizeof (char));
+ }
+
+ if (!Pred && !Cache)
+ {
+ Cache=aln2hmmtop_pred (A);
+ }
+ if (!Pred) Pred=Cache;
+
+
+ for (tot=0,a=0; a<A->nseq; a++)
+ {
+ char s;
+ s=Pred->seq_al[a][pos];
+ if (!is_gap(s))
+ {
+ score[tolower(s)]++;
+ tot++;
+ }
+ }
+
+ if ( score['h']>score['i'] && score['h']>score['o'])cons='h';
+
+ else if ( score['i']>score['o'])cons='i';
+ else cons='o';
+ if (tot==0) return "";
+
+
+ if (mode==VERBOSE)sprintf (result, " H: %3d I: %3d O: %3d P: %c", (score['h']*100)/tot, (score['i']*100)/tot, (score['o']*100)/tot, cons);
+ else if (mode == SHORT)sprintf ( result, "%c", cons);
+ score['h']=score['o']=score['i']=0;
+ return result;
+}
+
+
+Alignment * aln2hmmtop_pred (Alignment *A)
+ {
+ int a, b, c;
+ char *buf, *pred;
+ Alignment *PA;
+
+ PA=copy_aln (A, NULL);
+ buf=vcalloc ( A->len_aln+1, sizeof (char));
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ sprintf (buf, "%s", A->seq_al[a]);
+ pred=seq2tmstruc (buf);
+ for (c=0,b=0; b<A->len_aln; b++)
+ {
+ if (!is_gap (PA->seq_al[a][b]))PA->seq_al[a][b]=pred[c++];
+ }
+ vfree (pred);
+ }
+ vfree (buf);
+ return PA;
+ }
+
+char * seq2tmstruc ( char *seq)
+ {
+ static Sequence *S;
+ char *seqfile, *predfile, *buf;
+ FILE *fp;
+
+ seqfile=vtmpnam (NULL);
+ predfile=vtmpnam (NULL);
+
+ fp=vfopen (seqfile, "w");
+ fprintf ( fp, ">seq1\n%s", seq);
+ vfclose (fp);
+
+
+ printf_system ( "fasta_seq2hmmtop_fasta.pl -in=%s -out=%s -arch=%s/%s -psv=%s/%s", seqfile, predfile, get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv");
+ S=get_fasta_sequence (predfile, NULL);
+ buf=vcalloc ( strlen (S->seq[0])+1, sizeof (char));
+ sprintf ( buf, "%s", S->seq[0]);
+
+ free_sequence (S, S->nseq);
+
+ return buf;
+ }
+
+char * set_blast_default_values()
+{
+ set_string_variable ("blast_server", (getenv ("blast_server_4_TCOFFEE"))?getenv ("blast_server_4_TCOFFEE"):"EBI");
+ set_string_variable ("pdb_db", (getenv ("pdb_db_4_TCOFFEE"))?getenv ("pdb_db_4_TCOFFEE"):"pdb");
+ set_string_variable ("prot_db", (getenv ("prot_db_4_TCOFFEE"))?getenv ("prot_db_4_TCOFFEE"):"uniprot");
+ set_int_variable ("prot_min_sim", 0);
+ set_int_variable ("prot_max_sim", 100);
+
+ set_int_variable ("prot_min_cov", 0);
+ set_int_variable ("prot_max_cov", 100);
+
+ set_int_variable ("pdb_min_sim", 0);
+ set_int_variable ("pdb_max_sim", 100);
+ set_int_variable ("pdb_min_cov", 0);
+ set_int_variable ("pdb_max_cov", 100);
+
+ return;
+}
+
+char * seq2pdb (Sequence *S)
+{
+ set_blast_default_values();
+ S->nseq=1;
+ S=seq2template_seq (S, "PDB", NULL);
+ return seq2P_pdb_id(S,0);
+}
+
+Alignment * seq2blast ( Sequence *S)
+{
+ Alignment *A;
+ set_blast_default_values();
+
+ if (S->nseq==1)
+ {
+ S=seq2template_seq (S, "BLAST", NULL);
+ A=seq2R_template_profile(S,0);
+ sprintf ( A->name[0], "%s", S->name[0]);
+ }
+ else
+ {
+ int a;
+ for (a=0; a< S->nseq; a++)
+ {
+ Sequence *NS;
+ char name[1000];
+ NS=fill_sequence_struc(1, &(S->seq[a]), &(S->name[a]));
+ NS=seq2template_seq (NS, "BLAST", NULL);
+ A=seq2R_template_profile(NS,0);
+ sprintf ( name, "%s.prf", S->name[a]);
+
+ output_fasta_aln (name,A);
+ fprintf (stdout, "\nOUTPUT %s\n", name);
+ }
+ myexit (EXIT_SUCCESS);
+ }
+ return A;
+}
+
+
+
+
+Sequence * seq2unique_name_seq ( Sequence *S)
+{
+ int a;
+ if ((a=name_list2unique_name_list (S->nseq, S->name)))
+ {
+ add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]);
+ }
+ return S;
+}
+Alignment * aln2unique_name_aln ( Alignment *S)
+{
+ int a;
+ if ((a=name_list2unique_name_list (S->nseq, S->name)))
+ {
+ add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]);
+ }
+ return S;
+}
+
+
+int name_list2unique_name_list (int n, char **name)
+{
+ int duplicate=0;
+ int a, b;
+
+ for (a=0; a<n-1; a++)
+ for (b=a+1; b<n; b++)
+ {
+ if ( strm (name[a], name[b]))
+ {duplicate=a+1;b=a=n;}
+ }
+
+ if (duplicate)
+ {
+ char *tmp1, *tmp2;
+ Sequence *S;
+ FILE *fp;
+
+ tmp1=vtmpnam (NULL);
+ tmp2=vtmpnam (NULL);
+ fp=vfopen (tmp1, "w");
+ for (a=0; a< n; a++)fprintf ( fp, ">%s\naggggg\n", name[a]);
+ vfclose (fp);
+ printf_system ("fasta_aln2fasta_aln_unique_name.pl %s > %s", tmp1, tmp2);
+ S=get_fasta_sequence (tmp2, NULL);
+ for (a=0; a<n; a++)
+ {
+ name[a]=vrealloc (name [a], sizeof (int)*(strlen (S->name[a])+1));
+ sprintf ( name[a], "%s", S->name [a]);
+ }
+ free_sequence(S, -1);
+ }
+ return duplicate;
+}
+char**gene2exons (char **seq, int nseq)
+{
+
+ int a, b, c,r;
+ for (a=0; a<nseq; a++)
+ {
+ int in_exon=0, flag=0,l;
+ l=strlen (seq[a]);
+ for ( b=0; b<l; b++)
+ {
+ r=seq[a][b];
+ if (isupper (r))
+ {
+ in_exon=1;
+ seq[a][b]=(flag)?r:tolower(r);
+ }
+ else if (in_exon)
+ {
+ in_exon=0;
+ flag=1-flag;
+ seq[a][b]='-';
+ }
+ else seq[a][b]='-';
+ }
+ }
+ return seq;
+}
+Sequence* seq2clean_seq (Sequence *S, char *alp)
+{
+ int a, b, c, d, l;
+
+ for (a=0; a< S->nseq; a++)
+ {
+ l=strlen (S->seq[a]);
+ for (d=0,b=0; b<l; b++)
+ {
+ c=S->seq[a][b];
+ if ( alp==NULL && !strchr (AA_ALPHABET, c) && !strchr (DNA_ALPHABET, c));
+ else if (alp && strchr (alp, c));
+ else S->seq[a][d++]=c;
+ }
+ S->seq[a][d]='\0';
+ S->len[a]=strlen (S->seq[a]);
+ }
+ return S;
+}
+int ** seq2aln_pos (Alignment *A, int *ns, int **l_s)
+ {
+ int **code;
+ int a, b,c, d,l, p , g;
+
+
+ l=MAX(strlen (A->seq_al[l_s[0][0]]), strlen (A->seq_al[l_s[1][0]]));
+ code=declare_int ((A->S)->nseq,l+1);
+
+ for (c=0; c<2; c++)
+ {
+ l=strlen (A->seq_al[l_s[c][0]]);
+ for (d=0; d<ns[c]; d++)
+ {
+ a=A->order[l_s[c][d]][0];
+ for (p=0, b=0; b<l; b++)
+ {
+ g=is_gap (A->seq_al[l_s[c][d]][b]);
+ if (!g){p++; code[a][p]=b+1;}
+ }
+ }
+ }
+ return code;
+ }
+
+Alignment *local_maln2global_maln (char *seq, Alignment *A)
+ {
+ /*inputs a BLAST alignmnent where the master sequence may be partila
+ outputs the same alignment, while amkeing sure the profile is perfectly in sink with its master sequence
+ */
+
+ int a, b, c;
+ int start, end, rend;
+ char qname[100], *p;
+ Alignment *B=NULL;
+
+ sprintf ( qname, "%s", A->name[0]);
+ p=strtok (qname, "_");
+ if ( !strm (p, "QUERY"))
+ {
+ fprintf ( stderr, "\nUnappropriate format for the alignment [%s:FATAL]", PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+ start=atoi(strtok (NULL, "_"));
+ end=atoi(strtok (NULL, "_"));
+ rend=strlen (seq);
+
+ B=copy_aln (A,NULL);
+ if ( start>1 || end<rend )A=realloc_aln (A,rend+1);
+
+ for (a=0; a<start-1; a++)
+ {
+ A->seq_al[0][a]=seq[a];
+ for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-';
+ }
+
+ for (c=0,a=start-1; a< end; a++, c++)
+ {
+ A->seq_al[0][a]=seq[a];
+ for ( b=1; b< A->nseq; b++)
+ {
+ A->seq_al[b][a]=B->seq_al[b][c];
+ }
+ }
+ for ( a=end; a<rend; a++)
+ {
+ A->seq_al[0][a]=seq[a];
+ for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-';
+ }
+ for ( a=0; a< A->nseq; a++) A->seq_al[a][rend]='\0';
+ free_aln (B);
+
+ A->len_aln=rend;
+ return A;
+ }
+
+int ** aln2inv_pos ( Alignment *A)
+{
+ int **pos,a;
+ pos=vcalloc (A->nseq, sizeof (char*));
+ for (a=0; a< A->nseq; a++)pos[a]=seq2inv_pos (A->seq_al[a]);
+ return pos;
+}
+int * seq2inv_pos ( char *seq)
+{
+ /*returns a list where each value gives the index of the corresponding residue in seq*/
+ /*Numbering: 1 to L : Analogy to the aln2pos*/
+
+ int a,l1, l2;
+ int *pos;
+
+ l1=strlen ( seq);
+ for ( l2=a=0; a< l1; a++)l2+=1-is_gap(seq[a]);
+ pos=vcalloc (l2+1, sizeof (int));
+ for ( l2=a=0; a< l1; a++)if (!is_gap(seq[a]))pos[++l2]=a+1;
+ return pos;
+}
+
+
+int ** aln2pos_simple_2 (Alignment *A)
+ {
+ int **pos1;
+ int **pos2;
+ pos1=aln2pos_simple (A, A->nseq);
+ pos2=duplicate_int (pos1, A->nseq,read_size_int (pos1[0],sizeof (int)));
+ pos1=aln2pos_simple (NULL, 0);
+ return pos2;
+ }
+int ** aln2pos_simple (Alignment *A, int n_nseq, ...)
+ {
+ /*
+ function documentation: start
+ int ** aln2pos_simple (Alignment *A, int n_nseq, ...)
+
+####with two parameter only: Alignment *A, int n_nseq
+
+ this function turns A into pos, a matrix where each residue is replace by its index according to the complete sequence.
+ the indices in pos are computed using A->order[x][1] that contains the indice of the first residue of seq x of A
+
+ n_nseq MUST not be null
+
+####with more than two param:
+ int ** aln2pos_simple (Alignment *A, int n_nseq, int *ns, int **ls)
+ n_nseq must be set to 0 for the param 3 and four to be read
+
+ ns[x]=number seq in group
+ ls[x]=list of the sequences in group x ( size=ns[x])
+
+ The computation of the indices is only carried out on the scpecified residues
+
+####IMPORTANT
+ in pos, the numbering of the residues goes from 1 to L:
+ pos[0][0]=3, means that the first position of the first sequence
+ in the alignmnet contains residue #3 from sequence A->order[0][0];
+
+ function documentation: end
+ */
+
+ int a, b,c, p, g,l;
+ int **T;
+
+ int max_nseq;
+ int n_len=0;
+
+ int *list=NULL;
+ int *ns=NULL;
+ int **ls=NULL;
+
+
+
+ va_list ap;
+
+
+ if ( A==NULL)
+ {
+ return NULL;
+ }
+ else
+ {
+ if ( n_nseq>0)
+ {
+ list=vcalloc(n_nseq, sizeof (int));
+ for ( a=0; a< n_nseq; a++)list[a]=a;
+ }
+ else
+ {
+ va_start (ap, n_nseq);
+ ns=va_arg(ap, int * );
+ ls=va_arg(ap, int **);
+ va_end(ap);
+ list=vcalloc ( ns[0]+ns[1], sizeof (int));
+ n_nseq=0;
+ for ( a=0; a< ns[0]; a++)list[n_nseq++]=ls[0][a];
+ for ( a=0; a< ns[1]; a++)list[n_nseq++]=ls[1][a];
+
+ }
+ max_nseq=MAX(read_size_int(A->order,sizeof (int*)),return_max_int (A->order, read_size_int(A->order,sizeof (int*)),0))+1;
+ n_len=get_longest_string ( A->seq_al,A->max_n_seq, NULL, NULL)+1;
+
+
+ T=declare_int (max_nseq, n_len);
+ for ( c=0; c< n_nseq; c++)
+ {
+ a=list[c];
+ l=strlen ( A->seq_al[a]);
+
+ for ( p=A->order[a][1],b=0; b<l; b++)
+ {
+ g=1-is_gap(A->seq_al[a][b]);
+ p+=g;
+ T[a][b]=(g==1)?p:-(1+p);
+ if ( A->seq_al[a][b]==UNDEFINED_RESIDUE)T[a][b]=0;
+ if ( A->seq_cache && T[a][b]>0)T[a][b]=A->seq_cache[A->order[a][0]][T[a][b]];
+ }
+ }
+ vfree (list);
+ }
+
+ return T;
+ }
+Alignment ** split_seq_in_aln_list ( Alignment **aln, Sequence *S, int n_seq, char **seq_list)
+ {
+ int a, b, c;
+ char * long_seq=NULL;
+ int len,l;
+ int **translation;
+ int **table;
+
+
+
+
+ if ( aln==NULL)return NULL;
+ translation=declare_int ( S->nseq,2);
+
+ for (len=0,a=0; a< S->nseq; a++)
+ {
+ if((b=name_is_in_list (S->name[a],seq_list, n_seq, 100))!=-1)
+ {
+ l=strlen(S->seq[a])+1;
+ long_seq=vrealloc(long_seq,(len+l+1)*sizeof(char));
+ long_seq=strcat(long_seq, S->seq[a]);
+ long_seq=strcat(long_seq, "*");
+
+ translation[a][0]=b;
+ translation[a][1]=len;
+ len+=l;
+ }
+ else translation[a][0]=-1;
+ }
+
+ long_seq[len-1]='\0';
+ len--;
+
+ table=declare_int ( len+1, 2);
+
+ for ( b=0,a=0; a< S->nseq; a++)
+ {
+ if ( translation[a][0]!=-1)
+ {
+ c=1;
+ while (long_seq[b]!='\0' && long_seq[b]!='*')
+ {
+ table[b+1][1]=c++;
+ table[b+1][0]=translation[a][0];
+ b++;
+ }
+ table[b][1]=c++;
+ table[b][0]=translation[a][0];
+ b++;
+ }
+ }
+
+ for ( a=0; a< (aln[-1])->nseq; a++)
+ {
+ for ( b=0; b< (aln[a])->nseq; b++)
+ {
+
+ (aln[a])->order[b][0]=table[(aln[a])->order[b][1]][0];
+ (aln[a])->order[b][1]=table[(aln[a])->order[b][1]][1];
+ sprintf ( (aln[a])->name[b],"%s_%d_%d", S->name[(aln[a])->order[b][0]],a+1,b+1);
+ }
+ }
+ free_int (translation, -1);
+ free_int (table, -1);
+ return aln;
+ }
+
+
+
+Sequence * fill_sequence_struc ( int nseq, char **sequences, char **seq_name)
+ {
+ int a;
+ Sequence *S;
+ int shortest, longuest;
+
+ if (!sequences)
+ {
+ shortest=longuest=0;
+ }
+ else if ( nseq>1)
+ {
+ shortest=get_shortest_string( sequences, nseq, NULL, NULL);
+ longuest=get_longest_string (sequences, nseq, NULL, NULL);
+ }
+ else if ( nseq==1)
+ {
+ shortest=longuest=strlen (sequences[0]);
+ }
+ else
+ {
+ return NULL;
+ }
+
+
+ S=declare_sequence (shortest, longuest,nseq);
+ S->nseq=nseq;
+
+ if (sequences)S->seq=copy_char ( sequences, S->seq, nseq, -1);
+ else S->seq=declare_char (S->nseq, 1);
+
+ S->name=copy_char ( seq_name, S->name,nseq, -1);
+
+ ungap_array (S->seq,nseq);
+ for ( a=0; a< S->nseq; a++)S->len[a]=strlen(S->seq[a]);
+ return S;
+ }
+
+
+Alignment * thread_profile_files2aln (Alignment *A, char *template_file, Fname *F)
+{
+
+ Alignment *P;
+ int a;
+
+ if (!A->S)A->S=aln2seq (A);
+ if (template_file)A->S=seq2template_seq (A->S, template_file,F);
+ for ( a=0; a< A->nseq; a++)
+ {
+ P=seq2R_template_profile (A->S, a);
+ if ( P)
+ {
+ P->expand=1;
+ sprintf ( P->name[0], "%s", A->name[a]);
+ }
+ }
+
+ return expand_aln (A);
+}
+
+
+
+
+Alignment * expand_aln (Alignment *A)
+ {
+ /*This function expands the profiles within an alignment*/
+
+
+ int a, b, d, e;
+ Alignment *MAIN=NULL, *SUB=NULL;
+ int n_sub_seq=0;
+ int new_nseq=0;
+ int *list;
+ Alignment *Profile;
+
+ if ( !A)return A;
+
+
+
+ list=vcalloc (A->nseq, sizeof (int));
+ for ( a=0; a< A->nseq; a++)
+ {
+ Profile=seq2R_template_profile (A->S, A->order[a][0]);
+ if (Profile && Profile->expand)
+ {
+ new_nseq+=Profile->nseq;
+ }
+ else
+ {
+ new_nseq++;
+ list[n_sub_seq++]=a;
+ }
+ }
+
+ if ( n_sub_seq==A->nseq){vfree(list);return A;}
+ else if (n_sub_seq==0){MAIN=copy_aln (A, MAIN);MAIN->nseq=0;}
+ else
+ {
+ MAIN=extract_sub_aln (A, n_sub_seq, list);
+ }
+ vfree(list);
+
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ Profile=seq2R_template_profile (A->S, A->order[a][0]);
+ if ( Profile && Profile->expand)
+ {
+
+ SUB=copy_aln (Profile,SUB);
+
+ SUB=realloc_aln2(SUB, SUB->nseq, A->len_aln+1);
+
+ for ( e=0,b=0; b< A->len_aln; b++)
+ {
+ if ( is_gap(A->seq_al[a][b]))
+ {for (d=0; d< SUB->nseq; d++)SUB->seq_al[d][b]='-';}
+ else
+ {
+ for(d=0; d<SUB->nseq; d++)SUB->seq_al[d][b]=Profile->seq_al[d][e];
+ e++;
+ }
+
+ }
+ MAIN=stack_aln(MAIN, SUB);
+ }
+ }
+ free_aln (A);
+ free_aln (SUB);
+ return MAIN;
+ }
+Alignment * expand_number_aln (Alignment *A,Alignment *EA)
+ {
+ /*This function expands the profiles within an alignment*/
+
+
+ int a, b, d, e;
+ Alignment *MAIN=NULL, *SUB=NULL, *C=NULL;
+ int n_sub_seq=0;
+ int new_nseq=0;
+ int *list;
+ Alignment *Profile;
+
+ if ( !EA || !A)return EA;
+
+ if ( EA->nseq<A->nseq)
+ {
+ fprintf (stderr, "\n[ERROR:expand_number_aln] Using as a master an expanded aln (%d %d) [FATAL:%s]", EA->nseq, A->nseq,PROGRAM);
+ EA->A=A->A=NULL;
+ print_aln (EA);
+ print_aln (A);
+ myexit (EXIT_FAILURE);
+ }
+
+
+ list=vcalloc (EA->nseq, sizeof (int));
+ for ( a=0; a< EA->nseq; a++)
+ {
+ Profile=seq2R_template_profile (EA->S, EA->order[a][0]);
+ if (Profile && Profile->expand)new_nseq+=Profile->nseq;
+ else
+ {
+ new_nseq++;
+ list[n_sub_seq++]=a;
+ }
+ }
+
+ if ( n_sub_seq==EA->nseq){vfree(list);return EA;}
+ else if (n_sub_seq==0){MAIN=copy_aln (EA, MAIN);MAIN->nseq=0;}
+ else
+ {
+ MAIN=extract_sub_aln (EA, n_sub_seq, list);
+ }
+
+
+ list[0]=EA->nseq;
+ C=extract_sub_aln (EA,1, list);
+ vfree(list);
+
+
+
+ for ( a=0; a< EA->nseq; a++)
+ {
+ Profile=seq2R_template_profile (EA->S, EA->order[a][0]);
+ if ( Profile && Profile->expand)
+ {
+ SUB=copy_aln (Profile,SUB);
+ SUB=realloc_aln2(SUB, SUB->nseq, EA->len_aln+1);
+
+ for ( e=0,b=0; b<= EA->len_aln; b++)
+ {
+ if (is_gap(A->seq_al[a][b]))
+ {
+ for ( d=0; d<SUB->nseq; d++)
+ SUB->seq_al[d][b]=NO_COLOR_RESIDUE;
+ }
+ else
+ {
+ for ( d=0; d<SUB->nseq; d++)
+ {
+
+ if ( is_gap (Profile->seq_al[d][e]))
+ {
+ SUB->seq_al[d][b]=NO_COLOR_RESIDUE;
+ }
+ else SUB->seq_al[d][b]=EA->seq_al[a][b];
+ }
+ e++;
+ }
+ }
+ for (d=0; d< SUB->nseq; d++)SUB->score_seq[d]=EA->score_seq[a];
+
+ MAIN=stack_aln(MAIN, SUB);
+ }
+ }
+
+ MAIN=stack_aln(MAIN, C);
+ MAIN->nseq--;
+ MAIN->score=MAIN->score_aln=EA->score_aln;
+
+ free_aln (SUB);
+ free_aln (EA);
+
+ free_aln (C);
+
+ return MAIN;
+ }
+
+Alignment * probabilistic_rm_aa ( Alignment *A, int pos, int len)
+{
+ int random_len=0;
+ int a, b;
+ int left, right;
+
+ if ( len<0)
+ {
+ random_len=1;
+ len=-len;
+ }
+
+ vsrand(0);
+
+ if (pos==0)pos= (rand()%(A->len_aln-(2*len+len))) +len;
+
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ if (random_len)left =rand()%len;
+ else left=len;
+ if (random_len)right=rand()%len;
+ else right=len;
+ if ( (pos-right)<0 || (pos+left)>A->len_aln)
+ {
+ add_warning ( stderr, "\nWarning: probabilistic_rm_aa, pos out of range [%s]\n", PROGRAM);
+ }
+ else
+ for ( b=pos-right; b<pos+left; b++)A->seq_al[a][b]=(b==pos)?'~':'*';
+ }
+
+ ungap_aln (A);
+ free_sequence ( A->S, A->nseq);
+ A->S=aln2seq (A);
+ return A;
+
+}
+
+Alignment * remove_gap_column ( Alignment *A, char *mode)
+ {
+ int a, b;
+ char *p;
+ int *seq_list;
+ int nseq=0;
+ int keep_col, cl;
+
+
+ seq_list =vcalloc ( A->nseq, sizeof (int));
+ while ( (p=strtok(mode, ":")))
+ {
+ mode=NULL;
+ if (p[0]=='#')
+ {
+ seq_list[nseq++]=atoi(p+1)-1;
+ }
+ else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
+ {
+ seq_list[nseq++]=a;
+ }
+ }
+
+ if ( nseq==0)
+ {
+ for ( a=0; a< A->nseq; a++)seq_list[a]=a;
+ nseq=A->nseq;
+ }
+
+ for ( cl=0,a=0; a<=A->len_aln; a++)
+ {
+ for (keep_col=1, b=0; b< nseq && keep_col; b++)
+ {
+ keep_col=(is_gap(A->seq_al[seq_list[b]][a]))?0:keep_col;
+ }
+
+ if ( keep_col)
+ {
+ for ( b=0; b< A->nseq; b++)
+ {
+ A->seq_al[b][cl]=A->seq_al[b][a];
+ }
+ cl++;
+ }
+ else
+ {
+ for ( b=0; b< A->nseq; b++)
+ {
+ A->seq_al[b][cl]='-';
+ }
+ cl++;
+ }
+ }
+ A->len_aln=cl;
+ vfree (seq_list);
+
+ return A;
+ }
+
+
+Alignment * ungap_sub_aln (Alignment *A, int ns, int *ls)
+ {
+
+ int a, b, c,t;
+ int len;
+
+ len=strlen ( A->seq_al[ls[0]]);
+
+ for ( c=0,a=0; a<len; a++)
+ {
+ for ( t=0,b=0; b<ns; b++)
+ t+=is_gap(A->seq_al[ls[b]][a]);
+ if (t==ns);
+ else
+ {
+ for ( b=0; b<ns; b++)
+ A->seq_al[ls[b]][c]=A->seq_al[ls[b]][a];
+ c++;
+ }
+ }
+ for ( b=0; b<ns; b++)A->seq_al[ls[b]][c]='\0';
+ return A;
+ }
+
+Sequence * ungap_seq ( Sequence *S)
+ {
+ int a;
+
+ if ( !S)return NULL;
+ ungap(S->seq[0]);
+ S->max_len=S->min_len=strlen (S->seq[0]);
+ for ( a=0; a< S->nseq; a++)
+ {
+ ungap(S->seq[a]);
+ S->len[a]=strlen (S->seq[a]);
+ S->max_len=MAX(S->max_len,S->len[a]);
+ S->min_len=MAX(S->min_len,S->len[a]);
+ }
+ return S;
+
+ }
+Alignment* shift_column (Alignment *A, int from, int to);
+int max_shift (Alignment *A, int p);
+int column_is_lower (Alignment *A, int p);
+
+Alignment * unalign_aln_2 (Alignment *A, Alignment *C, int t)
+{
+ int a, b, pos, len;
+ Sequence *S;
+ int n, insert;
+ if (C)
+ {
+ for (a=0; a<A->nseq; a++)
+ for (b=0; b<A->len_aln; b++)
+ {
+ int res=C->seq_al[a][b];
+ A->seq_al[a][b]=toupper(A->seq_al[a][b]);
+ if ((isdigit (res) && (res-'0')<=t))
+ A->seq_al[a][b]=tolower(A->seq_al[a][b]);
+ }
+ }
+
+ n=0;
+ while ( A->seq_al[0][n])
+ {
+ insert=0;
+ for (b=0; b<A->nseq; b++)if (islower (A->seq_al[b][n]))insert=1;
+ if (insert)
+ {
+ insert_gap_col (A,n,1);
+ for (b=0; b<A->nseq; b++)
+ {
+ if ( islower (A->seq_al[b][n+1]))
+ {
+ A->seq_al[b][n]=A->seq_al[b][n+1];
+ A->seq_al[b][n+1]='-';
+ }
+ }
+ }
+ n++;
+ }
+ for (a=A->len_aln-1; a>=0; a--)
+ {
+ if (column_is_lower (A,a))
+ {
+ int s;
+ s=max_shift (A,a);
+ shift_column (A,a, a+s);
+ }
+ }
+ return A;
+}
+Alignment* shift_column (Alignment *A, int from, int to)
+{
+ char *buf;
+ int a;
+
+ buf=vcalloc (A->nseq, sizeof (char));
+ for (a=0; a<A->nseq; a++)
+ {
+ buf[a]=A->seq_al[a][from];
+ A->seq_al[a][from]='-';
+ }
+ to++;
+ insert_gap_col (A, to, 1);
+ for ( a=0; a<A->nseq; a++)A->seq_al[a][to]=buf[a];
+ vfree (buf);
+ ungap_aln (A);
+ return A;
+}
+int max_shift (Alignment *A, int p)
+{
+ int shift, max_shift, a;
+ for (max_shift=A->len_aln,a=0; a< A->nseq; a++)
+ {
+ shift=0;
+
+ if (!islower (A->seq_al[a][p]) || A->seq_al[a][p]=='-')continue;
+ while (A->seq_al[a][p+shift+1]=='-')shift++;
+ max_shift=MIN(shift,max_shift);
+ }
+ return max_shift;
+}
+int column_is_lower (Alignment *A, int p)
+{
+ int a;
+
+ for ( a=0; a<A->nseq; a++)
+ if ( !is_gap (A->seq_al[a][p]) && !islower(A->seq_al[a][p]))return 0;
+ return 1;
+}
+
+Alignment * unalign_aln (Alignment *A, Alignment *C, int t)
+{
+ int a, b, pos, len;
+ Sequence *S;
+
+ for (a=0; a<A->nseq; a++)
+ for (b=0; b<A->len_aln; b++)
+ {
+ int res=C->seq_al[a][b];
+ A->seq_al[a][b]=toupper(A->seq_al[a][b]);
+ if ((isdigit (res) && (res-'0')<=t))
+ A->seq_al[a][b]=tolower(A->seq_al[a][b]);
+ }
+
+
+ for (pos=-1, a=0; a<C->nseq; a++)
+ {
+ b=0;
+ while ( C->seq_al[a][b])
+ {
+ int res=C->seq_al[a][b];
+ if ((isdigit (res) && (res-'0')<=t))
+ {
+ if (pos==-1){pos=b;len=1;}
+ else len++;
+ }
+ else if (pos!=-1)
+ {
+
+ C=unalign_aln_pos(C,a,pos, len);
+ pos=-1;
+ }
+ b++;
+ }
+ if ( pos!=-1){C=unalign_aln_pos(C,a,pos, len);pos=-1;}
+ }
+ S=aln2seq (A);
+ thread_seq_struc2aln (C, S);
+ A=realloc_aln2 (A, A->nseq, C->len_aln+1);
+ A->len_aln=C->len_aln;
+ for (a=0; a<A->nseq; a++)sprintf ( A->seq_al[a], "%s", C->seq_al[a]);
+ ungap_aln (A);
+
+ free_sequence (S, -1);
+ return A;
+}
+Alignment * unalign_aln_pos (Alignment *A, int s, int p, int l)
+{
+ int a;
+ char *buf;
+ int unalign=0;
+
+
+ buf=vcalloc (l+1, sizeof (char));
+ for (a=0; a<l; a++)
+ {
+ buf[a]=A->seq_al[s][p+a];
+ A->seq_al[s][p+a]='-';
+ }
+
+
+ A=insert_gap_col (A,p, l);
+ for (a=0; a<l; a++)
+ {
+ A->seq_al[s][p+a]=buf[a];
+ }
+ vfree (buf);
+ return A;
+}
+Alignment * insert_gap_col (Alignment *A, int p, int l)
+{
+ int a, c;
+ char *buf;
+ char *gap;
+
+ gap=generate_null(l);
+ if ( !A || p>=A->len_aln || p<0)return A;
+
+ buf=vcalloc (A->len_aln+l+1, sizeof (char));
+ A=realloc_aln2(A,A->nseq, A->len_aln+l+1);
+ for (a=0; a<A->nseq; a++)
+ {
+ c=A->seq_al[a][p];
+ A->seq_al[a][p]='\0';
+ sprintf ( buf, "%s%s%c%s", A->seq_al[a],gap,c,A->seq_al[a]+p+1);
+ sprintf (A->seq_al[a], "%s", buf);
+ }
+ vfree (buf);
+ A->len_aln+=l;
+ return A;
+}
+Alignment * unalign_residues (Alignment *A, int si1, int si2)
+{
+ char *s1, *s2, *ns1, *ns2;
+ int l, a, b,r1, r2;
+
+ s1=A->seq_al[si1];s2=A->seq_al[si2];
+ l=strlen (s1);
+
+ ns1=vcalloc (2*l+1, sizeof (char));
+ ns2=vcalloc (2*l+1, sizeof (char));
+
+ for (b=a=0; a< l; a++)
+ {
+ r1=s1[a]; r2=s2[a];
+ if (is_gap(r1) || is_gap(r2) || isupper (r1) || isupper(r2))
+ {
+ ns1[b]=(r1=='.')?'-':r1;
+ ns2[b]=(r2=='.')?'-':r2;
+ b++;
+ }
+ else
+ {
+ ns1[b]=r1;
+ ns2[b]='-';
+ b++;
+ ns2[b]=r2;
+ ns1[b]='-';
+ b++;
+ }
+ }
+ ns1[b]='\0';
+ ns2[b]='\0';
+ A->seq_al[si1]=ns1;
+ A->seq_al[si2]=ns2;
+
+
+ A->len_aln=strlen (ns1);
+ return A;
+}
+Alignment *degap_aln (Alignment *A)
+{
+ //Reomove all the gaps
+ int a;
+ for ( a=0; a< A->nseq; a++)ungap (A->seq_al[a]);
+ return A;
+}
+
+Alignment *ungap_aln_n ( Alignment *A, int p)
+ {
+/*remove all the columns of gap-only within an alignment*/
+ int a, b, c;
+ int t;
+ int gp;
+
+ if ( A->nseq==0)return A;
+
+ for ( c=0,a=0; a< A->len_aln; a++)
+ {
+ for ( t=0,b=0; b<A->nseq; b++)
+ t+=is_gap(A->seq_al[b][a]);
+ gp=(t*100)/A->nseq;
+ if (p>0 && (gp>=p || (t==A->nseq && p==100) || (t && p==1)));//Remove columns containing more than p% gaps
+ else if (p<0 && (gp<=p || (t==0 && p==-100) ||(t && p==-1)));//remove columns containing less than p% gaps
+ else
+ {
+ for ( b=0; b<A->nseq; b++)
+ A->seq_al[b][c]=A->seq_al[b][a];
+ c++;
+ }
+ }
+ for ( b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
+ A->len_aln=c;
+ return A;
+ }
+
+Alignment *ungap_aln ( Alignment *A)
+{
+ return ungap_aln_n (A, 100);
+}
+/*
+Alignment *ungap_aln ( Alignment *A)
+ {
+ int a, b, c,t;
+
+ for ( c=0,a=0; a< A->len_aln; a++)
+ {
+ for ( t=0,b=0; b<A->nseq; b++)
+ t+=is_gap(A->seq_al[b][a]);
+ if (t==A->nseq);
+ else
+ {
+ for ( b=0; b<A->nseq; b++)
+ A->seq_al[b][c]=A->seq_al[b][a];
+ c++;
+ }
+ }
+ for ( b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
+ A->len_aln=c;
+ return A;
+
+ }
+*/
+
+
+Alignment *remove_end (Alignment *A)
+ {
+ int a, b, d;
+ int left, right;
+
+ for (a=0; a< A->len_aln; a++)
+ {
+ for ( b=0, d=0; b< A->nseq; b++)
+ if ( !is_gap(A->seq_al[b][a]))d++;
+ if ( d>1)break;
+ }
+ left=a;
+ for (a=A->len_aln-1; a>0; a--)
+ {
+ for ( b=0, d=0; b< A->nseq; b++)
+ if ( !is_gap(A->seq_al[b][a]))d++;
+ if ( d>1)break;
+ }
+ right=a;
+
+ return extract_aln(A, left, right+1);
+ }
+
+Alignment* condense_aln (Alignment *A)
+{
+ /* condense complementarz columns:
+ X- X
+ -X ....>X
+ X- X
+
+ */
+ int a, b, plen, n,m, r1, r2;
+
+ plen=0;
+ while ( A->len_aln !=plen)
+ {
+ plen=A->len_aln;
+ for ( a=0; a< A->len_aln-1; a++)
+ {
+ for ( n=m=b=0; b< A->nseq; b++)
+ {
+ r1=is_gap(A->seq_al[b][a]);
+ r2=is_gap(A->seq_al[b][a+1]);
+ n+=(r1 || r2);
+ m+=r1;
+ }
+
+ if ( n==A->nseq && m!=A->nseq)
+ {
+ for (b=0; b< A->nseq; b++)
+ {
+ if (!is_gap(A->seq_al[b][a+1]))
+ {
+ A->seq_al[b][a]=A->seq_al[b][a+1];
+ A->seq_al[b][a+1]='-';
+ }
+ }
+ a++;
+ }
+ }
+ }
+ A=ungap_aln(A);
+ return A;
+}
+
+
+
+
+void compress_aln ( Alignment *A)
+ {
+
+ /*remove all the columns of gap-only within an alignment*/
+ int a, b, c, d;
+
+
+
+ for (c=0, a=0; a< A->len_aln; a++)
+ {
+ for ( b=0, d=0; b< A->nseq; b++)
+ if ( A->seq_al[b][a]!='-'){d=1; break;}
+ if ( d==0);
+ else
+ {
+ for (b=0; b< A->nseq; b++)
+ A->seq_al[b][c]=A->seq_al[b][a];
+ c++;
+ }
+ }
+ A->len_aln=c;
+
+ for ( a=0; a< A->nseq; a++)
+ A->seq_al[a][c]='\0';
+ }
+
+Alignment *seq_coor2aln ( Sequence *S, Alignment *A, int **coor, int nseq)
+ {
+ int a;
+ char *buf;
+
+ A=realloc_alignment2(A, nseq, return_maxlen ( S->seq, S->nseq)+1);
+ for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]);
+ for ( a=0; a< nseq; a++)
+ {
+ sprintf (A->name[a], "Repeat_%d_%d", a, coor[a][0]);
+ buf=extract_char ( S->seq[coor[a][0]], coor[a][1]-1, coor[a][2]);
+ sprintf ( A->seq_al[a],"%s", buf);
+ vfree(buf);
+ A->order[a][0]=0;
+ A->order[a][1]=coor[a][1]-1;
+ }
+ A->nseq=nseq;
+ return A;
+ }
+
+Alignment *strings2aln (int nseq,...)
+ {
+ /*strings2aln(nseq, <name1>, <seq1>, <name2>, <seq2>....)*/
+ va_list ap;
+ char **list, **list2;
+ char **name, **name2;
+ Sequence *S;
+ Alignment *A;
+ int a, max;
+
+ va_start(ap, nseq);
+ list=vcalloc (nseq, sizeof (char*));
+ name=vcalloc (nseq, sizeof (char*));
+ for ( a=0; a< nseq; a++)
+ {
+ name[a]=va_arg(ap,char*);
+ list[a]=va_arg(ap,char*);
+
+ }
+ va_end(ap);
+
+ for ( max=0,a=0; a< nseq; a++)
+ {
+ max=(strlen (list[a])>max)?strlen(list[a]):max;
+ }
+ list2=declare_char (nseq, max+1);
+ name2=declare_char (nseq, MAXNAMES+1);
+
+ for ( a=0; a< nseq; a++)
+ {
+ sprintf ( list2[a], "%s", list[a]);
+ sprintf ( name2[a], "%s", name[a]);
+ }
+
+
+ S=fill_sequence_struc(nseq,list2,name2);
+
+ free_char (list2, -1);
+ free_char (name2, -1);
+ vfree (list);
+ vfree(name);
+ A=seq2aln(S,NULL, 1);
+ return A;
+ }
+Alignment *seq2aln ( Sequence *S, Alignment *A,int rm_gap)
+ {
+ int a;
+
+ A=realloc_alignment2(A, S->nseq, S->max_len+1);
+ for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]);
+ A->nseq=S->nseq;
+ A->max_len=S->max_len;
+ A->min_len=S->min_len;
+
+ for ( a=0; a< S->nseq; a++)
+ {
+ A->order[a][0]=a;
+ A->order[a][1]=0;
+
+ sprintf ( A->seq_comment[a], "%s", S->seq_comment[a]);
+ sprintf ( A->aln_comment[a], "%s", S->aln_comment[a]);
+
+ sprintf ( A->name[a], "%s", S->name[a]);
+ sprintf ( A->seq_al[a], "%s", S->seq[a]);
+
+ ungap ( A->seq_al[a]);
+ A->len[a]=strlen ( A->seq_al[a]);
+
+ if ( rm_gap==0 || rm_gap==NO_PAD)sprintf ( A->seq_al[a], "%s", S->seq[a]);
+
+ }
+ if (rm_gap!=NO_PAD)padd_aln (A);
+ A->S=S;
+ return A;
+ }
+
+Alignment *padd_aln ( Alignment *A)
+{
+ A->seq_al=padd_string (A->seq_al, A->nseq, '-');
+ A->len_aln=strlen (A->seq_al[0]);
+ return A;
+}
+
+char **padd_string ( char **string, int n,char pad)
+{
+ /*Pads a the strings so that they all have the same length*/
+
+ int max_len, a;
+ char *buf;
+
+ max_len=get_longest_string (string,n, NULL, NULL);
+ for (a=0; a<n; a++)
+ {
+ buf=generate_null (max_len-strlen (string[a]));
+ strcat ( string[a], buf);
+ vfree (buf);
+ }
+ return string;
+}
+
+Alignment * trim_aln_with_seq ( Alignment *S, Alignment *P)
+{
+ Alignment *A, *R;
+ int a, b, c;
+ static int seqindex;
+ P=aln2profile (P);
+ S=aln2profile (S);
+
+ A=align_two_aln (S,P, "blosum62mt",-8,-1, "myers_miller_pair_wise");
+ for (a=0; a<A->nseq; a++) sprintf (A->name[a], "tmpname_%d", seqindex++);
+
+ R=copy_aln (A, NULL);
+ for (c=0, a=0; a< A->len_aln; a++)
+ {
+ if ( is_gap (A->seq_al[0][a]));
+ else
+ {
+ for ( b=0; b<A->nseq; b++)
+ R->seq_al[b][c]=A->seq_al[b][a];
+ c++;
+ }
+ }
+ for ( a=0; a< A->nseq; a++)R->seq_al[a][c]='\0';
+ R->len_aln=c;
+ R->S=aln2seq (R);
+
+ free_aln (S);
+ free_aln (P);
+ free_aln (A);
+
+ return R;
+}
+
+Alignment * add_align_seq2aln ( Alignment *A, char *seq, char *seq_name)
+ {
+ if ( !A)
+ {
+ A=declare_aln (NULL);
+ A=realloc_aln2 ( A, 1, strlen (seq)+1);
+ A->nseq=0;
+ sprintf ( A->name[A->nseq], "%s", seq_name);
+ sprintf ( A->seq_al[A->nseq], "%s", seq);
+ A->nseq++;
+
+ }
+ else if ( strlen (seq)!=A->len_aln)
+ {
+ fprintf ( stderr, "\nError: Attempt to stack incompatible aln and aligned sequence[FATAL]\n");
+ myexit (EXIT_FAILURE);
+ A=NULL;
+ }
+ else
+ {
+
+ A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
+ sprintf ( A->name[A->nseq], "%s", seq_name);
+ sprintf ( A->seq_al[A->nseq], "%s", seq);
+ A->nseq++;
+ }
+ return A;
+ }
+
+
+Alignment *aln2number (Alignment *A)
+ {
+ A->seq_al=char_array2number(A->seq_al, A->nseq);
+ return A;
+ }
+Sequence *seq2number (Sequence *A)
+ {
+ A->seq=char_array2number(A->seq, A->nseq);
+ return A;
+ }
+
+Sequence * aln2seq (Alignment *A)
+{
+ return aln2seq_main(A, RM_GAP);
+}
+Sequence * aln2seq_main (Alignment *A, int mode)
+ {
+ Sequence *LS;
+ int a;
+ int maxlen;
+
+ if ( !A) return NULL;
+ else if ( A->nseq==0)return NULL;
+ for (maxlen=0,a=0; a<A->nseq; a++)maxlen=MAX(maxlen, strlen (A->seq_al[a]));
+
+
+ LS=declare_sequence ( maxlen+1, maxlen+1, A->nseq);
+ LS->nseq=A->nseq;
+ for ( a=0; a< LS->nseq; a++)
+ {
+ sprintf (LS->file[a],"%s", A->file[a]);
+
+ sprintf ( LS->seq[a], "%s", A->seq_al[a]);
+
+ if (mode==RM_GAP)ungap ( LS->seq[a]);
+
+ LS->len[a]=strlen ( LS->seq[a]);
+
+ sprintf ( LS->seq_comment[a], "%s",A->seq_comment[a]);
+ sprintf ( LS->aln_comment[a], "%s",A->aln_comment[a]);
+ sprintf ( LS->name[a], "%s", A->name[a]);
+ }
+ return LS;
+ }
+
+Sequence *keep_residues_in_seq ( Sequence *S, char *list, char replacement)
+{
+ Alignment *A=NULL;
+ int a;
+
+ A=seq2aln (S, A,1);
+ A=keep_residues_in_aln ( A, list, replacement);
+ for ( a=0; a< A->nseq; a++)
+ {
+ ungap (A->seq_al[a]);
+ sprintf ( S->seq[a], "%s", A->seq_al[a]);
+ }
+ free_aln (A);
+ return S;
+}
+
+
+Alignment *aln2short_aln ( Alignment *A, char *list, char *new, int spacer)
+{
+ int a, b, r, cl, l;
+ char *buf;
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ buf=vcalloc ( strlen (A->seq_al[a])+1, sizeof (char));
+
+ for (l=0,cl=0, b=0; b< A->len_aln; b++)
+ {
+ r=A->seq_al[a][b];
+ if ( is_gap(r));
+ else if ( is_in_set (r, list))
+ {
+ if (cl){cl=0; buf[l++]=new[0];}
+ buf[l++]=r;
+ }
+ else
+ {
+ if ( cl==spacer){buf[l++]=new[0];cl=0;}
+ cl++;
+ }
+
+ }
+
+ buf[l]='\0';
+ sprintf (A->seq_al[a], "%s", buf);
+ vfree (buf);
+ }
+ return A;
+}
+
+Alignment *keep_residues_in_aln ( Alignment *A, char *list, char replacement)
+{
+ return filter_keep_residues_in_aln (A,NULL, 0, -1, list, replacement);
+}
+Alignment *filter_keep_residues_in_aln ( Alignment *A,Alignment *ST, int use_cons, int value, char *list, char replacement)
+{
+ char **sl;
+ int n, a;
+
+ n=strlen (list);
+ sl=declare_char (n+1, 256);
+ for (a=0; a< n; a++)
+ sprintf ( sl[a], "%c%c", list[a], list[a]);
+ sprintf ( sl[a],"#%c", replacement);
+ A=filter_aln_convert (A, ST,use_cons,value, n+1, sl);
+ free_char (sl, -1);
+ return A;
+}
+
+
+Alignment *filter_convert_aln ( Alignment *A,Alignment *ST, int use_cons, int value, int n, ...)
+{
+ va_list ap;
+ char **sl;
+ int a;
+ va_start (ap, n);
+ sl=vcalloc ( n,sizeof(char*));
+ for ( a=0; a< n; a++)
+ {
+ sl[a]=va_arg(ap, char * );
+ }
+ va_end(ap);
+ A=filter_aln_convert (A,ST,use_cons,value, n,sl);
+ vfree(sl);
+ return A;
+}
+
+Alignment * filter_aln ( Alignment *A, Alignment *ST, int value)
+ {
+ return filter_aln_convert (A, ST,0,value,DELETE, NULL);
+ }
+Alignment * filter_aln_switchcase ( Alignment *A, Alignment *ST,int use_cons, int value)
+ {
+ return filter_aln_convert (A, ST,0,value,SWITCHCASE, NULL);
+ }
+Alignment * filter_aln_upper_lower ( Alignment *A, Alignment *ST,int use_cons, int value)
+ {
+ return filter_aln_convert (A, ST,use_cons,value, LOWER, NULL);
+ }
+Alignment * filter_aln_lower_upper ( Alignment *A, Alignment *ST,int use_cons, int value)
+ {
+
+ return filter_aln_convert (A, ST,use_cons,value, UPPER, NULL);
+ }
+Alignment * STseq2STaln ( Alignment *A, Alignment *ST)
+ {
+ int a, i=0;
+
+ if (ST && ST->len_aln !=A->len_aln)
+ {
+ Sequence *S_T, *S_A;
+
+ S_T=aln2seq (ST);
+ S_A=aln2seq (A);
+
+ for (a=0; a< A->nseq; a++)
+ {
+ i=name_is_in_list (A->name[a], S_T->name,S_T->nseq, 100);
+ if (i!=-1)
+ {
+ char *s1, *s2;
+ s1=(S_T)->seq[i];ungap(s1);
+ s2=(S_A)->seq[a];ungap(s2);
+
+ if ( strlen (s1)!=strlen(s2))
+ {
+ fprintf ( stderr, "%s\n%s\n", s1, s2);
+ printf_exit (EXIT_FAILURE, stderr, "ERROR: Sequence %s has different length in the alignment and in the structure Alignment [FATAL:%s]\n", A->name[a], PROGRAM);
+ }
+ }
+ }
+ ST=copy_aln (A, ST);
+ thread_seq_struc2aln (ST,S_T);
+ }
+
+ return ST;
+ }
+Alignment * merge_annotation ( Alignment *A, Alignment *ST, char *seq)
+{
+ int s, a, b;
+
+ ST=STseq2STaln (A, ST);
+ if ( seq==NULL)s=0;
+ else
+ s=name_is_in_list ( seq, A->name, A->nseq, 100);
+
+ if (s==-1)
+ {
+ add_warning ( stderr, "\nERROR: %s is not in your MSA [FATAL: %s]", PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+ for (a=0; a<A->len_aln; a++)
+ {
+ int t, r;
+
+ t=A->seq_al[s][a];
+ if (is_gap (t))continue;
+ for (b=0; b<A->nseq; b++)
+ {
+ t=A->seq_al[s][a];
+ r=ST->seq_al[b][a];
+ if ( isdigit (r))
+ {
+ if (!isdigit(t) || (isdigit (t) && t<r))
+ A->seq_al[s][a]=r;
+ }
+ }
+ }
+ return A;
+}
+
+
+
+Alignment * filter_aln_convert ( Alignment *A, Alignment *ST,int use_cons, int value, int n_symbol,char **symbol_list)
+ {
+ int a, b, c;
+ int st;
+ int cons=0;
+
+
+ ST=STseq2STaln (A, ST);
+ if ( ST && use_cons)
+ {
+ cons=name_is_in_list ("con", ST->name,ST->nseq+1, 100);
+ if ( cons==-1)cons=name_is_in_list ("cons", ST->name,ST->nseq+1, 100);
+ if ( cons==-1)cons=name_is_in_list ("Cons", ST->name,ST->nseq+1, 100);
+ if ( cons==-1)
+ {
+ use_cons=0;
+ fprintf (stderr, "WARNING: Could Not Use the Consensus Sequence [WARNING:%s]\n", PROGRAM);
+ }
+ }
+
+ A->residue_case=KEEP_CASE;
+ for ( a=0; a< A->nseq; a++)
+ {
+ if(value!=10 && ST && !use_cons)
+ {
+ c=name_is_in_list (A->name[a], ST->name, ST->nseq,100);
+ if (c==-1)st=11;
+ }
+
+ for ( b=0; b< A->len_aln; b++)
+ {
+ if ( value==10 || !ST)st=11;
+ else if ( ST && use_cons)
+ {
+ st=(isdigit(ST->seq_al[cons][b]))?ST->seq_al[cons][b]-'0':ST->seq_al[cons][b];
+ }
+ else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b];
+
+
+ if ( st==value || value==-1 || st==NO_COLOR_RESIDUE)
+ {
+ if ( n_symbol==UPPER && !symbol_list)A->seq_al[a][b]=toupper (A->seq_al[a][b]);
+ else if ( n_symbol==LOWER && !symbol_list)A->seq_al[a][b]=tolower (A->seq_al[a][b]);
+ else if ( n_symbol==SWITCHCASE && !symbol_list)
+ {
+ if ( !isalpha(A->seq_al[a][b]));
+ else if (isupper (A->seq_al[a][b]))A->seq_al[a][b]=tolower (A->seq_al[a][b]);
+ else if (islower (A->seq_al[a][b]))A->seq_al[a][b]=toupper (A->seq_al[a][b]);
+ }
+ else if ( n_symbol==DELETE && !symbol_list)A->seq_al[a][b]='-';
+ else
+ {
+ A->seq_al[a][b]=convert(A->seq_al[a][b],n_symbol,symbol_list);
+ }
+ }
+
+ }
+ }
+ return A;
+ }
+
+
+char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c);
+char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c)
+{
+ static Alignment *I;
+ static Alignment *O;
+ int a, b, o, i;
+
+ float tp,tn,fp,fn,best, sp, sn, sen2;
+ float best_pred=-1;
+ int best_motif=0;
+
+
+ int n1;
+ static char ***alp;
+ static int *alp_size;
+
+ char ***motif_list;
+ int n;
+
+
+ if (!I)
+ {
+ I=copy_aln(A, NULL);
+ O=copy_aln(A, NULL);
+ }
+
+
+
+ I->nseq=O->nseq=I->len_aln=O->len_aln=0;
+ for (a=0; a<A->len_aln; a++)
+ {
+ if (pos[a])
+ {
+ for (i=o=0,b=0; b<A->nseq; b++)
+ {
+
+ if ( is_gap(A->seq_al[b][a]))return 0;
+ if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
+ else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
+ }
+ I->len_aln++;
+ O->len_aln++;
+ }
+ }
+
+ if (O->len_aln==0 || I->len_aln==0) return 0;
+ O->nseq=o;
+ I->nseq=i;
+ for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
+ for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
+
+ alp=vcalloc ( sizeof (char**), I->len_aln);
+ alp_size= vcalloc ( I->len_aln, sizeof (int));
+ for (a=0; a<I->len_aln; a++)
+ {
+ char *col;
+ alp[a]=string2alphabet ( (col=aln_column2string (I,a)),2, &alp_size[a]);
+ vfree (col);
+ }
+
+
+
+ motif_list=generate_array_string_list (I->len_aln, alp, alp_size, &n, NULL, OVERLAP);
+ best_pred=best_motif=0;
+ for (a=0; a<n; a++)
+ {
+
+ tp=tn=fp=fn=0;
+
+ for (b=0; b<I->nseq; b++)
+ {
+ if (match_motif (I->seq_al[b], motif_list[a]))tp++;
+ else fn++;
+ }
+ for (b=0; b<O->nseq; b++)
+ {
+ if (match_motif (O->seq_al[b], motif_list[a]))fp++;
+ else tn++;
+ }
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+
+ if (best> best_pred)
+ {
+ best_pred=best;
+ best_motif=a;
+ }
+ }
+
+ output_Alignment_without_header ( I, stdout);
+ fprintf ( stdout, "\n");
+ output_Alignment_without_header ( O, stdout);
+
+
+ fprintf ( stdout, "\nMotifCompound %d pred: %.2f motif: ", c, best_pred);
+ for (n1=0, a=0; a<I->len_aln; a++)
+ {
+ char *m;
+ int l;
+ m=motif_list[best_motif][a];
+ fprintf ( stdout, "[%s]-", m);
+ l=strlen (m);
+ n1+=(l==1 && !strm ("*",m) )?1:0;
+ }
+ fprintf (stdout, "SCORE: %d", n1);
+
+ for (a=0; a<n; a++)vfree (motif_list[a]);
+ vfree (motif_list);
+ free_arrayN((void ***) alp, 3);
+ vfree (alp_size);
+
+ return NULL;
+}
+
+
+
+
+void explore_weight_matrix (Alignment *A, Alignment *B, int range, int n, int *array);
+void explore_weight_matrix (Alignment *A, Alignment *B, int range, int n, int *array)
+{
+ int a;
+ if ( n==A->len_aln)
+ {
+ fprintf ( stdout, "\n W:");
+ for (a=0; a<A->len_aln; a++)fprintf ( stdout, "%d", array[a]);
+ fprintf ( stdout, " %.4f",(float)sar_aln2r(A,B,array,0));
+ return;
+ }
+ else
+ {
+ for ( a=0; a<range; a++)
+ {
+ array[n]=a;
+ explore_weight_matrix (A, B, range, n+1, array);
+ }
+ }
+}
+float search_best_combo(Alignment *A, Alignment *B);
+void search_best_combo_sar_aln(Alignment *A, Alignment *B);
+void search_best_combo_sar_aln(Alignment *A, Alignment *B)
+{
+ int a,b,c;
+ Alignment *S;
+ float s;
+ int w=5;
+
+ S=copy_aln (B, NULL);
+ S->len_aln=w;
+ for ( a=0; a<B->len_aln-w;a++)
+ {
+ for (b=0; b<B->nseq; b++)
+ {
+ for (c=0; c<w; c++)
+ {
+ S->seq_al[b][c]=B->seq_al[b][a+c];
+ }
+ S->seq_al[b][c]='\0';
+ }
+
+ s=search_best_combo (A, S);
+ fprintf ( stdout,"\nP: XXXX \nP: XXXXX A=%d / %d", a, B->len_aln);
+
+ }
+
+}
+
+float search_best_combo(Alignment *A, Alignment *B)
+{
+ int a, b, c, d, best_pos,nl, max;
+ float best_score, score;
+ int *list, *pos;
+
+ int w;
+ int combo_mode=1; //1: greedy 2: consider all thw w combinations;
+ FILE *fp2;
+ static int **M;
+ max=2;
+ int delta=0;
+ w=1;
+
+ pos=vcalloc ( A->len_aln, sizeof (int));
+ list=vcalloc (A->len_aln, sizeof (int));
+ nl=0;
+
+ if ( combo_mode==1)
+ {
+ for (a=0; a< max; a++)
+ {
+ for (best_score=-9999,best_pos=0,b=0; b< A->len_aln-w; b++)
+ {
+ for (c=0; c<nl; c++)pos[list[c]]=1;
+ for (c=0; c<w; c++)pos[b+c]=1;
+ score=sar_aln2r(A,B,pos,0);
+ if ( score>best_score)
+ {
+ best_score=score;
+ best_pos=b;
+ }
+ for (c=0; c<w; c++)pos[b+c]=0;
+ }
+ if (best_pos==list[nl-1])break;
+ list[nl++]=best_pos;
+ for (b=0; b<nl; b++) pos[list[b]]=1;
+ fprintf ( stdout, "\n%2d P: %d S:%.3f Delta= %d", nl,best_pos, best_score, (int)sar_aln2delta(A,B, pos,0));
+ for (b=0; b<nl; b++) pos[list[b]]=0;
+
+
+ }
+ for (a=0; a<nl; a++) pos[list[a]]=1;
+ fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));
+
+ }
+ else if ( combo_mode==2)
+ {
+ int *array;
+ char *tmpf;
+ FILE *fp;
+ char *buf=NULL;
+ int *preset, n_preset;
+
+ tmpf=vtmpnam (NULL);
+ max=1;
+ generate_array_int_list (max, 0,A->len_aln-1, 1,NULL, tmpf);
+ printf_system ( "cp %s testfile", tmpf);
+ buf=vcalloc ( 1000, sizeof (char));
+ fp=vfopen (tmpf, "r");
+ best_score=-99999;
+
+ n_preset=0;
+ preset=vcalloc (A->len_aln, sizeof (int));
+ preset[n_preset++]=353;
+ preset[n_preset++]=361;
+ //preset[n_preset++]=365;
+ //preset[n_preset++]=187;
+ //preset[n_preset++]=397;
+ //preset[n_preset++]=492;
+
+
+ while ( (buf=vfgets ( buf, fp))!=NULL)
+ {
+
+ array=string2num_list (buf);
+
+ for (a=1; a<=max; a++)
+ {
+ pos[array[a]]=1;
+ }
+ for ( a=0; a<n_preset; a++)pos[preset[a]]=1;
+
+ score=sar_aln2r(A,B,pos,0);
+
+ if ( score>best_score)
+ {
+ best_score=score;
+ fprintf ( stdout, "\n");
+ for (a=0; a<n_preset; a++)fprintf (stdout, "%2d ", preset[a]);
+ for (a=1; a<=max; a++)fprintf (stdout, "%2d ", array[a]);
+ fprintf ( stdout, " R: %.3f", best_score);
+ for (nl=0,a=0; a<n_preset; a++)list[nl++]=preset[a];
+ for (a=1; a<=max; a++)list[nl++]=array[a];
+ }
+ //if ( score!=0)HERE ("R=%.2f", score);
+ for (b=1; b<=max; b++)
+ pos[array[b]]=0;
+ vfree (array);
+ }
+ fprintf ( stdout, "\n");
+ vfclose (fp);
+ //for (a=0; a<max; a++)fprintf (stdout, "%2d ", array[best_pos][a]);
+ //fprintf ( stdout, " R: %.3f", best_score);
+ }
+ for (c=0; c<B->len_aln; c++)
+ {
+ sar_aln2motif (A,B,pos, c);
+
+ }
+ myexit (EXIT_FAILURE);
+ HERE ("***************");
+ fp2=vfopen ("aln.aln", "w");
+ for (a=0; a<A->nseq; a++)
+ {
+ fprintf (fp2, ">%s\n", A->name[a]);
+ for ( b=0; b<nl; b++)fprintf (fp2, "%c", A->seq_al[a][list[b]]);
+ fprintf ( fp2, "\n");
+ }
+ vfclose (fp2);
+ HERE ("Output aln.aln");
+ if (1)
+ {
+ float tp=0, tn=0, fp=0, fn=0, pp2=0,pp=0, sn,sn2, sp;
+ int **result,**result2,**compound_score, *ref_score,n2,n, s, p, c;
+ Alignment *AI, *AO;
+ int simI, simO;
+
+ compound_score=declare_int (B->len_aln, 2);
+ ref_score=vcalloc (nl, sizeof (int));
+
+ result=declare_int (B->len_aln*A->nseq*A->nseq, 2);
+ result2=declare_int (B->len_aln*A->nseq*A->nseq, 2);
+
+ for (n2=c=0; c< B->len_aln; c++)
+ {
+
+ int sar1, sar2;
+ pp=tp=tn=fp=fn=0;
+ if (!M)M=read_matrice ("blosum62mt");
+ for (n=0,a=0; a<A->nseq-1; a++)
+ {
+ for (b=a+1; b<A->nseq;b++)
+ {
+ for (s=0,p=0; p<nl; p++)
+ {
+ char r1, r2;
+
+ r1=A->seq_al[a][list[p]];
+ r2=A->seq_al[b][list[p]];
+ if ( !is_gap (r1) && !is_gap(r2))s+=M[r1-'A'][r2-'A'];
+ }
+ result2[n2][0]=result[n][0]=s;
+
+ sar1=B->seq_al[a][c];sar2=B->seq_al[b][c];
+
+ if (sar1=='I' && sar1==sar2)
+ {
+ result2[n2][1]=result[n][1]=1;
+ pp++;pp2++;
+ n++;n2++;
+ }
+ else if ( sar1==sar2 && sar1=='O')
+ {
+ ;
+ }
+ else
+ {
+ result2[n2][1]=result[n][1]=0;
+ n++;n2++;
+ }
+ //else if ( s1==s2=='O')result[n][1]=-1;
+ }
+ }
+
+ if (pp==0)continue;
+ sort_int_inv (result, 2, 0, 0, n-1);
+
+
+ for (tp=0,a=0; a<n; a++)
+ {
+ tp+=result[a][1];
+ if ((pp-tp) == (a-tp))break;
+ }
+ fp=a-tp;
+ fn=pp-tp;
+ tn=n-pp;
+
+ sn=(tp/(tp+fn));
+ sn2=(tp/(tp+fp));
+ sp=(tn/(tn+fp));
+ fprintf ( stdout, "\nCompound %3d sn: %.3f sn2: %.3f sp: %.3f MIN: %.3f",c,sn, sn2,sp, MIN((MIN(sn,sn2)),sp));
+ compound_score[c][0]=c;
+ compound_score[c][1]=1000*MIN((MIN(sn,sn2)),sp);
+ }
+
+ sort_int_inv (compound_score,2, 1, 0, B->len_aln-1);
+
+ fp2=vfopen ("compound.fasta", "w");
+ for (d=0; d<nl; d++)
+ {
+ int r1, r2;
+ for (n=0,a=0;a<A->nseq; a++)
+ for (b=0; b<A->nseq; b++)
+ {
+ r1= A->seq_al[b][list[d]];
+ r2= A->seq_al[b][list[d]];
+ if (is_gap(r1) || is_gap(r2))continue;
+ else
+ {
+ ref_score[d]+=M[r1-'A'][r2-'A'];
+ n++;
+ }
+ }
+ ref_score[d]/=n;
+ }
+ AO=copy_aln (A, NULL);
+ AI=copy_aln (A,NULL);
+ AO->len_aln=AI->len_aln=nl;
+ for (a=0; a<A->nseq; a++)AO->seq_al[a][nl]=AI->seq_al[a][nl]='\0';
+
+ for (a=0; a<B->len_aln; a++)
+ {
+ fprintf (stdout, "\n>%4d %4d ", compound_score[a][0], compound_score[a][1]);
+ for (b=0; b<B->nseq; b++) fprintf (stdout, "%c", B->seq_al[b][compound_score[a][0]]);
+ fprintf ( stdout, "\n");
+
+ for (AI->nseq=0,b=0; b<B->nseq; b++)
+ {
+ if (B->seq_al[b][compound_score[a][0]]=='O')continue;
+ fprintf ( stdout, "\n\t");
+ for (c=0; c<nl; c++)
+ {
+ fprintf ( stdout, "%c", A->seq_al[b][list[c]]);
+ AI->seq_al[AI->nseq][c]=A->seq_al[b][list[c]];
+ }
+ AI->nseq++;
+ }
+ fprintf ( stdout, "\n\t");
+ for (d=0; d<nl; d++)
+ {
+ for (score=0,n=0,b=0; b<B->nseq; b++)
+ {
+ if (B->seq_al[b][compound_score[a][0]]=='O')continue;
+ for (c=0; c<B->nseq; c++)
+ {
+ if (B->seq_al[c][compound_score[a][0]]=='O')continue;
+ {
+ int r1, r2;
+
+ r1= A->seq_al[b][list[d]];
+ r2= A->seq_al[b][list[d]];
+ if (is_gap(r1) || is_gap(r2))continue;
+ else score+=M[r1-'A'][r2-'A'];
+ n++;
+ }
+ }
+ }
+ score/=n;
+ if ((float)score/(float)ref_score[d]>1.2)fprintf ( stdout, "*");
+ else fprintf ( stdout, " ");
+ }
+ for (AO->nseq=0,b=0; b<B->nseq; b++)
+ {
+ if (B->seq_al[b][compound_score[a][0]]=='I')continue;
+ fprintf ( stdout, "\n\t");
+ for (c=0; c<nl; c++)
+ {
+ AO->seq_al[AO->nseq][c]=A->seq_al[b][list[c]];
+ fprintf ( stdout, "%c", A->seq_al[b][list[c]]);
+ }
+ AO->nseq++;
+ }
+ simI=aln2sim (AI, "blosum62mt"); simO=aln2sim (AO, "blosum62mt");
+ fprintf ( stdout, "\nDELTA: I: %d O: %d %d",simI,simO, simI-simO);
+ delta+=simI-simO;
+ }
+
+ for ( a=0; a<B->nseq; a++)
+ {
+
+ fprintf ( fp2, ">%s\n", B->name[a]);
+ for (b=0; b<B->len_aln/2; b++)
+ fprintf ( fp2, "%c", B->seq_al[a][compound_score[b][0]]);
+ fprintf (fp2, "\n");
+ }
+ vfclose (fp2);
+ HERE ("OUTPUT compound.fasta");
+ result=result2;
+ n=n2;
+ pp=pp2;
+
+ sort_int_inv (result, 2, 0, 0, n-1);
+
+
+ for (tp=0,a=0; a<n; a++)
+ {
+ tp+=result[a][1];
+ if ((pp-tp) == (a-tp))break;
+ }
+ fp=a-tp;
+ fn=pp-tp;
+ tn=n-pp;
+
+ sn=(tp/(tp+fn));
+ sn2=(tp/(tp+fp));
+ sp=(tn/(tn+fp));
+ fprintf ( stdout, "\nTOT: sn: %.3f sn2: %.3f sp: %.3f MIN: %.3f",sn, sn2,sp, MIN((MIN(sn,sn2)),sp));
+
+ }
+ HERE ("Delta= %d", delta);
+
+
+ /*
+ C=copy_aln(A, NULL);
+ for (a=0; a< nl; a++)
+ for (b=0; b<A->nseq; b++)
+ C->seq_al[b][a]=A->seq_al[b][list[a]];
+ C->len_aln=nl;
+ array=vcalloc (C->len_aln, sizeof (int));
+ explore_weight_matrix (C, B, 6,0, array);
+ */
+
+ return best_score;
+}
+
+
+void count_misc (Alignment *A, Alignment *B)
+{
+ int **done, a, b, c, d, e,f, g, *list, n, score;
+ double **slist, *r;
+ int *pos;
+ int w=1;
+
+ search_best_combo (A,B);
+ myexit (EXIT_FAILURE);
+ pos=vcalloc (A->len_aln+1, sizeof (int));
+ /*
+ pos[354]=1;
+ pos[362]=1;
+ pos[366]=1;
+ pos[398]=1;
+ pos[476]=1;
+
+
+ fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));myexit (EXIT_FAILURE);
+ */
+ for (a=0; a< A->len_aln-w; a++)
+ {
+ for (c=0; c<w; c++)
+ {
+ pos[a+c]=1;
+ }
+ pos[398]=1;
+ pos[362]=1;
+ pos[354]=1;
+ pos[366]=1;
+ pos[419]=1;
+ pos[494]=1;
+ pos[476]=1;
+ pos[337]=1;
+ fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a+1, (float)sar_aln2r(A,B,pos,0));
+ for (c=0; c<w; c++)
+ {
+ pos[a+c]=0;
+ }
+ }
+
+ myexit (EXIT_FAILURE);
+ for (a=0; a<w; a++) pos[a]=1;
+ for (a=w; a< A->len_aln-1; a++)
+ {
+ pos[a-w]=0;
+ pos[a]=1;
+ fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a, (float)sar_aln2r(A,B,pos,0));
+ }
+
+ myexit (EXIT_FAILURE);
+ pos[2]=1;
+ pos[3]=1;
+
+
+
+ explore_weight_matrix (A, B,3, 0,pos);
+ myexit (EXIT_FAILURE);
+
+ for (a=0; a<A->len_aln; a++)
+ for ( b=0; b<A->len_aln; b++)
+ for (c=0; c<A->len_aln; c++)
+ for (d=0; d<A->len_aln; d++)
+ for (f=0; f<A->len_aln; f++)
+ for (g=0; g<A->len_aln; g++)
+ {
+ e=0;
+ pos[e++]=a;
+ pos[e++]=b;
+ pos[e++]=c;
+ pos[e++]=d;
+ pos[e++]=f;
+ pos[e++]=g;
+ pos[e++]=-1;
+ fprintf ( stdout, "\n%d %d %d %d %d %d %.3f", a, b,c,d,f, g, sar_aln2r(A,B, pos,0));
+
+ }
+
+ myexit (EXIT_FAILURE);
+
+
+ slist=declare_double (A->nseq*A->nseq*10, 2);
+ done=declare_int (256, 256);
+ list=vcalloc ( A->nseq, sizeof (int));
+
+ for (a=0; a<A->len_aln-1; a++)
+ {
+ for (b =0; b<256; b++)for (c=0; c<256; c++)done[b][c]=0;
+
+ for (b=0; b<A->nseq-1; b++)
+ {
+ int r1, r2;
+ r1=A->seq_al[b][a];
+ r2=A->seq_al[b][a+1];
+ if (done[r1][r2])continue;
+ n=0;
+ done[r1][r2]=1;
+ list[n++]=b;
+ fprintf ( stdout, "\n%3d %c%c: %s ",a+1, r1, r2, A->name[b]);
+ for ( c=b+1; c<A->nseq; c++)
+ {
+ if (r1==A->seq_al[c][a] && r2==A->seq_al[c][a+1])
+ {
+ fprintf ( stdout, "%s ", A->name[c]);
+ list[n++]=c;
+ }
+
+ }
+ if (B && n>1)
+ {
+ for (e=0,score=0,c=0; c<n-1; c++)
+ for (d=c+1; d<n; d++,e++)
+ score+=get_sar_sim2(B->seq_al[list[c]], B->seq_al[list[d]]);
+ fprintf ( stdout, " Score=%d", score/e);
+ }
+ }
+ }
+ for (score=0,e=0,a=0; a<A->nseq-1; a++)
+ for (b=a+1; b<A->nseq; b++,e++)
+ {
+ score+=get_sar_sim2(B->seq_al[a], B->seq_al[b]);
+ }
+ fprintf (stdout,"AVG=%d", score/e);
+ for (n=0,a=0; a< A->nseq-1; a++)
+ {
+ static int **M;
+ int sim;
+ if (!M)M=read_matrice ("blosum62mt");
+
+
+ for (b=a+1; b<A->nseq; b++)
+ {
+ int n11, n01, n10, n00, n1;
+
+ for (sim=d=0;d<A->len_aln; d++)
+ {
+ int r1, r2;
+ r1=A->seq_al[a][d];
+ r2=A->seq_al[b][d];
+ sim+=(r1==r2)?1:0;
+ //sim +=(M[r1-'A'][r2-'A']>0)?1:0;
+ }
+
+ sim=(100*sim)/(A->len_aln);//+rand()%10;
+ for (n1=n00=n11=n10=n01=score=0, d=0; d<B->len_aln; d++)
+ {
+ int r1, r2;
+ r1=B->seq_al[a][d];
+ r2=B->seq_al[b][d];
+ n11+=(r1=='I' && r2=='I');
+ n00+=(r1=='O' && r2=='O');
+ n10+=(r1=='I' && r2=='0');
+ n01+=(r1=='O' && r2=='I');
+ n1+=(r1=='I' || r2=='I');
+ }
+ score =((n11+n00)*100)/B->len_aln;
+
+ //score=get_sar_sim2(B->seq_al[a], B->seq_al[b]);
+
+ fprintf ( stdout, "\nSIM: %d SC: %d", sim, score);
+ slist[n][0]=(double)sim;
+ slist[n][1]=(double)score;
+ n++;
+ }
+ }
+ r=return_r(slist, n);
+ fprintf ( stdout, "\nR= %.4f", (float)r[0]);
+ myexit (EXIT_FAILURE);
+}
+
+int aln2ngap ( Alignment *A)
+{
+ int ngap=0, a, b;
+ for (a=0; a< A->len_aln; a++)
+ for (b=0; b<A->nseq; b++) ngap+=is_gap (A->seq_al[b][a]);
+ return ngap;
+}
+int * count_in_aln ( Alignment *A, Alignment *ST, int value, int n_symbol,char **symbol_list, int *table)
+ {
+ int a, b, c=0, d;
+ int st;
+
+ if (!table)table=vcalloc (n_symbol, sizeof (int));
+
+ A->residue_case=KEEP_CASE;
+ for ( a=0; a< A->nseq; a++)
+ {
+ if(value!=10 && ST)for ( c=0; c< ST->nseq; c++)if ( strm(ST->name[c], A->name[a]))break;
+ for ( b=0; b< A->len_aln; b++)
+ {
+ if ( value==10 || !ST)st=11;
+ else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b];
+ if ( st==value || value==-1)
+ {
+ for ( d=0; d<n_symbol; d++)table[d]+=is_in_set ( A->seq_al[a][b], symbol_list[d]);
+ }
+ }
+ }
+ return table;
+ }
+
+char *dna_aln2cons_seq ( Alignment *A)
+ {
+ int a, b, best;
+ static int **column_count;
+ static int **old_tot_count;
+ static int **new_tot_count;
+ static char *string1, *string2;
+ int **count_buf;
+ char r1, r2,*seq;
+ int NA=0, NG=1, NC=2, NT=3, IGAP=4;
+ static int MAX_EST_SIZE=10000;
+ static int size_increment=1000;
+ static int first;
+ int overlap=0, best_overlap=0;
+
+
+ seq=vcalloc ( A->len_aln+1, sizeof (char));
+
+ if (!column_count )
+ {
+ column_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
+ for ( a=0; a< MAX_EST_SIZE; a++)
+ column_count[a]=vcalloc (5, sizeof (int));
+
+ old_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
+ new_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
+ A->P=declare_profile( "agct-",MAX_EST_SIZE);
+ string1=vcalloc (MAX_EST_SIZE, sizeof (char));
+ string2=vcalloc (MAX_EST_SIZE, sizeof (char));
+ }
+ else if (A->len_aln>MAX_EST_SIZE)
+ {
+ if ( column_count)
+ {
+ for ( a=0; a< MAX_EST_SIZE; a++)
+ vfree(column_count[a]);
+ vfree(column_count);
+ vfree(old_tot_count);
+ vfree(new_tot_count);
+ vfree(string1);
+ vfree(string2);
+ }
+
+ column_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
+ for ( a=0; a< MAX_EST_SIZE+ size_increment; a++)
+ column_count[a]=vcalloc (5, sizeof (int));
+
+ old_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
+ new_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
+
+ for (a=0; a< MAX_EST_SIZE; a++)
+ {
+ old_tot_count[a]=*(column_count++);
+ for ( b=0; b<5; b++)old_tot_count[a][b]=(A->P)->count[b][a];
+ }
+ free_int ( (A->P)->count, -1);
+
+ (A->P)->count=declare_int (5, MAX_EST_SIZE+ size_increment);
+ (A->P)->max_len=MAX_EST_SIZE+ size_increment;
+ MAX_EST_SIZE+= size_increment;
+ string1=vcalloc (MAX_EST_SIZE, sizeof (char));
+ string2=vcalloc (MAX_EST_SIZE, sizeof (char));
+ }
+
+
+ sprintf ( string1, "%s",A->seq_al[0]);
+ sprintf ( string2, "%s",A->seq_al[1]);
+
+
+ string1=mark_internal_gaps(string1,'.');
+ string2=mark_internal_gaps(string2,'.');
+
+
+
+ for (b=0,a=0; a< A->len_aln; a++)
+ {
+ r1=string1[a];
+ r2=string2[a];
+
+ if ( r1==r2)
+ {
+ overlap++;
+ }
+ else
+ {
+ best_overlap=MAX(overlap, best_overlap);
+ overlap=0;
+ }
+
+
+ if (!is_gap(r1) && first==1)new_tot_count[a]=old_tot_count[b++];
+ else if (is_gap(r1) || first==0){new_tot_count[a]=*column_count;column_count++;};
+
+ if ( first==0)
+ {
+ if(r1=='a') new_tot_count[a][NA]++;
+ else if ( r1=='g')new_tot_count[a][NG]++;
+ else if ( r1=='c')new_tot_count[a][NC]++;
+ else if ( r1=='t')new_tot_count[a][NT]++;
+ else if (is_gap(r1));
+ else
+ {
+ new_tot_count[a][NA]++;
+ new_tot_count[a][NG]++;
+ new_tot_count[a][NC]++;
+ new_tot_count[a][NT]++;
+ }
+ }
+ if ( a> 0 && a<A->len_aln-1 && r1=='.')
+ {
+ new_tot_count[a][IGAP]+=((new_tot_count[a-1][NA]+new_tot_count[a-1][NG]+new_tot_count[a-1][NC]+new_tot_count[a-1][NT]));
+ }
+
+
+ if(r2=='a') new_tot_count[a][NA]++;
+ else if ( r2=='g')new_tot_count[a][NG]++;
+ else if ( r2=='c')new_tot_count[a][NC]++;
+ else if ( r2=='t')new_tot_count[a][NT]++;
+ else if ( r2=='.')new_tot_count[a][IGAP]++;
+ else if ( r2=='-');
+ else
+ {
+ new_tot_count[a][NA]++;
+ new_tot_count[a][NG]++;
+ new_tot_count[a][NC]++;
+ new_tot_count[a][NT]++;
+ }
+ (A->P)->count[0][a]=new_tot_count[a][NA];
+ (A->P)->count[1][a]=new_tot_count[a][NG];
+ (A->P)->count[2][a]=new_tot_count[a][NC];
+ (A->P)->count[3][a]=new_tot_count[a][NT];
+ (A->P)->count[4][a]=new_tot_count[a][IGAP];
+
+ best_int(4,1, &best,new_tot_count[a][NA], new_tot_count[a][NG],new_tot_count[a][NC],new_tot_count[a][NT]);
+ if( best==0) seq[a]='a';
+ else if ( best==1)seq[a]='g';
+ else if ( best==2)seq[a]='c';
+ else if ( best==3)seq[a]='t';
+ }
+
+ first=1;
+
+ seq[a]='\0';
+ fprintf ( stderr, "[Best Overlap: %d Residues]", best_overlap);
+ count_buf=old_tot_count;
+ old_tot_count=new_tot_count;
+ new_tot_count=count_buf;
+
+ return seq;
+
+ }
+
+char *aln2cons_maj ( Alignment *A, int ns, int *ls, int n_groups, char **group_list)
+ {
+ char *seq;
+ int a, b;
+ int len;
+ int clean_ls=0;
+ static int *aa;
+
+ if ( !aa) aa=vcalloc (1000, sizeof (int));
+
+ len=strlen (A->seq_al[ls[0]]);
+ seq=vcalloc (len+1, sizeof (char));
+
+ if ( ns==0)
+ {
+ ns=A->nseq;
+ ls=vcalloc ( A->nseq, sizeof (int));
+ for ( a=0; a< A->nseq; a++)ls[a]=a;
+ clean_ls=1;
+ }
+
+ for ( a=0; a<len; a++)
+ {
+ int best_s=0, best_aa=0, r;
+ for (b=0; b< ns; b++)
+ {
+ r=tolower(A->seq_al[ls[b]][a]);
+ aa[r]++;
+ if (!is_gap(r) && aa[r]>best_s)
+ {
+ best_s=aa[r];
+ best_aa=r;
+ }
+ seq[a]=best_aa;
+ }
+ for (best_s=0, best_aa=0,b=0; b< ns; b++)
+ {
+ aa[tolower(A->seq_al[ls[b]][a])]=0;
+ }
+ }
+ if ( clean_ls)vfree(ls);
+ seq[a]='\0';
+
+ return seq;
+ }
+
+char *aln2cons_seq ( Alignment *A, int ns, int *ls, int n_groups, char **group_list)
+ {
+ char *seq;
+ int a, b, c;
+ int best_group=0;
+ int aa_group=0;
+ int *group;
+ int len;
+ int clean_ls=0;
+
+ len=strlen (A->seq_al[ls[0]]);
+ seq=vcalloc (len+1, sizeof (char));
+
+ if ( ns==0)
+ {
+ ns=A->nseq;
+ ls=vcalloc ( A->nseq, sizeof (int));
+ for ( a=0; a< A->nseq; a++)ls[a]=a;
+ clean_ls=1;
+ }
+
+
+ if ( !group_list)
+ {
+ group_list=declare_char ( 26, 2);
+ for ( a=0; a<26; a++)group_list[a][0]=a+'a';
+ n_groups=26;
+ aa_group=1;
+ }
+
+
+ for ( a=0; a<len; a++)
+ {
+ group=vcalloc (n_groups+1, sizeof (int));
+ for (best_group=0,b=0; b< ns; b++)
+ {
+ if ( !is_gap(A->seq_al[ls[b]][a]))
+ {
+ for (c=0; c< n_groups; c++)
+ if ( is_in_set (tolower(A->seq_al[ls[b]][a]), group_list[c]))
+ {group[c]++;
+ best_group=(group[c]>group[best_group])?c:best_group;
+ }
+ }
+ seq[a]=group_list[best_group][0];
+ }
+ vfree (group);
+ }
+ seq[a]='\0';
+ if ( aa_group) free_char (group_list, -1);
+
+ if ( clean_ls)vfree(ls);
+
+ return seq;
+ }
+
+Alignment *aln2conservation ( Alignment *A, int threshold,char *seq)
+{
+ int a, b, c, d, i, c1, c2;
+ int *pos;
+ float *eval;
+ float tot=0;
+ float tn=0;
+ int **sim;
+ int w=0;
+
+ pos =vcalloc (A->len_aln, sizeof (int));
+ eval=vcalloc (A->len_aln, sizeof (int));
+ sim=aln2sim_mat (A, "idmat");
+ if (seq)i=name_is_in_list (seq, A->name, A->nseq, 100);
+ else i=0;
+
+ if ( i==-1) {HERE ("%s is an unknown:sequence [FATAL]"); myexit (EXIT_FAILURE);}
+
+ for (a=0; a<A->len_aln; a++)
+ {
+ double s;
+ int e;
+ for (c=0,e=a-w; e<=a+w; e++)
+ {
+ if (e<0 || e==A->len_aln)continue;
+ c1=toupper (A->seq_al[i][e]);
+ for (b=0; b<A->nseq; b++)
+ {
+ c2=toupper (A->seq_al[b][a]);
+ if (c1==c2)
+ {
+ c++;
+ s=(double)((double)sim[i][b]/(double)(100));
+
+ }
+ else
+ {
+ s=(double)(((double)100-(double)sim[i][b])/(double)(100));
+ }
+ eval[a]+=(s==0)?0:log(s);
+ }
+ }
+ pos[a]=(c*100)/A->nseq;
+ if (!is_gap(c1)){tot+=pos[a]; tn++;}
+
+ if (pos[a]>=threshold)A->seq_al[i][a]=toupper (A->seq_al[i][a]);
+ else A->seq_al[i][a]=tolower (A->seq_al[i][a]);
+ }
+ fprintf (stdout, ">%s %s [i=%d]\n%s\n", A->name[i],A->aln_comment[i],i, A->seq_al[i]);
+ tot=(tn>0)?(float)tot/(float)tn:0;
+
+ for (d=0,a=0; a<A->len_aln; a++)
+ {
+ fprintf (stdout, "# %c %4d", A->seq_al[i][a],pos[a]);
+
+
+ if ( !is_gap (A->seq_al[i][a]))
+ {
+ fprintf (stdout, " LogOdd: %6.2f ", (tot==0 || pos[a]==0)?0:(float)log((float)pos[a]/tot));
+ fprintf ( stdout, " Pos: %5d E-Val: %9.2f", ++d, eval[a]/(A->nseq));
+ }
+ fprintf ( stdout, "\n");
+ }
+ fprintf ( stdout, "#average conservation: %.2f", tot);
+ myexit (EXIT_SUCCESS);
+}
+char *aln2cons_seq_mat ( Alignment *A, char *mat_name)
+{
+ return sub_aln2cons_seq_mat (A, A->nseq, NULL, mat_name);
+}
+char *sub_aln2cons_seq_mat2 ( Alignment *A,int ns, char **ls, char *mat_name)
+{
+ char *cons;
+ int *list;
+ list=name_array2index_array(ls, ns, A->name, A->nseq);
+ cons=sub_aln2cons_seq_mat ( A,ns, list, mat_name);
+ vfree (list);
+ return cons;
+}
+
+char *sub_aln2cons_seq_mat ( Alignment *A,int ns, int *ls, char *mat_name)
+{
+ int a, b, c, s;
+ char *seq, r1, r2;
+ int **mat;
+ int score=0, best_score=0, best_r=0;
+ int len;
+ int naa;
+
+ mat=read_matrice (mat_name);
+ len=strlen ( A->seq_al[(ls==NULL)?0:ls[0]]);
+ seq=vcalloc (len+1, sizeof (char));
+ for ( a=0; a<len; a++)
+ {
+ for (b=0; b<20; b++)
+ {
+ r1=AA_ALPHABET[b];
+ for ( naa=0,score=0,c=0; c<ns; c++)
+ {
+ s=(ls==NULL)?c:ls[c];
+ if ( ls && ls[c]==-1) continue;
+ else if (is_gap(A->seq_al[s][a]))continue;
+ else
+ {
+ naa++;
+ r2=A->seq_al[s][a];
+ score+=mat[r1-'A'][r2-'A'];
+ }
+ }
+ if (naa==0)best_r='-';
+ if ( b==0 || score>best_score){best_score=score; best_r=r1;}
+ }
+ seq[a]=best_r;
+ }
+ free_int (mat, -1);
+ return seq;
+}
+
+int seq_list2in_file ( TC_method *M, Sequence *S, char *list, char *file)
+{
+ X_template *T=NULL;
+
+ if ( !S)return 0;
+ else
+ {
+ int t;
+ t=tolower(M->seq_type[0]);
+
+ if ( t=='s')
+ {
+ return seq_list2fasta_file ( S, list, file, M->out_mode);
+
+ }
+ else
+ {
+ FILE *fp, *fp2;
+ int a, n, s, c;
+ int *slist;
+
+
+
+ fp=vfopen ( file, "w");
+ slist=string2num_list (list);
+ n=slist[0];
+
+ if (strlen (M->seq_type) >1)
+ {
+ add_warning( stderr, "\nERROR: Mixed seq_type not supported for external methods\n[FATAL:%s]", PROGRAM);
+ }
+
+ for ( a=2; a<n; a++)
+ {
+ s=slist[a];
+ if (t=='p')T=(S->T[s])->P;
+ else if (t=='r')T=(S->T[s])->R;
+ else if (t=='g')T=(S->T[s])->G;
+
+ if (!T && t=='r')
+ {
+ fprintf ( fp, ">%s\n%s%s", S->name[s], S->seq[s], LINE_SEPARATOR);
+ }
+ else if ( T && T->template_file && T->template_file[0])
+ {
+ fp2=vfopen (T->template_file, "r");
+ while ( (c=fgetc (fp2))!=EOF)
+ {
+ fprintf ( fp, "%c", c);
+ }
+ fprintf (fp, "%s", LINE_SEPARATOR);
+ vfclose (fp2);
+ }
+ }
+
+ fprintf (fp, "TARGET_SEQ_NAME: ");
+ for (a=2; a<n; a++)fprintf ( fp, "%s ", (S->name[slist[a]]));
+ fprintf ( fp, "%s", LINE_SEPARATOR);
+
+ vfclose (fp); vfree (slist);
+
+ }
+
+ return 1;
+ }
+}
+
+int seq_list2fasta_file( Sequence *S, char *list, char *file, char *outmode)
+ {
+ FILE *fp;
+ int n, a, s;
+ static char *buf;
+ static int blen;
+ int l;
+ //out_mode: names can only be re-converted when out mode is aln
+
+ /*Buf is used because cmalloced functions cannot go through strtok*/
+ if ( !S)return 0;
+ else
+ {
+ fp=vfopen ( file, "w");
+ if ( !list)
+ {
+ for ( a=0; a<S->nseq; a++)
+ {
+ if (outmode && strm (outmode, "aln"))fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[a], CODE),S->name[a], S->seq[a]);
+ else fprintf ( fp, ">%s %s\n%s\n", S->name[a],S->name[a], S->seq[a]);
+ }
+ }
+ else
+ {
+ int **list2;
+ int max;
+
+ l=strlen (list);
+ if ( l>blen)
+ {
+ if (buf)vfree(buf);
+ buf=vcalloc ( strlen (list)+1, sizeof (char));
+ sprintf ( buf, "%s", list);
+ blen=l;
+ }
+ n=atoi(strtok (list,SEPARATORS));
+
+ list2=declare_int (n, 2);
+ max=n*1000;
+ for ( a=0; a<n; a++)
+ {
+ list2[a][0]=atoi(strtok (NULL, SEPARATORS));
+ list2[a][1]=rand()%max;
+ }
+ if ( atoigetenv ("HoT_4_TCOFFEE"))sort_int ( list2,2, 1, 0, n-1);
+ for ( a=0; a< n; a++)
+ {
+ int i=list2[a][0];
+ if (outmode && strm (outmode, "aln"))fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[i], CODE), S->name[a],S->seq[i]);
+ else fprintf ( fp, ">%s %s\n%s\n", S->name[a], S->name[a],S->seq[i]);
+ }
+ }
+ vfclose (fp);
+ }
+ return 1;
+ }
+Structure * seq2struc ( Sequence *S, Structure *ST)
+ {
+ int a, b;
+
+ for ( a=0; a< S->nseq; a++)
+ for ( b=0; b< S->len[a]; b++)
+ ST->struc[a][b+1][ST->n_fields-1]=S->seq[a][b];
+ return ST;
+ }
+
+void aln2struc (Alignment *A, Structure *ST)
+ {
+ int a, b, c;
+
+ for ( a=0; a< A->nseq; a++)
+ for (c=0, b=0; b< A->len_aln; b++)
+ {
+ if ( !is_gap (A->seq_al[a][b]))
+ {
+ ST->struc[a][c][ST->n_fields-1]=A->seq_al[a][b];
+ c++;
+ }
+ }
+ }
+Alignment *stack_aln (Alignment *A, Alignment *B)
+ {
+ int a,b;
+ int max_len=0, max_nseq=0;
+ if ( B==NULL)return A;
+ if ( A==NULL)return B;
+
+ max_nseq=A->nseq+B->nseq;
+ for (a=0; a< A->nseq; a++)max_len=MAX(strlen(A->seq_al[a]),max_len);
+ for (a=0; a< B->nseq; a++)max_len=MAX(strlen(B->seq_al[a]),max_len);
+
+ A=realloc_aln2 ( A,max_nseq,max_len+1);
+
+ for (a=A->nseq,b=0; b< B->nseq; b++, a++)
+ {
+ sprintf ( A->seq_comment[a] , "%s", B->seq_comment[b]);
+ sprintf ( A->aln_comment[a] , "%s", B->aln_comment[b]);
+
+ sprintf ( A->seq_al [a] , "%s", B->seq_al [b]);
+ sprintf ( A->name [a] , "%s", B->name[b]);
+ sprintf ( A->file [a], "%s" , B->file[b]);
+ A->order[a][0]=B->order[b][0];
+ A->order[a][1]=B->order[b][1];
+ A->score_seq[a]=B->score_seq[b];
+ A->len[a]=B->len[b];
+ }
+
+ A->len_aln=MAX(A->len_aln, B->len_aln);
+ A->nseq=A->nseq+B->nseq;
+ A->score_aln=A->score_aln+B->score_aln;
+
+ A->finished=A->finished+B->finished;
+ return A;
+ }
+
+Alignment *chseqIaln(char *name, int seq_n, int start,int len,Sequence *S, int seqIaln, Alignment *A)
+ {
+ char *seq;
+
+ seq=extract_char ( S->seq[seq_n], start, len);
+ A=realloc_aln2 (A, (A==NULL)?(seqIaln+1):MAX(A->nseq,seqIaln+1), ((A==NULL)?(strlen (seq)):MAX(strlen (seq),A->len_aln))+1);
+
+
+ sprintf ( A->seq_al[seqIaln], "%s",seq);
+
+
+ A->order[seqIaln][0]=seq_n;
+ A->order[seqIaln][1]=start;
+ sprintf ( A->name[seqIaln], "%s", name);
+ A->nseq=MAX(A->nseq, seqIaln+1);
+ A->len_aln=return_maxlen(A->seq_al, A->nseq);
+ A->S=S;
+ vfree (seq);
+ return A;
+ }
+
+Alignment * aln_gap2random_aa(Alignment *A)
+ {
+ int a, b,l;
+ char alp[200];
+
+ if (strm ( (A->S)->type, "PROTEIN"))
+ sprintf ( alp, "acefghiklmnpqrstuvwy");
+ else if ( strm ( (A->S)->type, "DNA") ||strm ( (A->S)->type, "RNA") )
+ sprintf ( alp, "agct");
+ l=strlen (alp);
+
+
+ for (a=0; a<A->nseq; a++)
+ for ( b=0; b<A->len_aln; b++)
+ if ( is_gap (A->seq_al[a][b]))A->seq_al[a][b]=alp[(int)rand()%(l)];
+ return A;
+ }
+
+Alignment * make_random_aln(Alignment *A,int nseq, int len, char *alphabet)
+ {
+ int a;
+
+
+ A=realloc_aln2(A, nseq, len+1);
+
+ A->nseq=0;
+ A->len_aln=len;
+ for ( a=0; a< A->nseq; a++)sprintf ( A->file[a], "random alignment");
+ for ( a=0; a< nseq; a++)
+ A=add_random_sequence2aln(A,alphabet);
+ return A;
+ }
+Alignment * add_random_sequence2aln( Alignment *A, char *alphabet)
+ {
+ int a, n;
+
+ vsrand(0);
+
+ n=strlen(alphabet);
+ A=realloc_alignment2 (A, A->nseq+1, A->len_aln+1);
+
+ for ( a=0; a< A->len_aln; a++)A->seq_al[A->nseq][a]=alphabet[rand()%n];
+ if (! A->name[A->nseq][0])
+ {
+ for ( a=0; a<10; a++)A->name[A->nseq][a]=alphabet[rand()%n];
+ A->name[A->nseq][a]='\0';
+ }
+
+ A->nseq++;
+ return A;
+ }
+
+Sequence *get_defined_residues( Alignment *A)
+ {
+ char *buf;
+ Sequence *S;
+ int a, b, s, l, r;
+ if ( !A || !A->S) return NULL;
+
+ S=duplicate_sequence (A->S);
+ for ( a=0; a< S->nseq; a++)
+ for ( b=0; b< S->len[a]; b++)S->seq[a][b]=UNDEFINED_RESIDUE;
+ buf=vcalloc(A->len_aln+1,sizeof (char));
+ for ( a=0; a< A->nseq; a++)
+ {
+ sprintf ( buf, "%s",A->seq_al[a]);
+ ungap(buf);
+ l=strlen (buf);
+ s=A->order[a][0];
+
+ for ( b=1; b<= l; b++)
+ {
+ r=A->seq_cache[s][b];
+
+ if ( r>=0)S->seq[s][r-1]=(A->S)->seq[s][r-1];
+ }
+ }
+ vfree(buf);
+ return S;
+ }
+Alignment *thread_defined_residues_on_aln ( Alignment *A, Sequence *S1)
+ {
+ int a, b;
+ int gap, r,s, r2;
+ for ( a=0; a< A->nseq; a++)
+ {
+ s=A->order[a][0];
+ r=A->order[a][1];
+ for (b=0;b< A->len_aln; b++)
+ {
+ gap=is_gap(A->seq_al[a][b]);
+
+ if (!gap)
+ {
+ r+=!gap;
+ r2=A->seq_cache[s][r]-1;
+
+ if (r2>=0 && S1->seq[s][r2]==UNDEFINED_RESIDUE)
+ A->seq_al[a][b]=UNDEFINED_RESIDUE;
+ }
+ }
+ }
+ return A;
+ }
+
+int ** trim_aln_borders (char **seq1, char **seq2, int nseq)
+ {
+ int a, b, c,l1,l2;
+ char *buf1;
+ char *buf2;
+ int max;
+
+
+
+
+ max=MAX(get_longest_string (seq1,-1, NULL, NULL),get_longest_string (seq2,-1, NULL, NULL))+1;
+ buf1=vcalloc ( max, sizeof(char));
+ buf2=vcalloc ( max, sizeof(char));
+
+ for ( a=0; a< nseq; a++)
+ {
+ sprintf ( buf1, "%s", seq1[a]);
+ sprintf ( buf2, "%s", seq2[a]);
+
+
+
+ ungap (buf1);
+ ungap (buf2);
+
+ if (str_overlap ( buf1, buf2,'*')!=0)
+ {
+ l1=strlen ( seq1[a]);
+ l2=strlen ( seq2[a]);
+ for ( b=0,c=0; c< l1; c++)
+ if ( !is_gap(seq1[a][c]))seq1[a][c]=buf1[b++];
+ seq1[a][c]='\0';
+ for ( b=0,c=0; c< l2; c++)
+ if ( !is_gap(seq2[a][c]))seq2[a][c]=buf2[b++];
+ seq2[a][c]='\0';
+ }
+ }
+ vfree (buf1);
+ vfree (buf2);
+ return NULL;
+
+ }
+Sequence * merge_seq ( Sequence *IN, Sequence *OUT)
+ {
+ int a;
+
+ if ( OUT==NULL)return duplicate_sequence (IN);
+ else
+ {
+ if ( IN && check_list_for_dup( IN->name, IN->nseq))
+ {
+ fprintf ( stderr, "\nERROR: %s is duplicated in file %s[FATAL]\n", check_list_for_dup( IN->name, IN->nseq), IN->file[0]);
+ myexit (EXIT_FAILURE);
+ }
+ for ( a=0; a< IN->nseq; a++)
+ if ((OUT=add_sequence ( IN, OUT, a))==NULL)return NULL;
+ return OUT;
+ }
+ }
+
+Alignment *seq_name2removed_seq_name(Sequence *S, Alignment *NA, float **diff)
+{
+ int a, b, rb, s;
+ float min_diff;
+ for (a=0; a< S->nseq; a++)
+ {
+ if (name_is_in_list( S->name[a], NA->name, NA->nseq, 100)!=-1) continue;
+ for ( min_diff=100, s=0, b=0; b< NA->nseq; b++)
+ {
+ rb=name_is_in_list ( NA->name[b], S->name, S->nseq, 100);
+ if ( diff[a][rb]<min_diff)
+ {
+ s=b;
+ min_diff=diff[a][rb];
+
+ }
+ }
+ strcat ( NA->seq_comment[s], " ");
+ strcat ( NA->seq_comment[s], S->name[a]);
+ }
+ return NA;
+}
+
+
+
+
+int seq_name2index (char *name, Sequence *S)
+{
+ if ( !S) return -1;
+ else return name_is_in_list ( name, S->name, S->nseq, MAXNAMES+1);
+}
+char * seq_name2coor ( char *s, int *start, int *end, char sep)
+{
+ /*name|start|end */
+ char n1[100], n2[100];
+ int a=0, b=0, c=0;
+
+ n1[0]=n2[0]='\0';
+ start[0]=end[0]=0;
+
+ while ( s[a]!=sep && s[a]!='\0')a++;
+ if ( s[a]=='\0')return s;
+ else
+ s[a++]='\0';
+
+
+
+ while ( s[a]!=sep && s[a]!='\0')n1[b++]=s[a++];
+
+ if ( s[a]=='\0'){n1[b]='\0';if ( n1[0])start[0]=atoi(n1);return s;}
+ else s[a++]=n1[b]='\0';
+
+
+ while ( s[a]!=sep && s[a]!='\0')n2[c++]=s[a++];
+ n2[c]='\0';
+
+
+ if ( n1[0])start[0]=atoi(n1);
+ if ( n2[0])end[0]=atoi(n2);
+
+
+ return s;
+}
+
+Sequence *extract_one_seq(char *n,int start, int end, Alignment *S, int keep_name)
+ {
+
+ int seq, a;
+ FILE*fp;
+ char *name;
+ Sequence *OUT_S;
+
+
+ if ( n[0]=='#')seq=S->nseq;
+ else if ( (seq=name_is_in_list (n, S->name, S->nseq, 100)+1)!=0);
+ else if (is_number (n) && (seq=atoi(n))!=0) seq=atoi(n);
+ else
+ {
+ fprintf ( stderr, "\nCould not find Sequence %s [FATAL]", n);
+ myexit (EXIT_FAILURE);
+ }
+ seq--;
+
+ name=vtmpnam ( NULL);
+ fp=vfopen ( name, "w");
+ if ( start && end &&!keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start, end);
+ else if ( start && end==0 && !keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start,(int)strlen ( S->seq_al[seq]));
+ else fprintf (fp, ">%s\n", S->name[seq]);
+
+ if ( start==0 && end==0){fprintf (fp, "%s\n", S->seq_al[seq]);}
+ else if (end==0){fprintf (fp, "%s\n", S->seq_al[seq]+start-1);}
+ else
+ {
+ for ( a=start-1; a<end; a++){fprintf ( fp, "%c", S->seq_al[seq][a]);}
+ fprintf ( fp, "\n");
+ }
+
+
+ vfclose (fp);
+ OUT_S=get_fasta_sequence_num (name, NULL);
+
+ return OUT_S;
+ }
+
+
+
+Sequence * extract_sub_seq( Sequence *COOR, Sequence *S)
+ {
+ int a, b, c,s;
+ int start, end;
+
+ for ( a=0; a< S->nseq; a++)
+ {
+ if ( (s=name_is_in_list ( S->name[a], COOR->name, COOR->nseq, 100))!=-1)
+ {
+
+ sscanf ( COOR->seq_comment[s], "%d %d", &start, &end);
+ for (c=0,b=start-1; b< end; b++, c++)S->seq[a][c]=S->seq[a][b];
+ S->seq[a][c]='\0';
+ sprintf ( S->seq_comment[a], "%s",COOR->seq_comment[s]);
+
+ }
+ }
+ S=reorder_seq ( S, COOR->name, COOR->nseq);
+ return S;
+ }
+
+
+
+char * aln_column2string (Alignment *A, int p)
+ {
+ char *s;
+ int a;
+ if (p>=A->len_aln)
+ {
+ HERE ("ERROR: index (p=%d) loger than aln (l=%d) [FATAL]", p, A->len_aln);
+ myexit (EXIT_FAILURE);
+ }
+ else
+ {
+ s=vcalloc (A->nseq+1, sizeof (char));
+ for (a=0; a< A->nseq; a++)s[a]=A->seq_al[a][p];
+ }
+ return s;
+ }
+
+
+int **fix_seq_aln (Sequence *S, Alignment*A, int **cache)
+{
+ int s, b,i,nr;
+
+ if (!cache)cache=vcalloc (S->nseq, sizeof (int*));
+
+ for (s=0; s<A->nseq; s++)
+ {
+ if ((i=name_is_in_list (A->name[s], S->name, S->nseq, 100)==-1))continue;
+ for (nr=0,b=0; b<A->len_aln; b++)
+ {
+ if (!is_gap(A->seq_al[s][b]))
+ cache[i][++nr]=b+1;
+ }
+ }
+ return cache;
+}
+
+int **fix_seq_seq (Sequence *S0, Sequence *Sx)
+{
+ //Expresses seq1 in terms of s2
+ //sequences 0-N
+ //residues 1-N+1
+ int s0, r0,i;
+ int **index;
+
+ index=vcalloc ( S0->nseq, sizeof (int*));
+ for (s0=0; s0<S0->nseq; s0++)
+ {
+ int l=S0->len[s0];
+ index[s0]=vcalloc (l+1, sizeof (int));
+ i=index[s0][0]=name_is_in_list (S0->name[s0], Sx->name, Sx->nseq, 100);
+ if (i==-1);
+ else if (strm (S0->seq[s0], Sx->seq[i]))
+ {
+ for (r0=1; r0<=l; r0++)
+ {
+ index [s0][r0]=r0;
+ }
+ }
+ else
+ {
+ int c;
+ int nr0=0;
+ int nr1=0;
+
+ Alignment *B=align_two_sequences (S0->seq[s0],Sx->seq[i],(strm(S0->type, "PROTEIN"))?"blosum62mt":"idmat",-4,-1, "myers_miller_pair_wise");
+ for (c=0; c<B->len_aln; c++)
+ {
+
+ int g0=is_gap(B->seq_al[0][c]);
+ int g1=is_gap(B->seq_al[1][c]);
+ nr0+=1-g0;
+ nr1+=1-g1;
+ if (!g0 && !g1)index[s0][nr0]=nr1;
+ }
+ if (aln2sim(B, "idmat")<20) add_warning (stderr,"Unreliable reconciliation for sequence %s. If it a PDB, check source file", S0->name[s0]);
+ free_aln (B);B=NULL;
+ }
+ }
+ return index;
+}
+int **fix_aln_seq_new (Alignment *A, Sequence *Sx)
+{
+ Sequence *S;
+ int **f;
+
+ S=aln2seq (A);
+ f=fix_seq_seq(S, Sx);
+ free_sequence (S, S->nseq);
+ return f;
+}
+Alignment * fix_aln_seq ( Alignment *A, Sequence *S)
+ {
+ int a, b, c;
+ char *buf1, *buf2;
+ int g0, g1, nr0, nr1;
+ int id, tot;
+ Alignment *B;
+
+
+ /*This function establishes the correspondance between every (1..N+1) residue of each aligned sequence
+ and its correspondance in S:
+ A->seq_cache[a][b]=x means that residue b of aligned sequence a corresponds to residue x of the sequence with tye same index in S
+ A->seq_cache[a][b]=0 means there is no correspondance.
+ a is the index of the sequence
+ Applying this function is needed for turning an alignment into a constraint list
+ */
+
+
+ if ( S==NULL)return A;
+ reorder_aln (A, S->name,S->nseq);
+ if (A->seq_cache)free_int (A->seq_cache, -1);
+ A->seq_cache=declare_int ( S->nseq, MAX((A->len_aln+1), S->max_len+1));
+
+ for (a=0; a< S->nseq; a++)
+ for ( b=0; b< A->len_aln; b++)A->seq_cache[a][b]=-1;
+
+ buf1=buf2=NULL;
+ for ( a=0; a< S->nseq; a++)
+ {
+ for (b=0; b< A->nseq; b++)
+ {
+ if (strm ( S->name[a], A->name[b]))
+ {
+ A->order[b][0]=a;
+
+ vfree (buf1);
+ buf1=vcalloc ( A->len_aln+1, sizeof (char));
+ sprintf (buf1, "%s", A->seq_al[b]);
+ ungap (buf1);
+ upper_string (buf1);
+
+ vfree(buf2);
+ buf2=vcalloc (strlen(S->seq[a])+1, sizeof (char));
+ sprintf (buf2, "%s",S->seq[a]);
+ ungap (buf2);
+ upper_string (buf2);
+
+
+
+ if ( strm (buf1,buf2))
+ {
+
+ for ( c=0; c<S->len[a]; c++)A->seq_cache[a][c+1]=c+1;
+ }
+ else
+ {
+
+ B=align_two_sequences (buf2,buf1,"blosum62mt",-4,-1, "myers_miller_pair_wise");
+ if ( getenv ("DEBUG_RECONCILIATION"))
+ {
+ fprintf (stderr, "\n[DEBUG_RECONCILIATION:fix_aln_seq]\nReconciliation of %s\nA=Ref_sequence\nB=New_seq", S->name[a]);
+ print_aln (B);
+ }
+
+ for (id=0, tot=0,nr0=0,nr1=0,c=0; c<B->len_aln; c++)
+ {
+ g0=is_gap(B->seq_al[0][c]);
+ g1=is_gap(B->seq_al[1][c]);
+ nr0+=1-g0;
+ nr1+=1-g1;
+ if ( !g0 && !g1)
+ {
+ tot++;
+ id+=(B->seq_al[0][c]==B->seq_al[1][c])?1:0;
+ A->seq_cache[a][nr1]=nr0;
+ }
+ else if (g0 && !g1)
+ {
+ A->seq_cache[a][nr1]=0;
+ }
+ }
+ if ( ((id*100)/tot)<20)
+ {
+ print_aln (B);
+ fprintf ( stderr, "\nTwo different sequences have the same name: %s", S->name[a]);
+ fprintf ( stderr, "\nIf %s is a PDBID, Make sure it identifies the right chain (A, B, 1, 2...)", S->name[a]);
+ fprintf ( stderr, "\nChain number or index must be added to the PDB id (i.e. 1gowA)");
+ fprintf ( stderr, "\nIf You want to use %s anyway, rename it with a non-PDB identifier such as seq_%s\n",S->name[a],S->name[a]);
+ myexit (EXIT_FAILURE);
+ }
+
+ free_sequence ( B->S, -1);
+ free_aln (B);
+ }
+
+ }
+ }
+ }
+ vfree(buf1);vfree(buf2);
+ return A;
+ }
+
+Sequence * add_prf2seq ( char *file, Sequence *S)
+ {
+ char **new_seq;
+ Sequence *NS;
+
+ if ( !is_aln (file)&& !is_seq (file))return S;
+ else
+ {
+ X_template *R;
+ Alignment *A;
+
+
+ R=fill_R_template(file,file, S);
+
+ A=(R->VR)->A;
+ ((R->VR)->A)->expand=1;
+ new_seq=declare_char (1,A->len_aln+1);
+ sprintf ( new_seq[0], "%s",aln2cons_seq_mat(A, "blosum62mt"));
+
+ NS=fill_sequence_struc(1, new_seq,A->file);
+ S=add_sequence (NS, S, 0);
+ (S->T[S->nseq-1])->R=R;
+
+ free_sequence (NS, NS->nseq);
+ free_char( new_seq, -1);
+
+ return S;
+ }
+ }
+int prf_in_seq ( Sequence *S)
+{
+ int a;
+
+ if ( !S) return 0;
+ else
+ {
+ for ( a=0; a< S->nseq; a++)
+ if (seq2R_template_profile(S, a)) return 1;
+ }
+ return 0;
+}
+Sequence * add_sequence ( Sequence *IN, Sequence *OUT, int i)
+ {
+ int s, a;
+
+ char *buf;
+ if (OUT==NULL)
+ {
+
+ OUT=duplicate_sequence (IN);
+ return OUT;
+ }
+ for (a=0; a<OUT->nseq; a++)
+ {
+ Alignment *P;
+ P=seq2R_template_profile (OUT, a);
+ if (!P) continue;
+ else if (name_is_in_list (IN->name[i], P->name, P->nseq, 100)!=-1) return OUT;
+ }
+
+ /*Adds sequence i of IN at the end of OUT*/
+
+ if ((s=name_is_in_list ( IN->name[i], OUT->name, OUT->nseq,STRING))==-1 )
+ {
+ OUT=realloc_sequence (OUT, OUT->nseq+1, IN->len[i]);
+ sprintf ( OUT->name[OUT->nseq],"%s",IN->name[i]);
+ sprintf ( OUT->file[OUT->nseq],"%s",IN->file[i]);
+ sprintf ( OUT->seq_comment[OUT->nseq],"%s",IN->seq_comment[i]);
+ sprintf ( OUT->aln_comment[OUT->nseq],"%s",IN->aln_comment[i]);
+
+ sprintf ( OUT->seq[OUT->nseq],"%s",IN->seq[i]);
+ OUT->len[OUT->nseq]=IN->len[i];
+ OUT->T[OUT->nseq][0]=IN->T[i][0];
+ OUT->nseq++;
+ return OUT;
+ }
+ else if ( s!=-1 && !case_insensitive_strcmp ( IN->seq[i], OUT->seq[s]))
+ {
+
+ if ( getenv4debug("DEBUG_RECONCILIATION"))fprintf ( stderr,"[DEBUG_RECONCILIATION:add_sequence]\n%s\n%s\n", IN->seq[i], OUT->seq[s]);
+
+ add_warning (stderr, "DISCREPANCY:%s in [%s] and [%s]\n", IN->name[i], IN->file[i], OUT->file[s]);
+
+
+ if (((buf=build_consensus(IN->seq[i], OUT->seq[s],"cfasta_pair_wise" ))!=NULL)||((buf=build_consensus(IN->seq[i], OUT->seq[s],"myers_miller_pair_wise" ))!=NULL))
+ {
+
+ OUT->max_len=MAX(OUT->max_len, strlen(buf));
+ OUT->min_len=MIN(OUT->min_len, strlen(buf));
+ OUT->seq =realloc_char ( OUT->seq, -1, -1,OUT->nseq,OUT->max_len+1);
+
+ sprintf ( OUT->seq[s],"%s",buf);
+ OUT->len[s]=strlen (buf);
+ vfree (buf);
+ return OUT;
+ }
+ else
+ {
+ fprintf ( stderr, "IMPOSSIBLE TO RECONCILIATE SOME SEQUENCES[FATAL:%s]\n", PROGRAM);
+ print_aln ( align_two_sequences (IN->seq[i], OUT->seq[s], "idmat", 0, 0, "fasta_pair_wise"));
+ myexit (EXIT_FAILURE);
+ return NULL;
+ }
+
+ }
+ else
+ {
+ return OUT;
+ }
+ }
+
+
+Sequence * trim_seq ( Sequence *A, Sequence *B)
+ {
+ int a;
+ Sequence *R;
+
+ if (A->nseq>B->nseq)
+ {
+ Sequence *I;
+ I=A;A=B;B=I;
+ }
+
+ R=declare_sequence (MIN(A->min_len,B->min_len), MAX(A->max_len, B->max_len), MIN(A->nseq, B->nseq));
+ R->nseq=0;
+
+ for (a=0; a< A->nseq; a++)
+ {
+ if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING+1)!=-1)
+ {
+ sprintf ( R->name[R->nseq], "%s", A->name[a]);
+ sprintf ( R->seq[R->nseq], "%s", A->seq[a]);
+ sprintf ( R->file[R->nseq], "%s", A->file[a]);
+ sprintf ( R->aln_comment[R->nseq], "%s", A->aln_comment[a]);
+ sprintf ( R->seq_comment[R->nseq], "%s", A->seq_comment[a]);
+
+ R->len[R->nseq]=A->len[a];
+ R->nseq++;
+ }
+ }
+ return R;
+ }
+
+Sequence * trim_aln_seq ( Alignment *A, Alignment *B)
+ {
+ int a;
+ static char **name_list;
+ int n=0;
+ Sequence *SA, *SB;
+ int **cache_A=NULL;
+ int **cache_B=NULL;
+ int * p;
+
+ /*This function inputs two alignments A and B
+ It removes sequences that are not common to both of them
+ It rearange the sequences so that they are in the same order
+ A decides on the order
+ The Sequences (A->S) and (B->S) are treated the same way
+ Sequences are also merged in order to detects discrepencies.
+ A pointer to S is returned
+ */
+ if (name_list)free_char (name_list, -1);
+ name_list=declare_char (MAX(A->nseq, B->nseq), STRING+1);
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1)
+ {
+ sprintf ( name_list[n++], "%s", A->name[a]);
+ }
+ }
+
+
+
+ reorder_aln ( A, name_list, n);
+ if (A->seq_cache)cache_A=duplicate_int (A->seq_cache, -1, -1);
+ if (B->seq_cache)cache_B=duplicate_int (B->seq_cache, -1, -1);
+ reorder_aln ( B, name_list, n);
+ for ( a=0; a< n; a++)
+ {
+ if ( cache_A)
+ {
+ p=A->seq_cache[A->order[a][0]];
+ A->seq_cache[A->order[a][0]]=cache_A[a];
+ cache_A[a]=p;
+ }
+ if ( cache_B)
+ {
+ p=B->seq_cache[B->order[a][0]];
+ B->seq_cache[B->order[a][0]]=cache_B[a];
+ cache_B[a]=p;
+ }
+ A->order[a][0]=B->order[a][0]=a;
+ }
+ free_int(A->seq_cache, -1);
+ free_int(B->seq_cache, -1);
+
+ A->seq_cache=cache_A;
+ B->seq_cache=cache_B;
+
+
+
+ SA=aln2seq(A);
+ SB=aln2seq(B);
+
+ A->S=B->S=merge_seq (SA, SB);
+ return A->S;
+ }
+Sequence * trim_aln_seq_name ( Alignment *A, Alignment *B)
+ {
+ int a;
+ Sequence *S;
+
+ /*This function inputs two alignments A and B
+ It removes sequences that are not common to both of them
+ It rearange the sequences so that they are in the same order
+ A decides on the order
+ */
+ S=declare_sequence ( 1, 1, A->nseq+B->nseq);
+ S->nseq=0;
+ for ( a=0; a< A->nseq; a++)
+ {
+ if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1)
+ {
+ sprintf ( S->name[S->nseq++], "%s", A->name[a]);
+ }
+ }
+ return S;
+ }
+
+
+
+char ** rm_name_tag (char **name, int nseq, char *tag)
+{
+ int a , b, ntag;
+ char **tag_list;
+ char *s;
+ char **template_list;
+ if ( !name )return NULL;
+
+ tag_list=declare_char (10, 4);
+
+ if ( tag)
+ {
+ ntag=1; sprintf ( tag_list[0], "%s", tag);
+ }
+ else
+ {
+ ntag=0;
+ sprintf ( tag_list[ntag++], "_S_");
+ sprintf ( tag_list[ntag++], "_G_");
+ }
+ template_list=declare_char (nseq, 100);
+ for ( a=0; a<nseq ; a++)
+ {
+ for ( b=0; b<ntag; b++)
+ {
+ s=strstr(name[a], tag_list[b]);
+ if ( s)
+ {
+ s[0]='\0';
+ s[2]='\0';
+ sprintf ( template_list[a], ">%s _%s_ %s", name[a], s+1, s+3);
+ break;
+ }
+ }
+ }
+
+ free_char (tag_list, -1);
+ return template_list;
+}
+Sequence * swap_header ( Sequence *S, Sequence *H)
+{
+ int a, b, n;
+
+ for ( a=0; a< S->nseq; a++)
+ {
+ if ( (n=name_is_in_list (S->name[a],H->name, H->nseq, 1000))!=-1)
+ {
+ char **list;
+
+
+ list=string2list (H->seq_comment[n]);
+ if ( list==NULL || atoi(list[0])==1)continue;
+ S->seq_comment[a]='\0';
+ sprintf (S->name[a], "%s%s%s",H->name[n], list[1], list[2]);
+ vfree ( S->seq_comment[a]);S->seq_comment[a]=vcalloc ( strlen (H->seq_comment[n])+1, sizeof (char));
+ for (b=3; b< atoi(list[0]); b++)S->seq_comment[a]=strcat (S->seq_comment[a], list[b]);
+ free_char (list, -1);
+ }
+ }
+ return S;
+}
+
+
+Sequence * profile_seq2template_seq ( Sequence *S, char *template_file, Fname *F)
+{
+ /*This function fetches potential templates associated with sequences within a profile*/
+ int i;
+ Alignment *A;
+
+ for ( i=0; i< S->nseq; i++)
+ {
+
+ if ( (A=seq2R_template_profile (S, i)))
+ {
+
+ A->S=aln2seq (A);
+ A->S=seq2template_seq (A->S, template_file, F);
+ if (!A->S)return NULL;
+ }
+ }
+
+ return S;
+}
+
+Sequence * seq2template_type(Sequence *Seq)
+{
+ //add template
+ int a, e;
+ int s;
+ struct X_template *S=NULL;
+ struct X_template *P=NULL;
+ struct X_template *R=NULL;
+ struct X_template *G=NULL;
+ struct X_template *F=NULL;
+ struct X_template *T=NULL;
+ struct X_template *E=NULL;
+ struct X_template *U=NULL;
+ Alignment *A;
+
+
+ e=' ';
+ for (a=0; a< Seq->nseq; a++)
+ {
+ if (!Seq->T[a])continue;
+ //HERE ADD a Template
+ P=seq_has_template (Seq, a, "_P_");
+ S=seq_has_template (Seq, a, "_S_");
+ R=seq_has_template (Seq, a, "_R_");
+ G=seq_has_template (Seq, a, "_G_");
+ F=seq_has_template (Seq, a, "_F_");
+ T=seq_has_template (Seq, a, "_T_");
+ E=seq_has_template (Seq, a, "_E_");
+ U=seq_has_template (Seq, a, "_U_");
+
+ s=(!P)?1:0;
+ sprintf ( (Seq->T[a])->seq_type, "%c%c%c%c%c%c%c%c", (P)?'P':e, (S)?'S':e, (S &&!P)?'s':e,(R)?'R':e, (G)?'G':e,(T)?'T':e,(E)?'E':e,(U)?'U':e);
+
+ if (R && (A=seq2R_template_profile (Seq,a)) && A->S)
+ {
+
+ A->S=seq2template_type ( A->S);
+ }
+ }
+ return Seq;
+}
+
+char * string_contains_template_tag (char *string_in)
+{
+ char string[100];
+
+ if ( strstr (string, "_P_"))return "_P_";
+ if ( strstr (string, "_S_"))return "_S_";
+ if ( strstr (string, "_R_"))return "_R_";
+ if ( strstr (string, "_G_"))return "_G_";
+ if ( strstr (string, "_F_"))return "_F_";
+ if ( strstr (string, "_T_"))return "_T_";
+ if ( strstr (string, "_E_"))return "_E_";
+ if ( strstr (string, "_U_"))return "_U_";
+
+ return NULL;
+}
+static int check_blast_is_installed (char *server);
+
+
+
+static int check_blast_is_installed (char *server)
+{
+ if (strm (server, "EBI"));
+ else if ( strm (server, "NCBI"))
+ return check_program_is_installed (NCBIWEBBLAST_4_TCOFFEE,NULL, NULL,NCBIWEBBLAST_ADDRESS, INSTALL_OR_DIE);
+ else if ( strm (server, "LOCAL"))
+ return check_program_is_installed (NCBIBLAST_4_TCOFFEE,NULL, NULL,NCBIBLAST_ADDRESS, INSTALL_OR_DIE);
+ return 1;
+}
+
+
+Sequence * vremove_seq_template_files(Sequence *S)
+{
+ return handle_seq_template_file (S, "remove");
+}
+Sequence * display_seq_template_files(Sequence *S)
+{
+ return handle_seq_template_file (S, "display");
+}
+Sequence * handle_seq_template_file (Sequence *S, char *mode)
+{
+ int a;
+ Template *T;
+
+ for (a=0; a< S->nseq; a++)
+ {
+ T=S->T[a];
+ if (T)
+ {
+ handle_X_template_files (T->P, mode);
+ handle_X_template_files (T->F, mode);
+ handle_X_template_files (T->R, mode);
+ handle_X_template_files (T->T, mode);
+ handle_X_template_files (T->E, mode);
+ }
+ }
+
+ return S;
+}
+int handle_X_template_files ( X_template *T, char *mode)
+ {
+ if (!T)return 0;
+
+ if ( strm (mode, "remove"))
+ {
+ vremove (T->template_file);
+ vremove (T->template_name);
+ }
+ else if (strm (mode, "display"))
+ {
+ char buf[100];
+ sprintf ( buf, "Template %s", template_type2type_name (T->template_type));
+ if (check_file_exists (T->template_name))display_output_filename ( stdout,buf,T->template_format,T->template_name, STORE);
+ }
+ else
+ {
+ printf_exit (EXIT_FAILURE, stderr, "\nUnkonwn mode %s for template handling [FATAL:%s]", mode, PROGRAM);
+ }
+ return 1;
+ }
+Sequence * seq2template_seq ( Sequence *S, char *template_list, Fname *F)
+{
+ /*Expected format for the template file:
+ >seq_name _X_ Target_template
+ X: S for Structures
+ G for genomes (Exoset)
+ When alternative templates are given for a sequence, the first one superseeds all the others
+ */
+
+ /*Fill the sequences*/
+ /*1: No template*/
+ char buf[1000];
+
+ int PmC,PmI,PMI;
+ int BmC,BmI,BMI;
+ char *server;
+ char *pdb_db,*prot_db;
+ char pdb_type[100];
+ char *p;
+ int remove_template_file=0;
+
+
+ remove_template_file=get_int_variable ("remove_template_file");
+ server=get_string_variable ("blast_server");
+ pdb_db=get_string_variable ("pdb_db");
+ prot_db=get_string_variable ("prot_db");
+
+ PmI=get_int_variable ("pdb_min_sim");
+ PMI=get_int_variable ("pdb_max_sim");
+ PmC=get_int_variable ("pdb_min_cov");
+
+ BmI=get_int_variable ("prot_min_sim");
+ BMI=get_int_variable ("prot_max_sim");
+ BmC=get_int_variable ("prot_min_cov");
+
+ //Set the type of the PDB structure
+ if ((p=get_string_variable ("pdb_type")))
+ {
+ sprintf ( pdb_type, "%s",p);
+ }
+ else
+ {
+ sprintf (pdb_type, "dmn");
+ }
+
+ if ( (template_list && template_list[0]=='\0') || strm ( template_list, "no_template"))
+ {
+ return S;
+ }
+ else if ( strstr (template_list, "MODE_"))//pre_set mode
+ {
+ return seq2template_seq ( S,template_list+strlen ("MODE_"),F);
+ }
+ else if ( strm ( template_list, "SSP")|| strm ( template_list, "GOR"))
+ {
+
+ /*use GOR to Predict the secondary structure*/
+ check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#ssp_template@seq#%s/%s@obs#%s/%s@cache#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir());
+ S=seq2template_seq (S,buf, F);
+ return S;
+ }
+ else if ( strm ( template_list, "PSISSP") || strm (template_list, "PSIGOR"))
+ {
+
+ /*Computes a GOR consensus on a psi-blast output*/
+ check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE);
+ check_blast_is_installed(server);
+
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psissp_template@seq#%s/%s@obs#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir(), BmI,BMI,BmC,server);
+ S=seq2template_seq (S,buf, F);
+ return S;
+ }
+ else if ( strm ( template_list, "TM"))
+ {
+
+ /*predict transmembrane structure*/
+ check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#tm_template@arch#%s/%s@psv#%s/%s@type#_T_",get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv");
+ S=seq2template_seq (S,buf, F);
+ return S;
+ }
+ else if ( strm ( template_list, "PSITM"))
+ {
+
+ /*predict transmembrane structure*/
+ check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE);
+ check_blast_is_installed(server);
+
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psitm_template@database#%s@arch#%s/%s@psv#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_T_", prot_db, get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv",get_cache_dir(), BmI,BMI,BmC,server);
+ S=seq2template_seq (S,buf, F);
+ return S;
+ }
+
+ else if (strm ( template_list, "PSIBLAST"))
+ {
+
+ check_blast_is_installed(server);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psiprofile_template@database#%s@method#psiblast@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server);
+ S=seq2template_seq (S,buf, F);
+
+ return S;
+ }
+ else if (strm ( template_list, "BLAST") )
+ {
+ check_blast_is_installed(server);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#profile_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server);
+ S=seq2template_seq (S,buf, F);
+
+ return S;
+ }
+ else if ( strm ( template_list, "EXPRESSO") || strm (template_list, "PDB"))
+ {
+ check_blast_is_installed(server);
+
+ int isRNA = 0;
+ int i;
+ for (i= 0; i < S->len[0]; ++i)
+ {
+ isRNA = (isRNA || is_rna(S->seq[0][i]));
+ }
+
+ if (isRNA)
+ {
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastn@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_@pdb_type#%s",pdb_db, get_cache_dir(),PmI,PMI,PmC, server,pdb_type);
+ }
+ else
+ {
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_@pdb_type#%s",pdb_db, get_cache_dir(),PmI,PMI,PmC, server,pdb_type);
+ }
+ return seq2template_seq (S,buf, F);
+ }
+
+ else if ( strm (template_list, "RCOFFEE") || strm (template_list, "RNA"))
+ {
+ char *file_struc_clac = vtmpnam (NULL);
+ FILE* struc_calc_f =vfopen(file_struc_clac,"w");
+ int i;
+ int j=0;
+ for (i = 0; i< S->nseq; ++i)
+ {
+ if (S->T[i]->P)
+ {
+ ++j;
+ fprintf(struc_calc_f,"%s %s\n",S->name[i],S->T[i]->P->template_file);
+ }
+ }
+ vfclose(struc_calc_f);
+ if (j == S->nseq)
+ {
+// S = seq2template_seq (S,buf,F);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#calc_rna_template@pdbfile#%s@cache#%s@type#_F_", file_struc_clac,get_cache_dir());
+ }
+ else
+ {
+ check_program_is_installed (RNAPLFOLD_4_TCOFFEE,NULL, NULL,RNAPLFOLD_ADDRESS, IS_FATAL);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#RNA_template@type#_F_");
+ if (j > 0)
+ {
+ S = seq2template_seq (S,buf,F);
+ sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#calc_rna_template@pdbfile#%s@cache#%s@type#_F_", file_struc_clac,get_cache_dir());
+ }
+ }
+// printf("IN T_\n");
+ return seq2template_seq (S,buf,F);
+ }
+
+
+ /*2: Templates from seqnames (SELF) or named like the sequences (SEQFILE)*/
+ else if ( strstr (template_list, "SELF_") ||strstr (template_list, "SEQFILE_") )
+ {
+ int a;
+ char *p;
+
+ //add template
+ for (a=0; a< S->nseq; a++)
+ {
+
+ if ( (p=strstr (template_list,"SELF_")))p=S->name[a];
+ else if ( strstr (template_list, "SEQFILE_"))p=template_list;
+ else
+ {
+ fprintf ( stderr, "\nUnkown mode for Template [FATAL:%s]\n", PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+ if ( strstr (template_list, "_P_") && !(S->T[a])->P)(S->T[a])->P =fill_P_template ( S->name[a], p,S);//PDB
+ else if ( strstr (template_list, "_S_") && !(S->T[a])->S)(S->T[a])->S =fill_S_template ( S->name[a], p,S);//Sequence
+ else if ( strstr (template_list, "_R_" )&& !(S->T[a])->R)(S->T[a])->R =fill_R_template ( S->name[a], p,S);//pRofile
+ else if ( strstr (template_list, "_G_" )&& !(S->T[a])->G)(S->T[a])->G =fill_G_template ( S->name[a], p,S);//Genomic
+ else if ( strstr (template_list, "_F_" )&& !(S->T[a])->F)(S->T[a])->F =fill_F_template ( S->name[a], p,S);//Fold
+ else if ( strstr (template_list, "_T_" )&& !(S->T[a])->T)(S->T[a])->T =fill_T_template ( S->name[a], p,S);//Trans Membrane
+ else if ( strstr (template_list, "_E_" )&& !(S->T[a])->E)(S->T[a])->E =fill_E_template ( S->name[a], p,S);//Secondary Structure
+ else if ( strstr (template_list, "_U_" )&& !(S->T[a])->U)(S->T[a])->U =fill_U_template ( S->name[a], p,S);//unicode, list template
+
+ }
+ return S;
+ }
+
+ /*2: Templates comes in a template_file*/
+ else if ( template_list==NULL || format_is_fasta (template_list))
+ {
+ Sequence *T;
+ int a, i;
+ int ntemp=0;
+ T=(template_list!=NULL)?get_fasta_sequence (template_list, NULL):S;
+ for (a=0; a< T->nseq; a++)
+ {
+
+ char *p;
+ if ((i=name_is_in_list(T->name[a], S->name, S->nseq, MAXNAMES))!=-1)
+ {
+
+ if ( (p=strstr (T->seq_comment[a], " _P_ ")) && !(S->T[i])->P &&( (S->T[i])->P=fill_P_template (S->name[i],p,S)))ntemp++;
+ else if ( (p=strstr (T->seq_comment[a], " _F_ ")) && !(S->T[i])->F &&( (S->T[i])->F=fill_F_template (S->name[i],p,S)))ntemp++;
+ else if ( (p=strstr (T->seq_comment[a], " _S_ ")) && !(S->T[i])->S &&( (S->T[i])->S=fill_S_template (S->name[i],p,S)))ntemp++;
+
+ else if ( (p=strstr (T->seq_comment[a], " _R_ ")) && !(S->T[i])->R &&( (S->T[i])->R=fill_R_template (S->name[i],p,S)))ntemp++;
+ else if ( (p=strstr (T->seq_comment[a], " _G_ ")) && !(S->T[i])->G &&( (S->T[i])->G=fill_G_template (S->name[i],p,S)))ntemp++;
+ else if ( (p=strstr (T->seq_comment[a], " _T_ ")) && !(S->T[i])->T &&( (S->T[i])->T=fill_T_template (S->name[i],p,S)))ntemp++;
+ else if ( (p=strstr (T->seq_comment[a], " _E_ ")) && !(S->T[i])->E &&( (S->T[i])->E=fill_E_template (S->name[i],p,S)))ntemp++;
+ else if ( (p=strstr (T->seq_comment[a], " _U_ ")) && !(S->T[i])->U &&( (S->T[i])->E=fill_U_template (S->name[i],p,S)))ntemp++;
+
+ if (T!=S)strcat (S->seq_comment[i], T->seq_comment[a]);
+
+ }
+ }
+
+ if (T!=S)free_sequence (T, -1);
+
+ if ( remove_template_file==2)
+ {
+ vremove (template_list);
+ }
+ else
+ if (template_list)display_output_filename ( stdout, "Template_List","fasta_seq", template_list, STORE);
+ return S;
+ }
+
+ /*3 Templates are generated with a script*/
+ else if (strstr (template_list, "SCRIPT_") && get_string_variable ("multi_core") && strstr (get_string_variable ("multi_core"), "templates") && get_nproc()>1)
+ {
+ char *tmp1,*command;
+ Alignment *A;
+ char **temp_file,**seq_file;
+ int * pid_list, pid, npid, submited;
+ int nproc, max_nproc;
+ int num=0;
+
+ char outfile[1000];
+ static char *script;
+ static int ntemp;
+ char *p;
+ int z, i;
+ int freeF=0;
+
+ if (!script)script=vcalloc ( 1000, sizeof(char));
+
+ ntemp++;
+
+ command=vcalloc ( 1000, sizeof (char));
+ tmp1=vtmpnam (NULL);
+
+ A=seq2aln (S,NULL, 0);
+ string_array_upper(A->seq_al, A->nseq);
+ output_fasta_seq (tmp1, A);
+ sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_"));
+
+ if ((p=strstr (template_list, "@type#")))
+ p+=strlen ("@type#");
+
+ if (!F){F=parse_fname (S->file[0]);freeF=1;}
+ sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp);
+ while ( check_file_exists (outfile))
+ {
+ sprintf (outfile, "%s%s_%s%d.%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp, ++num);
+ }
+ if (freeF)free_fname(F);
+
+ nproc=get_nproc();
+ //max_nproc=2*nproc;
+ max_nproc=20; //EBI recommended maximum
+ script=substitute(script, "@", " -");
+ script=substitute(script, "#", "=");
+
+ temp_file=vcalloc ( A->nseq, sizeof (char*));
+ seq_file =vcalloc (A->nseq, sizeof (char*));
+ pid_list =vcalloc (MAX_N_PID, sizeof (int *));
+
+ fprintf ( stderr, "\n\t------ Fetch Templates [Multi Core Mode %d CPUs]\n",get_nproc());
+ for (npid=0, submited=0,i=0; i<S->nseq; i++)
+ {
+ FILE *fp2;
+ seq_file[i]=vtmpnam (NULL);
+ temp_file[i]=vtmpnam (NULL);
+ fp2=vfopen (seq_file[i], "w");
+ fprintf ( fp2, ">%s\n%s\n", S->name[i], S->seq[i]);
+ vfclose (fp2);
+
+ pid=vvfork(NULL);
+ if (pid==0)
+ {
+ initiate_vtmpnam (NULL);
+ if ( strstr (script, "tc_generic_method"))
+ {
+ //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script,seq_file[i],temp_file[i],get_tmp_4_tcoffee());
+ sprintf ( command, "%s -infile=%s -outfile=%s -tmpdir=%s",script,seq_file[i],temp_file[i],get_tmp_4_tcoffee());
+ }
+ else
+ //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script,seq_file[i],temp_file[i]);
+ sprintf ( command, "%s -infile=%s -outfile=%s",script,seq_file[i],temp_file[i]);
+ command=substitute(command, "@", " ");
+ //my_system ( command);
+ myexit (my_system(command));
+ }
+ else
+ {
+ pid_list[pid]=npid;
+ //set_pid(pid);
+ npid++;
+ submited++;
+ submited=vwait_npid(submited,max_nproc,nproc);
+ }
+ }
+
+ submited=vwait_npid(submited,0,0);
+ //Concatenate all the files
+ vremove (outfile);
+ for (i=0; i<npid; i++) file_cat (temp_file[i],outfile);
+
+ //Free the process table
+ vfree (temp_file);
+ vfree (pid_list);
+ vfree (seq_file);
+
+ free_aln (A);
+ if ( check_file_exists (outfile) && format_is_fasta(outfile))
+ {
+ S=seq2template_seq (S, outfile, F);
+ }
+ else if (strstr (command, "webblast.pl"))return S;
+ else
+ {
+
+ add_warning (stderr, "Could not Run %s to find templates[%s](Forked mode)\n",command, PROGRAM);
+ return NULL;
+ }
+
+ vfree (command);
+ return S;
+ }
+
+ else if (strstr (template_list, "SCRIPT_"))
+ {
+ char x[299];
+ char *tmp1,*command;
+ Alignment *A;
+ char outfile[1000];
+ static char *script;
+ static int ntemp;
+ char *p;
+ int z;
+ if (!script)script=vcalloc ( 1000, sizeof(char));
+
+ ntemp++;
+
+ command=vcalloc ( 1000, sizeof (char));
+ tmp1=vtmpnam (NULL);
+
+ A=seq2aln (S,NULL, 0);
+ string_array_upper(A->seq_al, A->nseq);
+ output_fasta_seq (tmp1, A);
+ sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_"));
+ fprintf ( stderr, "\n");
+ if ((p=strstr (template_list, "@type#")))
+ p+=strlen ("@type#");
+ if (F)
+ {
+ sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp);
+ }
+ else
+ {
+ F=parse_fname (S->file[0]);
+ sprintf (outfile, "%s%s_%s%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp);
+ free_fname (F);
+ }
+
+ script=substitute(script, "@", " -");
+ script=substitute(script, "#", "=");
+
+ if ( strstr (script, "tc_generic_method"))
+ {
+ sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script, tmp1,outfile,get_tmp_4_tcoffee());
+ }
+ else sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script, tmp1, outfile);
+
+ vremove (outfile);
+ command=substitute(command, "@", " ");
+
+ my_system ( command);
+
+ free_aln (A);
+
+ if ( check_file_exists (outfile) && format_is_fasta(outfile))
+ {
+ S=seq2template_seq (S, outfile, F);
+ }
+ else if (strstr (command, "webblast.pl"))return S;
+ else
+ {
+
+ add_warning (stderr, "Could not Run %s to find templates[%s](unforked mode)\n",command, PROGRAM);
+ return NULL;
+ }
+
+ vfree (command);
+ return S;
+ }
+
+ return S;
+}
+
+char* seq2template_file (Sequence *S, char *file)
+{
+ Alignment *A;
+ int i;
+ if (!S)return file;
+ if (file==NULL)file=vtmpnam (NULL);
+
+ seq2template_file2 (S, file, "w");
+
+ for (i=0; i<S->nseq; i++)
+ {
+ if ( (A=seq2R_template_profile (S, i)))
+ {
+ Sequence *S;
+ S=A->S;
+ if (S)seq2template_file2 (A->S, file, "a");
+ }
+ }
+ return file;
+}
+
+int seq2template_file2 (Sequence *S, char *file, char *mode)
+{
+ FILE *fp;
+ int i;
+ char buf1[10000];
+ char buf2[10000];
+ struct X_template *X;
+
+ fp=vfopen ( file, mode);
+ for ( i=0; i< S-> nseq; i++)
+ {
+ buf1[0]=0;
+ if ( S->T)
+ {
+ if (S->T[i])
+ {
+ if ( (X=(S->T[i])->P)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
+ /*if ( (X=(S->T[i])->S)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}*/
+ if ( (X=(S->T[i])->R)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
+ if ( (X=(S->T[i])->G)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
+ if (buf1[0])fprintf ( fp, ">%s %s\n", S->name[i], buf1);
+ }
+ }
+ }
+ vfclose (fp);
+ return EXIT_SUCCESS;
+}
+
+
+
+
+int seq2n_X_template ( Sequence *S, char *type)
+{
+ int a, n;
+
+ for (n=0,a=0; a< S->nseq; a++)
+ {
+ if ( strm2 (type, "_P_","_*_") && (S->T[a])->P)n++;
+ if ( strm2 (type, "_F_","_*_") && (S->T[a])->F)n++;
+ if ( strm2 (type, "_S_","_*_") && (S->T[a])->S)n++;
+ if ( strm2 (type, "_R_","_*_") && (S->T[a])->R)n++;
+ if ( strm2 (type, "_G_","_*_") && (S->T[a])->G)n++;
+ }
+ return n;
+}
+struct X_template *fill_X_template ( char *name, char *p, char *token)
+{
+ struct X_template *X;
+
+
+
+
+ char *k;
+
+ X=vcalloc (1, sizeof (X_template));
+ sprintf ( X->seq_name, "%s", name);
+ if ( (k=strstr (p, token)))sscanf (k+strlen(token), "%s",X->template_name);
+ else sprintf (X->template_name, "%s", p);
+
+
+ /*Add a Structure HERE*/
+ sprintf ( X->template_type, "%s", token);
+ if ( strm (token, "_P_"))X->VP=vcalloc (1, sizeof (P_template));
+ if ( strm (token, "_F_"))X->VF=vcalloc (1, sizeof (F_template));
+
+ if ( strm (token, "_S_"))X->VS=vcalloc (1, sizeof (S_template));
+ if ( strm (token, "_R_"))X->VR=vcalloc (1, sizeof (R_template));
+ if ( strm (token, "_G_"))X->VG=vcalloc (1, sizeof (G_template));
+ if ( strm (token, "_T_"))X->VT=vcalloc (1, sizeof (T_template));
+ if ( strm (token, "_E_"))X->VE=vcalloc (1, sizeof (E_template));
+ if ( strm (token, "_U_"))X->VU=vcalloc (1, sizeof (U_template));
+
+ return X;
+}
+
+struct X_template* free_X_template ( struct X_template *X)
+{
+ if (X->VP)
+ {
+ vfree (X->VP);
+ }
+ if (X->VF)
+ {
+ vfree (X->VF);
+ }
+ if ( X->VS)
+ {
+ free_sequence ((X->VS)->S, -1);
+ vfree (X->VS);
+ }
+ if ( X->VR)
+ {
+ free_aln ((X->VR)->A);
+ vfree (X->VR);
+ }
+ if ( X->VG)
+ {
+ free_sequence ((X->VG)->S, -1);
+ vfree (X->VG);
+ }
+
+ vfree (X);
+ return NULL;
+}
+
+FILE * display_sequence_templates (Sequence *S,int i, FILE *io)
+{
+
+
+ io=display_X_template ( (S->T[i])->P, io);
+
+ io=display_X_template ( (S->T[i])->F, io);
+
+ io=display_X_template ( (S->T[i])->S, io);
+
+ io=display_X_template ( (S->T[i])->R, io);
+ io=display_X_template ( (S->T[i])->G, io);
+ io=display_X_template ( (S->T[i])->T, io);
+ io=display_X_template ( (S->T[i])->E, io);
+
+ return io;
+}
+
+FILE * display_X_template (struct X_template *X, FILE *io)
+{
+
+ if ( !X) return io;
+ if ( !strm (X->template_type, "_S_"))fprintf (io, "\n\t%s: Template=%s, File=%s",template_type2type_name (X->template_type), X->template_name,X->template_file);
+ return io;
+}
+char *template_type2short_type_name (char *type)
+{
+ //add_template
+ if (!type)return "";
+ else if ( strstr (type, "_P_")) return "pdb";
+ else if ( strstr (type, "_F_")) return "rfold";
+ else if ( strstr (type, "_S_")) return "seq";
+ else if ( strstr (type, "_R_")) return "prf";
+ else if ( strstr (type, "_G_")) return "genome";
+ else if ( strstr (type, "_E_")) return "ssp";
+ else if ( strstr (type, "_T_")) return "tmp";
+ else if ( strstr (type, "_U_")) return "unicode";
+ else return type;
+}
+char *template_type2type_name (char *type)
+{
+ //add_template
+ if ( strstr (type, "_P_")) return "PDB struc";
+ else if ( strstr (type, "_F_")) return "RNA Fold";
+ else if ( strstr (type, "_S_")) return "Sequeence";
+ else if ( strstr (type, "_R_")) return "Profile";
+ else if ( strstr (type, "_G_")) return "Genomic";
+ else if ( strstr (type, "_E_")) return "Protein Secondary Structure";
+ else if ( strstr (type, "_T_")) return "Protein Trans Membrane Structure ";
+ else if ( strstr (type, "_U_")) return "Unicode and strings";
+
+ else return type;
+}
+struct X_template *fill_F_template ( char *name,char *p, Sequence *S)
+{
+ /*Profile template*/
+ struct X_template *F;
+
+ F=fill_X_template ( name, p, "_F_");
+ sprintf (F->template_format , "TCOFFEE_LIBRARY");
+ if (!F || !check_file_exists (F->template_name))
+ {
+ fprintf ( stderr, "Could Not Fill _F_ (Fold) template for sequence |%s|", name);
+ free_X_template (F);
+ return NULL;
+ }
+ else if ( check_file_exists (F->template_name))
+ {
+ sprintf ( F->template_file, "%s", F->template_name);
+ }
+
+ return F;
+
+}
+
+
+struct X_template *fill_P_template ( char *name,char *p, Sequence *S)
+{
+ struct X_template *P;
+ Sequence *PS;
+ Alignment *A;
+ int sim, cov, i;
+ char *buf;
+
+
+ P=fill_X_template ( name, p, "_P_");
+ sprintf (P->template_format , "pdb");
+
+ if (!P ||(check_file_exists (P->template_name) && !is_pdb_file (P->template_name) ))
+ {
+ //fprintf ( stderr, "Could Not Fill _P_ template for sequence |%s|", name);
+ free_X_template (P);
+ return NULL;
+ }
+ else if ( check_file_exists (P->template_name))
+ {
+ sprintf ( P->template_file, "%s", P->template_name);
+ buf=path2filename (P->template_name);
+ if (P->template_name!=buf)
+ {
+ sprintf ( P->template_name, "%s",buf );
+ vfree (buf);
+ }
+ }
+ else
+ {
+ char *st;
+
+ st=is_pdb_struc (P->template_name);
+ if (st)
+ {
+ if (st!=P->template_file)sprintf ( P->template_file, "%s", st);
+ }
+ }
+
+ /*Make a first run to fix relaxed PDB files*/
+ buf=fix_pdb_file (P->template_file);
+
+ if ( buf!=P->template_file)
+ {
+
+ sprintf ( P->template_file, "%s",buf);
+ vfree (buf);
+ }
+
+ /*Check the PDB FILE EXISTS*/
+
+ if (!is_pdb_file (P->template_file))
+ {
+
+ if (p)add_warning(stderr, "_P_ Template | %s | Could Not Be Found\n",p);
+ else if (name)add_warning(stderr, "_P_ Template | %s | Could Not Be Found\n",name);
+ free_X_template (P);
+ return NULL;
+ }
+ else
+ {
+ buf= get_pdb_id (P->template_file);
+ if (buf!=(P->VP)->pdb_id)
+ {
+ sprintf ((P->VP)->pdb_id, "%s", buf);
+ vfree (buf);
+ }
+ }
+
+ /*Check the target sequence is similar enough*/
+
+ PS=get_pdb_sequence (P->template_file);
+
+
+
+ if ( PS==NULL)
+ {
+ add_warning( stderr, "_P_ Template |%s| Could Not be Used for Sequence |%s|: Structure Not Found", P->template_name, name);
+ free_X_template (P);P=NULL;
+ }
+ else
+ {
+ int minsim=get_int_variable ("pdb_min_sim");
+ int mincov=get_int_variable ("pdb_min_cov");
+
+
+ i=name_is_in_list (name, S->name, S->nseq, 100);
+
+ A=align_two_sequences (S->seq[i], PS->seq[0],"idmat",-3,0, "fasta_pair_wise");
+
+ sprintf ( A->name[0], "seq");
+ sprintf ( A->name[1], "pdb");
+ cov=aln2coverage (A, 0);
+ sim=aln2sim (A, "idmat");
+
+ if (sim<=minsim)
+ {
+ add_information( stderr, "_P_ Template %s Could Not be Used for Sequence %s: Similarity too low [%d, Min=%d]",P->template_name,name,sim,minsim);
+ add_information( stderr, "If you want to include %s in anycase,add -pdb_min_sim=%d to the command line",name,sim);
+ print_aln (A);
+ free_X_template (P);
+ P=NULL;
+ }
+ else if ( cov<=mincov)
+ {
+ add_information(stderr, "_P_ Template |%s| Could Not be Used for Sequence |%s|: Coverage too low [%d, Min=%d]",P->template_name,name, cov, mincov);
+ add_information( stderr, "If you want to include this sequence in anycase add -pdb_min_cov=%d to the command line", cov);
+ print_aln (A);
+ free_X_template (P);P=NULL;
+ }
+ free_aln(A);
+ free_sequence (PS, -1);
+ }
+
+ return P;
+}
+
+struct X_template *fill_S_template ( char *name,char *p, Sequence *Seq)
+{
+ struct X_template *S;
+ S=fill_X_template ( name, p, "_S_");
+ if ( strm (name, p))sprintf ( S->template_file, "%s",output_fasta_seqX (NULL,"w",Seq,NULL, seq_name2index (name, Seq)));
+ (S->VS)->S=get_fasta_sequence (S->template_file, NULL);
+ return S;
+}
+struct X_template *fill_R_template ( char *name,char *p, Sequence *S)
+{
+ /*Profile template*/
+ struct X_template *R;
+
+
+ R=fill_X_template ( name, p, "_R_");
+ sprintf (R->template_format , "fasta_aln");
+
+
+ if (!is_aln(R->template_name) && !is_seq (R->template_name))
+ {
+
+ add_information ( stderr, "_R_ Template %s Could Not Be Found\n",R->template_name);
+ free_X_template (R);
+ return NULL;
+ }
+ else
+ {
+ int s;
+ Sequence *S1;
+ Alignment *A1;
+
+ (R->VR)->A=main_read_aln (R->template_name, NULL);
+
+ if ( !S)
+ sprintf ( R->template_file, "%s", R->template_name);
+ else
+ {
+ s=name_is_in_list(name, S->name, S->nseq, 100);
+ if ( s!=-1)
+ {
+ S1=fill_sequence_struc (1, &S->seq[s], &S->name[s]);
+ A1=seq2aln (S1,NULL, RM_GAP);
+
+ (R->VR)->A=trim_aln_with_seq (A1, (R->VR)->A);
+
+ sprintf ( R->template_file, "%s", vtmpnam (NULL));
+ output_clustal_aln (R->template_file, (R->VR)->A);
+ }
+ else
+ sprintf ( R->template_file, "%s", R->template_name);
+ }
+ (R->VR)->A=aln2profile ((R->VR)->A);
+
+ //free_data_in_aln ((R->VR)->A);
+
+ }
+ return R;
+}
+
+struct X_template *fill_T_template ( char *name,char *p, Sequence *S)
+{
+ /*Profile template*/
+ struct X_template *T;
+
+ T=fill_X_template ( name, p, "_T_");
+ sprintf (T->template_format , "fasta_seq");
+
+ if (!is_aln(T->template_name) && !is_seq (T->template_name))
+ {
+
+ add_information ( stderr, "_T_ Template %s Could Not Be Found\n",T->template_name);
+ free_X_template (T);
+ return NULL;
+ }
+ else
+ {
+
+ (T->VT)->S=main_read_seq(T->template_name);
+ sprintf ( T->template_file, "%s", T->template_name);
+ }
+ return T;
+}
+//add template
+struct X_template *fill_U_template ( char *name,char *p, Sequence *S)
+{
+ /*Profile template*/
+ struct X_template *U;
+
+ U=fill_X_template ( name, p, "_U_");
+ sprintf (U->template_format , "string list");
+
+ if (!check_file_exists(U->template_name))
+ {
+ add_information ( stderr, "_U_ Template %s Could Not Be Found\n",U->template_name);
+ free_X_template (U);
+ return NULL;
+ }
+ else
+ {
+ //(U->VU)->list=file2string(U->template_name);
+ sprintf ( U->template_file, "%s", U->template_name);
+ }
+ return U;
+}
+struct X_template *fill_E_template ( char *name,char *p, Sequence *S)
+{
+ /*Profile template*/
+ struct X_template *E;
+
+
+ E=fill_X_template ( name, p, "_E_");
+ sprintf (E->template_format , "fasta_seq");
+
+ if (!is_aln(E->template_name) && !is_seq (E->template_name))
+ {
+
+ add_information ( stderr, "_E_ Template %s Could Not Be Found\n",E->template_name);
+ free_X_template (E);
+ return NULL;
+ }
+ else
+ {
+ (E->VE)->S=main_read_seq (E->template_name);
+ sprintf ( E->template_file, "%s", E->template_name);
+ }
+ return E;
+}
+struct X_template *fill_G_template ( char *name,char *p, Sequence *S)
+{
+ struct X_template *G;
+ G=fill_X_template ( name, p, "_G_");
+ sprintf (G->template_format , "fasta_seq");
+
+ /*1: Get the sequence from another file if needed*/
+ if ( strm (name, p))sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",S,NULL, seq_name2index (name, S)));
+ else if ( strstr (p, "SEQFILE_"))
+ {
+ Sequence *ST;
+ int i2;
+
+
+ ST=main_read_seq (after_strstr ( p,"SEQFILE_G_"));
+
+ i2=seq_name2index (name, ST);
+ if ( i2!=-1)
+ {
+ sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",ST,NULL, i2));
+ sprintf ( G->template_name, "%s", name);
+ }
+ free_sequence (ST, -1);
+ }
+ else sprintf (G->template_file, "%s", G->template_name);
+
+
+ /*2: Put the template in VG->S*/
+ if (!is_seq (G->template_file))
+ {
+ add_information ( stderr, "_G_ Template %s Could Not Be Found \n",p);
+
+ free_X_template (G);
+ return NULL;
+ }
+ else
+ {
+ (G->VG)->S=get_fasta_sequence (G->template_file, NULL);
+ }
+ return G;
+}
+
+
+char *seq2T_value ( Sequence *S, int n, char *value, char *type)
+{
+ static char *rv_buf;
+ X_template *X;
+
+ if ( !rv_buf)rv_buf=vcalloc (100, sizeof(char));
+ if (!(X=seq_has_template (S, n, type)))return NULL;
+ else
+ {
+ if (strm (value, "template_file"))return X->template_file;
+ else if ( strm (value, "template_name"))return X->template_name;
+ else if ( strm (value, "seq_name"))return X->seq_name;
+ else if (strm (type, "_P_"))
+ {
+ if ( strm (value, "pdb_id"))return (X->VP)->pdb_id;
+ }
+ else if ( strm (type, "_R_"))
+ {
+ if ( strm (value, "A"))
+ {
+ if ((X->VR)->A)
+ {sprintf ( rv_buf, "%ld", (long)(X->VR)->A);return rv_buf;}
+ else return NULL;
+ }
+ }
+
+ }
+ return NULL;
+}
+char *seq2P_pdb_id (Sequence *S, int n)
+{
+ if (!S->T || !S->T[n] || !(S->T[n])->P ) return NULL;
+ else return ((S->T[n])->P)->template_name;
+}
+
+
+char *seq2P_template_file(Sequence *S, int n)
+{
+
+ return seq2T_value (S, n, "template_file", "_P_");
+}
+
+char *profile2P_template_file (Sequence *S, int n)
+{
+ Alignment *A;
+ int a;
+ char *p;
+
+ if ( !(A=seq2R_template_profile (S, n)))return NULL;
+ for (a=0; a<A->nseq; a++)
+ {
+ if ((p=seq2P_template_file (A->S, a))!=NULL)return p;
+ }
+ return NULL;
+}
+Alignment * seq2R_template_profile (Sequence *S, int n)
+{
+ X_template *X;
+
+ return (Alignment *)atop(seq2T_value (S, n, "A", "_R_"));
+
+ if (!(X=seq_has_template (S, n, "_R_")))return NULL;
+ else
+ {
+ if (!(X->VR))return NULL;
+ else return (X->VR)->A;
+ }
+ return NULL;
+
+
+
+}
+char * seq2E_template_string (Sequence *S, int n)
+{
+ struct X_template *T;
+
+ if ( (T=seq_has_template (S, n, "_E_"))!=NULL)
+ return ((T->VE)->S)->seq[0];
+ else
+ return NULL;
+}
+//add template
+int* seq2U_template (Sequence *S, int n)
+{
+ struct X_template *T;
+
+ if ( (T=seq_has_template (S, n, "_U_"))!=NULL)
+ return (T->VU)->list;
+ else
+ return NULL;
+}
+char * seq2T_template_string (Sequence *S, int n)
+{
+ struct X_template *T;
+
+ if ( (T=seq_has_template (S, n, "_T_"))!=NULL)
+ return ((T->VT)->S)->seq[0];
+ else
+ return NULL;
+}
+
+struct X_template* seq_has_template ( Sequence *S, int n, char *mode)
+{
+ Template *T;
+
+ if ( !S || !mode) return NULL;
+ else if ( n<0 || n>=S->nseq)return NULL;
+ else if ( !(S->T)) return NULL;
+ else if ( !(S->T[n]))return NULL;
+
+ T=S->T[n];
+ //ADD STRUCTURE
+ //add template
+ if ( strm (mode, "_P_"))return T->P;
+ else if ( strm (mode, "_F_"))return T->F;
+ else if ( strm (mode, "_S_"))return T->S;
+ else if ( strm (mode, "_R_"))return T->R;
+ else if ( strm (mode, "_T_"))return T->T;
+ else if ( strm (mode, "_E_"))return T->E;
+ else if ( strm (mode, "_U_"))return T->U;
+ else if ( strm (mode, "_G_"))return T->G;
+ else return NULL;
+}
+
+char ** name2random_subset (char **in_name, int n_in, int n_out)
+{
+ char **out_name;
+
+ int **list;
+ int a,max;
+
+
+ vsrand (0);
+ max=n_in*10000;
+ out_name=declare_char (n_out,MAXNAMES+1 );
+ list=declare_int (n_in, 2);
+
+ for (a=0; a<n_in; a++)
+ {
+ list[a][0]=a;
+ list[a][1]=rand ()%max;
+ }
+ sort_int ( list,2, 1, 0, n_in-1);
+
+ for ( a=0; a<n_in; a++)
+ {
+ sprintf ( out_name[a], "%s", in_name[list[a][0]]);
+ }
+ free_int (list, -1);
+ return out_name;
+}
+
+Alignment * aln2random_order (Alignment *A)
+{
+
+ char **name_list;
+
+ name_list=name2random_subset (A->name, A->nseq, A->nseq);
+ A=reorder_aln (A, name_list, A->nseq);
+ free_char (name_list, -1);
+ return A;
+}
+Alignment *aln2jacknife (Alignment *A, int nseq, int len)
+{
+ int a, b;
+
+ if (nseq!=0 && nseq<A->nseq)
+ {
+ char **name;
+
+ name=name2random_subset (A->name, A->nseq, nseq);
+ A=reorder_aln (A, name, nseq);
+ free_char (name, -1);
+ }
+
+ if (len!=0 && len<A->len_aln)
+ {
+ int **l;
+ Alignment *B;
+
+ l=declare_int (A->len_aln, 2);
+ for (a=0; a< A->len_aln; a++)
+ {
+ l[a][0]=a;
+ l[a][1]=rand()%(A->len_aln*1000);
+ }
+ sort_int ( l,2, 1, 0, A->len_aln-1);
+ B=copy_aln (A, NULL);
+ for ( a=0; a< len; a++)
+ {
+ for ( b=0; b<A->nseq; b++)
+ {
+ A->seq_al[b][a]=B->seq_al[b][l[a][0]];
+ }
+ }
+ for (b=0; b<A->nseq; b++)A->seq_al[b][len]='\0';
+ free_aln (B);
+ free_int (l, -1);
+ }
+ return A;
+}
+Alignment * aln2scramble_seq (Alignment *A)
+{
+ int **list;
+ char **name_list;
+ int a,max;
+
+ max=100*A->nseq;
+ vsrand (0);
+
+ list=declare_int (A->nseq, 2);
+ name_list=vcalloc (A->nseq, sizeof (char*));
+
+
+ for (a=0; a<A->nseq; a++)
+ {
+ list[a][0]=a;
+ list[a][1]=rand ()%max;
+ }
+ sort_int ( list,2, 1, 0, A->nseq-1);
+
+ for ( a=0; a< A->nseq; a++)
+ name_list[a]=A->seq_al[a];
+ for (a=0; a<A->nseq; a++)
+ {
+ A->seq_al[a]=name_list[list[a][0]];
+ }
+ vfree (name_list);
+ free_int (list, -1);
+ return aln2random_order (A);
+}
+
+
+
+Alignment * reorder_aln ( Alignment *A, char **name, int nseq)
+ {
+ int a,sn;
+ Alignment *BUF;
+ int n=0;
+ int *tpp_int;
+
+ if ( name==NULL)return aln2random_order(A);
+
+
+ BUF=copy_aln ( A,NULL);
+ for ( a=0; a<nseq; a++)
+ {
+ sn =name_is_in_list ( name[a],BUF->name, A->nseq,STRING);
+ if ( sn==-1)
+ {
+ ;
+ }
+ else
+ {
+
+
+ SWAPP(A->order[n], BUF->order[sn], tpp_int);
+ sprintf ( A->name[n], "%s", BUF->name[sn]);
+ sprintf ( A->seq_al[n], "%s",BUF->seq_al[sn]);
+ sprintf ( A->seq_comment[n], "%s", BUF->seq_comment[sn]);
+
+ n++;
+
+ }
+ }
+
+ for ( a=n; a< A->nseq; a++)A->name[a][0]=A->seq_al[a][0]='\0';
+ A->nseq=n;
+
+ if ( A->A)A->A=reorder_aln(A->A, name, nseq);
+ free_aln (BUF);
+ return A;
+ }
+Sequence * reorder_seq_2 ( Sequence *A, int **order,int field, int nseq)
+ {
+ char **name;
+ int a;
+
+ if (!A || !order) return A;
+ name=declare_char (A->nseq, 100);
+ for (a=0; a<nseq; a++)
+ sprintf ( name[a], "%s", A->name[order[a][field]]);
+ A=reorder_seq (A, name,nseq);
+ free_char (name, -1);
+ return A;
+ }
+Sequence * reorder_seq ( Sequence *A, char **name, int nseq)
+ {
+ int a,sn;
+ Sequence *nA;
+
+
+ nA=duplicate_sequence (A);
+
+
+ for ( a=0; a< nseq; a++)
+ {
+ sn=name_is_in_list (name[a] ,nA->name, nA->nseq, 100);
+ if (sn==-1)continue;
+
+ if ( nA->file) sprintf ( A->file[a], "%s", nA->file[sn]);
+
+ if ( nA->seq_comment)sprintf ( A->seq_comment[a], "%s", nA->seq_comment[sn]);
+ if ( nA->aln_comment)sprintf ( A->aln_comment[a], "%s", nA->aln_comment[sn]);
+ sprintf ( A->seq[a], "%s", nA->seq[sn]);
+ A->len[a]=nA->len[sn];
+ sprintf ( A->name[a], "%s", nA->name[sn]);
+ A->T[a][0]=nA->T[sn][0];
+ }
+ A->nseq=nseq;
+ free_sequence (nA, nA->nseq);
+
+ return A;
+}
+
+char * concatenate_seq ( Sequence *S, char *conc, int *order)
+ {
+ int a;
+
+ vfree (conc);
+ conc=vcalloc ( S->nseq*S->max_len, sizeof (char));
+
+ for ( a=0; a< S->nseq; a++)
+ {
+ conc=strcat ( conc, S->seq[order[a]]);
+ }
+ return conc;
+
+ }
+
+
+
+
+Alignment * rotate_aln ( Alignment *A, char *name)
+{
+ Alignment *B;
+ int a, b;
+
+ B=declare_aln2 (A->len_aln, A->nseq+1);
+ for ( a=0; a< A->nseq; a++)
+ for ( b=0; b< A->len_aln; b++)
+ {
+ B->seq_al[b][a]=A->seq_al[a][b];
+ }
+ for (a=0; a< A->len_aln; a++)
+ if (name && name[0])sprintf ( B->name[a], "%s_%s%d", name, (a<9)?"0":"",a+1);
+ else
+ sprintf ( B->name[a], "%d", a+1);
+
+
+ for (a=0; a< A->len_aln; a++)B->seq_al[a][A->nseq]='\0';
+ B->len_aln=A->nseq;
+ B->nseq=A->len_aln;
+ /*free_aln (A);*/
+ return B;
+}
+
+Alignment * invert_aln ( Alignment *A)
+{
+ char *buf;
+ int l, a, b, c;
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ l=strlen ( A->seq_al[a]);
+ buf=vcalloc ( l+1,sizeof (char) );
+
+ for ( c=l-1,b=0; b< l; b++, c--)
+ {
+ buf[c]=A->seq_al[a][b];
+ }
+ buf[l]='\0';
+ sprintf ( A->seq_al[a], "%s", buf);
+ }
+ vfree(buf);
+ return A;
+}
+char * complement_string (char *s)
+{
+ char *buf;
+ int l, a, b, c;
+
+ l=strlen (s);
+ for ( b=0; b< l; b++)
+ {
+ char r;
+ r=s[b];
+ if ( r=='a')r='t';
+ else if (r=='A')r='T';
+ else if (r=='t')r='a';
+ else if (r=='T')r='A';
+ else if (r=='g')r='c';
+ else if (r=='G')r='C';
+ else if (r=='c')r='g';
+ else if (r=='C')r='G';
+ s[b]=r;
+ }
+
+ return invert_string (s);
+}
+Alignment * complement_aln ( Alignment *A)
+{
+ char *buf;
+ int l, a, b, c;
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ A->seq_al[a]=complement_string (A->seq_al[a]);
+ }
+
+ return A;
+}
+
+Alignment * extract_nol_local_aln(Alignment *A, int start, int max_end)
+ {
+ A=extract_aln ( A, start, max_end);
+ A=trunkate_local_aln (A);
+ return A;
+ }
+
+Alignment * alnpos_list2block (Alignment *A, int n, char **in_list)
+{
+ int *pos;
+ int a;
+ char **list;
+ int list_declared=0;
+ Alignment *B;
+
+ if (check_file_exists (in_list[0]))
+ {
+ int mn;
+ char ***tmp_list;
+
+ mn=count_n_line_in_file (in_list[0]);
+ list=declare_char (mn, 100);
+ list_declared=1;
+ tmp_list=file2list (in_list[0], " ");
+ a=0;
+ n=0;
+ while (tmp_list[a])
+ {
+ if (tmp_list[a][1][0]!='!')
+ {
+ sprintf (list[n++], "%s", tmp_list[a][1]);
+ }
+ a++;
+ }
+ free_arrayN ((void **)tmp_list, 3);
+ }
+ else
+ {
+ list=in_list;
+ }
+
+
+ pos=vcalloc (A->len_aln, sizeof (int));
+ for (a=0; a<n; a++)
+ {
+
+ if (strstr (list[a], "-"))
+ {
+ int start, end, x;
+ x=sscanf (list[a], "%d-%d", &start, &end);
+ if (x!=2 || !A || start<=0 || start>=end || end>A->len_aln+1)
+ {
+ add_warning ( stderr, "Illegal coordinates in extract_pos_list [%s]", list[a]);
+ return A;
+ }
+ start--; end--;
+ for (a=start; a<end; a++)pos[a]=1;
+ }
+ else
+ {
+ int p;
+ p=atoi (list[a]);
+ if (p<1 || p>A->len_aln)
+ {
+ add_warning ( stderr, "Illegal coordinates in extract_pos_list [%s]", list[a]);
+ }
+ p--;
+ pos[p]=1;
+ }
+ }
+ B=alnpos2block(A, pos, NULL);
+ vfree (pos);
+ if ( list_declared)free_char (list, -1);
+
+ return B;
+}
+Alignment * aln2block (Alignment *A, int start, int end, Alignment *B)
+{
+ if ( !A || start<=0 || start>=end || end>A->len_aln+1)
+ {
+ add_warning ( stderr, "Illegal coordinates in extract_block start=%d end=%d len=%d [Note : [start-end[, with [1...n] ** Block Ingored", start, end, A->len_aln);
+ return A;
+ }
+ else
+ {
+ int *pos, p;
+ start--;
+ end--;
+ pos=vcalloc (A->len_aln, sizeof (int));
+ for (p=start;p<end;p++)
+ {
+ pos[p]=1;
+ }
+ B=alnpos2block (A, pos, B);
+ vfree (pos);
+ return B;
+ }
+}
+Alignment * alnpos2block (Alignment *A, int *pos, Alignment *B)
+{
+
+ //extract a subset of B without over-writing A
+ int a, b;
+
+ B=copy_aln (A, B);
+ B->len_aln=0;
+ for (a=0; a<=A->len_aln; a++)
+ {
+ if ( pos[a]!=0 || a==A->len_aln)
+ {
+ for ( b=0; b<A->nseq; b++)
+ B->seq_al[b][B->len_aln]=A->seq_al[b][a];
+ if ( a!=A->len_aln)B->len_aln++;
+ }
+ }
+
+ return B;
+}
+Alignment * extract_aln ( Alignment *A, int start, int end)
+{
+ return extract_aln2 ( A, start, end, "cons");
+}
+
+Alignment * extract_aln2 ( Alignment *A, int in_start, int in_end, char *seq)
+ {
+ char *tmp;
+ FILE *fp;
+
+
+ tmp=vtmpnam (NULL);
+ fp=vfopen (tmp, "w");
+ fprintf ( fp, "%s %d %d\n", seq, in_start, in_end);
+ vfclose (fp);
+ return extract_aln3 (A,tmp);
+ }
+Alignment * extract_aln3 ( Alignment *B, char *file)
+ {
+ int a, b, c;
+ int start, end;
+ int n, i, s, nline=0;
+ FILE *fp;
+ Alignment *A=NULL;
+ int *col;
+ char name[MAXNAMES];
+ char line[VERY_LONG_STRING];
+ int *offset;
+
+ /*Reads in a file
+ #comment
+ ! seq_name offset
+ seqname pos
+ OR
+ seqname start end[
+ modifies the incoming alignment
+ */
+
+ offset=vcalloc ( B->nseq+1, sizeof (int));
+ fp=vfopen (file,"r");
+ while ( (c=fgetc(fp))!=EOF)
+ {
+ s=-1;
+ fgets ( line, VERY_LONG_STRING,fp);
+ if ( c=='!')
+ {
+ sscanf (line, "%s %d", name, &start);
+ s=name_is_in_list (name,B->name,B->nseq,MAXNAMES);
+ }
+ if (s!=-1)
+ offset[s]=start;
+ }
+
+ vfclose (fp);
+
+ A=copy_aln (B, A);
+ col=vcalloc ( A->len_aln, sizeof (int));
+
+ fp=vfopen ( file, "r");
+ while ( (c=fgetc(fp))!=EOF)
+ {
+ nline++;
+ if ( c=='#' || c=='!')fgets ( line, VERY_LONG_STRING,fp);
+ else
+ {
+ ungetc(c, fp);
+ fgets ( line, VERY_LONG_STRING,fp);
+
+ if (sscanf (line, "%s %d %d", name, &start, &end)==3);
+ else if (sscanf (line, "%s %d", name, &start)==2)
+ {
+ end=start+1;
+ }
+ else
+ {
+ add_warning ( stderr, "Wrong format in coordinate file (line=%d) ** Line Ignored", nline);
+ continue;
+ }
+ if ( end==0)end=A->len_aln+1;
+
+ s=name_is_in_list (name,A->name,A->nseq,MAXNAMES);
+
+
+ if ( s==-1 && !strm (name, "cons"))
+ {
+ add_warning ( stderr, "Seq %s does not belong to the alignment (line %d) ** Line ignored", name,nline);
+ continue;
+ }
+ else if ( start>end)
+ {
+ add_warning ( stderr, "Illegal coordinates [%s %d %d] (line %d) ** Line ignored", name,start, end,nline);
+ continue;
+ }
+ else
+ {
+ int done=0;
+ if ( s!=-1)
+ {
+ start-=offset[s]-1;
+ end-=offset[s]-1;
+ }
+ for (n=0, a=0; done!=1 && a< A->len_aln; a++)
+ {
+ i=(strm (name, "cons"))?1:!is_gap(A->seq_al[s][a]);
+
+ n+=i;
+ if (n>=start && n<end)
+ {
+ col[a]=1;
+ }
+ if (n>=end)done=1;
+ //if (n>=start && n<end && !(i==0 && n==end-1))
+ //{
+ // col[a]=1;
+ //}
+ //else if ( n>=end)a=A->len_aln;
+ }
+ if ( done==0)
+ {
+ HERE ("Warning Missing positions in File %s",file );
+ }
+ }
+ }
+ }
+ vfclose ( fp);
+
+
+
+ /*Extract [start-end[*/
+ for ( b=0,a=0; a< A->len_aln; a++)
+ {
+ if ( col[a])
+ {
+ for (c=0; c< A->nseq; c++)A->seq_al[c][b]=A->seq_al[c][a];
+ b++;
+ }
+ }
+ A->len_aln=b;
+
+ for (c=0; c< A->nseq; c++)A->seq_al[c][A->len_aln]='\0';
+ vfree (col);
+
+ return A;
+
+ }
+Alignment * trunkate_local_aln ( Alignment *A)
+ {
+ int a, b;
+ int **pos;
+ int **cache;
+ int seq;
+
+
+ cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),1)+A->len_aln+1);
+ pos=aln2pos_simple(A,A->nseq);
+
+ for ( b=0; b<A->len_aln; b++)
+ for ( a=0; a< A->nseq; a++)
+ {
+ seq=A->order[a][0];
+ if ( pos[a][b]<=0);
+ else if ( pos[a][b]>0)
+ {
+
+ if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++;
+ else if ( cache[seq][pos[a][b]]>=1)
+ {
+ cache[seq][pos[a][b]]++;
+ A->seq_al[a][b]='\0';
+ }
+ }
+ }
+
+ A->len_aln=get_shortest_string ( A->seq_al, A->nseq, NULL, NULL);
+ pad_string_array ( A->seq_al, A->nseq, A->len_aln, '-');
+
+ free_int (pos, -1);
+ free_int ( cache,-1);
+
+
+ return A;
+ }
+
+int get_nol_aln_border ( Alignment *A, int start, int direction)
+ {
+ int a, b;
+ int **pos;
+ int **cache;
+ int seq,end;
+
+ /*This Function Returns the limit position for a non overlaping alignment*/
+
+ cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int)),1)+A->len_aln+1);
+ pos=aln2pos_simple(A,A->nseq);
+ end=(direction==GO_RIGHT)?A->len_aln:-1;
+
+
+ for ( b=start; b!=end;b+=direction)
+ for ( a=0; a< A->nseq; a++)
+ {
+ seq=A->order[a][0];
+ if ( pos[a][b]<=0);
+ else if ( pos[a][b]>0)
+ {
+
+ if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++;
+ else if ( cache[seq][pos[a][b]]>=1)
+ {
+ cache[seq][pos[a][b]]++;
+ free_int(cache, -1);
+ return b-direction;
+ }
+ }
+ }
+
+ free_int ( cache,-1);
+ free_int (pos, -1);
+ return end-direction;
+ }
+
+
+
+
+
+char * extract_defined_seq ( char *in, int in_of, int in_start, int *aa_def, int dir, int *out_start, char *out)
+ {
+ int start=0, end,l;
+ int b, c, d;
+
+
+
+ if ( dir==GO_LEFT){start=in_start-1;}
+ else if ( dir==GO_RIGHT){start=in_start+1;}
+
+ end=start;
+ while (aa_def[end]!=UNDEFINED)
+ {
+ end+=dir;
+ }
+ end-=dir;
+
+ if (end<start)SWAP(end,start);
+
+ l=strlen ( in);
+ out_start[0]=-1;
+ for (b=0,d=0,c=in_of;b<l; b++)
+ {
+ c+=1-is_gap(in[b]);
+ if ( c>=start && c<=end)
+ {
+ if ( out_start[0]==-1)out_start[0]=c-!is_gap(in[b]);
+ out[d++]=in[b];
+ }
+ }
+ out[d]='\0';
+
+
+ return out;
+ }
+
+Alignment * aln2N_replicate (Alignment *A,char *nn, char *name)
+{
+ int a, n;
+ char *fname;
+
+ fname=vcalloc (100, sizeof (char));
+ if (nn)n=atoi(nn);
+ else n=100;
+ if (!name){name=vcalloc (100, sizeof (char)); sprintf (name, "replicate");}
+
+
+ for (a=0; a< n;a++)
+ {
+ FILE *fp;
+ sprintf (fname, "%s.%d.rep",name, a+1);
+ fp=vfopen (fname, "w");
+
+ vfclose(aln2replicate (A, fp));
+ fprintf ( stdout, ">%s Alignment Replicate #%d\n",fname, a+1);
+ }
+ myexit (EXIT_SUCCESS);
+}
+FILE *aln2replicate (Alignment *A, FILE *fp)
+{
+ int a, b;
+ int *p;
+ float tot=0;
+ float corr;
+ if (A->col_weight)for (a=0; a<A->len_aln; a++)tot+=A->col_weight[a];
+ else tot=A->len_aln;
+
+ p=vcalloc (A->len_aln, sizeof (int));
+ corr=(float)A->len_aln/tot;
+
+ for (a=0; a<A->len_aln; a++)
+ {
+ int x;
+ x=rand()%(int)tot;
+
+ p[a]=(int)(x*corr);
+ }
+
+ for (a=0; a<A->nseq; a++)
+ {
+ fprintf ( fp, ">%s\n", A->name[a]);
+ //for (b=0;b<A->len_aln; b++)fprintf ( stdout, "%d ", (int)p[b]);
+ for (b=0;b<A->len_aln; b++)fprintf ( fp, "%c", A->seq_al[a][p[b]]);
+ fprintf ( fp, "\n");
+ }
+
+ vfree (p);
+ return fp;
+}
+
+Alignment * orthologous_concatenate_aln (Alignment *A, Sequence *S, char *mode)
+{
+ Alignment *C;
+ char **name, *cname;
+ int nname=0;
+ int a, b,c, i;
+
+ if (mode && strm (mode, "voronoi"))seq_weight2species_weight (A, S);
+
+
+ cname=vcalloc ( 100, sizeof (char));
+ name=declare_char (A->nseq, 100);
+ for (a=0; a<A->nseq; a++)
+ {
+ char *p=strstr (A->name[a], "_");
+ if (!p)
+ {
+ fprintf ( stderr, "\nWARNING: Seq %s could not be included.", A->name[a]);
+ }
+ p+=1;
+ if ( name_is_in_list (p, name,nname, 100)==-1)
+ {
+ sprintf ( name[nname++], "%s", p);
+ }
+ }
+
+ C=declare_aln2 (nname, (A->len_aln*S->nseq)+1);
+ free_char (C->name,-1); C->name=name;
+ C->nseq=nname;
+ C->col_weight=vcalloc ( A->len_aln*S->nseq, sizeof(float));
+
+ C->len_aln=0;
+ for (a=0; a<S->nseq; a++)
+ {
+ for (b=0; b<C->nseq; b++)
+ {
+ sprintf (cname, "%s_%s", S->name[a],C->name[b]);
+ if ((i=name_is_in_list (cname, A->name, A->nseq, 100))==-1)
+ {
+ char *s=generate_null (A->len_aln);
+ strcat (C->seq_al[b], s);
+ vfree (s);
+ }
+ else
+ strcat (C->seq_al[b], A->seq_al[i]);
+ }
+ for (c=C->len_aln, b=0;b<A->len_aln;b++, c++)
+ {
+ C->col_weight[c]=(S->W)->SEQ_W[a];
+ }
+ C->len_aln+=A->len_aln;
+ }
+ return C;
+}
+
+
+Alignment * concatenate_aln ( Alignment *A1, Alignment *A2, char *spacer)
+{
+ Alignment *A;
+ int a, i;
+
+ A=declare_aln2( A1->nseq+A2->nseq , A1->len_aln+A2->len_aln+1);
+ for ( a=0; a< A1->nseq; a++)
+ {
+ if ((i=name_is_in_list ( A1->name[a], A2->name, A2->nseq, 100))!=-1)
+ {
+ sprintf ( A->name[A->nseq], "%s", A1->name[a]);
+ sprintf (A->seq_al[A->nseq], "%s%s%s", A1->seq_al[a],(spacer)?spacer:"", A2->seq_al[i]);
+ A->nseq++;
+ }
+ else
+ {
+ char *buf;
+ buf=generate_string (A2->len_aln, '-');
+ sprintf ( A->name[A->nseq], "%s", A1->name[a]);
+ sprintf (A->seq_al[A->nseq], "%s%s", A1->seq_al[a], buf);
+ A->nseq++;
+ vfree (buf);
+ }
+ }
+ for ( a=0; a< A2->nseq; a++)
+ {
+ if ((i=name_is_in_list ( A2->name[a], A1->name, A1->nseq, 100))==-1)
+ {
+ char *buf;
+ buf=generate_string (A1->len_aln, '-');
+ sprintf ( A->name[A->nseq], "%s", A2->name[a]);
+ sprintf (A->seq_al[A->nseq], "%s%s", buf, A2->seq_al[a]);
+ A->nseq++;
+ vfree (buf);
+ }
+ }
+ A->len_aln=A1->len_aln+A2->len_aln;
+ return A;
+}
+Alignment * aln_cat ( Alignment *A, Alignment *B)
+ {
+ int a;
+
+ if ( A->nseq!=B->nseq)
+ {
+ fprintf ( stderr, "\nERROR IN ALN CAT: DIFFERENT NSEQ\n");
+ myexit(EXIT_FAILURE);
+ }
+
+ A=realloc_alignment2(A, A->nseq,A->len_aln+B->len_aln+1);
+
+ for ( a=0;a< A->nseq; a++)
+ {
+ strcat ( A->seq_al[a], B->seq_al[a]);
+ }
+ A->len_aln+=B->len_aln;
+ return A;
+ }
+int verify_aln ( Alignment *A, Sequence *S, char *message)
+ {
+ int a, b, c,s,r;
+
+
+ for ( a=0;a< A->nseq; a++)
+ {
+ s=A->order[a][0];
+ r=A->order[a][1];
+ for ( b=0, c=0; b< A->len_aln; b++)
+ {
+ if ( !is_gap(A->seq_al[a][b]))
+ {
+ if (tolower(A->seq_al[a][b])!=tolower(S->seq[s][c+r]))
+ {
+ fprintf ( stderr, "\n%s\nResidue [%c %d, %c %d] line %d seq %d",message,A->seq_al[a][b], b,S->seq[s][c+r], c+r,a,s);
+ output_Alignment_with_res_number(A, stderr);
+ myexit(EXIT_FAILURE);
+ return 0;
+ }
+ c++;
+ }
+ }
+ }
+ return 1;
+ }
+
+Alignment *adjust_est_aln ( Alignment *PW, Alignment *M, int s)
+{
+ /*This function reajusts M, threading M onto PW
+ two seqences in PW
+ s+1 seq in M
+
+ seq 0 PW ----> 0->s-1 in M
+ seq 1 PW ----> 1->s in M;
+
+ */
+ int a, b;
+ static char **array;
+
+
+ int top_M=0;
+ int bottom_M=0;
+
+
+ if ( array==NULL)
+ {
+ array=declare_char (500, 100000);
+ }
+
+ for ( a=0; a< PW->len_aln; a++)
+ {
+ if ( is_gap(PW->seq_al[0][a]))
+ {
+ for ( b=0; b< s; b++)
+ array[b][a]='-';
+ }
+ else
+ {
+ for ( b=0; b< s; b++)
+ array[b][a]=M->seq_al[b][top_M];
+ top_M++;
+ }
+
+ if ( is_gap(PW->seq_al[1][a]))
+ {
+ array[s][a]='-';
+ }
+ else
+ {
+
+ array[s][a]=M->seq_al[s][bottom_M];
+ bottom_M++;
+ }
+ }
+
+ M->len_aln=PW->len_aln;
+ for (a=0; a<s; a++)
+ {
+ for (b=0; b<PW->len_aln; b++)
+ M->seq_al[a][b]=array[a][b];
+ M->seq_al[a][b]='\0';
+ }
+
+
+ M->nseq=s+1;
+
+ return M;
+}
+
+
+Alignment * rename_seq_in_aln (Alignment *A, char ***list)
+{
+ int n, i;
+ if ( !A)return A;
+
+
+
+ n=0;
+ while ( list[n][0][0])
+ {
+ if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1)
+ {
+ sprintf ( A->name[i], "%s", list[n][1]);
+ }
+ n++;
+ }
+
+ A->S=rename_seq_in_seq (A->S, list);
+ return A;
+}
+Sequence * rename_seq_in_seq (Sequence *A, char ***list)
+{
+ int n, i;
+ if ( !A || !list)return A;
+
+ n=0;
+ while ( list[n][0][0])
+ {
+ if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1)
+ {
+ sprintf ( A->name[i], "%s", list[n][1]);
+ }
+ n++;
+ }
+ return A;
+}
+/********************************************************************/
+/* */
+/* FLOAT SIMILARITIES */
+/* */
+/* */
+/* */
+/********************************************************************/
+float get_seq_fsim ( char *string1, char *string2, char *ignore, char *similarity_set,int **matrix, int MODE )
+ {
+ int len, a, r1, r2, nr1=0, nr2=0;
+ float pos=0, sim=0;
+
+
+ len=MIN((strlen (string1)),(strlen (string2)));
+ if ( len==0)return 0;
+
+ for ( a=0; a< len; a++)
+ {
+
+ r1=string1[a];
+ r2=string2[a];
+ nr1+=!is_gap(r1);
+ nr2+=!is_gap(r2);
+
+ if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
+ {
+ pos++;
+ if ( matrix)sim+=matrix[r1-'A'][r2-'A'];
+ else if (is_in_same_group_aa(r1,r2,0, NULL,similarity_set))
+ {
+ sim++;
+ }
+ }
+ }
+ if ( MODE==UNGAPED_POSITIONS)return ( sim*100)/pos;
+ else if ( MODE==ALIGNED_POSITIONS)return (sim*100)/len;
+ else if ( MODE==AVERAGE_POSITIONS)return (sim*200)/(nr1+nr2);
+ else
+ {
+ return 0;
+ }
+
+ }
+float get_seq_fsim2 ( char *string1, char *string2, char *ignore, char *in_mode)
+ {
+ int len1;
+ int a;
+ int p1, p2;
+ int r1=0,r2=0;
+ char *p;
+ char mode[1000];
+ float r=0, pos1, pos2, pos0, gap, sim;
+
+
+ sprintf ( mode, "%s", in_mode);
+
+ /*mode: <mat>__<sim_mode>
+ mat: idscore to get the alignment done
+ any legal cw matrix
+ sim_mode: sim1->identities/matches
+ sim2->identities/min len
+ */
+
+
+ if ( (p=strstr (mode, "_"))!=NULL)
+ {
+ p[0]='\0';
+ p++;
+ }
+
+
+ if (strstr (mode, "idscore"))
+ {
+ static int **mat;
+ if (!mat) mat=read_matrice ("blosum62mt");
+ return idscore_pairseq (string1, string2, -12, -1, mat,mode);
+
+ }
+
+ len1=strlen (string1);
+ for ( sim=pos1=pos2=pos0=gap=0,a=0; a< len1; a++)
+ {
+ r1=string1[a];
+ r2=string2[a];
+ p1=1-is_in_set (r1, ignore);
+ p2=1-is_in_set (r2, ignore);
+ pos1+=p1; pos2+=p2;
+ if (p1 && p2)
+ {
+ pos0++;
+ if (is_in_same_group_aa(r1,r2,0, NULL, mode))
+ {
+ sim++;
+ }
+ }
+ else if (p1+p2==1)
+ {
+ gap++;
+ }
+ }
+
+ if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
+ {
+ r=(pos0==0)?0:(sim*MAXID)/pos0;
+ }
+ else if ( strm (p, "sim2"))
+ {
+ r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
+ }
+ else if ( strm (p, "sim3"))
+ {
+ r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
+ }
+ else if ( strm (p, "gap1"))
+ {
+ r=(len1==0)?MAXID:(gap*MAXID)/len1;
+ r=MAXID-r;
+ }
+ else if ( strm (p, "logid"))
+ {
+ r=logid_score (pos0, sim);
+ }
+
+ return r;
+
+ }
+
+/********************************************************************/
+/* */
+/* ALIGNMENT ANALYSES */
+/* */
+/* */
+/* */
+/********************************************************************/
+int **dist_array2sim_array ( int **p, int max)
+{
+ int s1, s2, a, b;
+ s1=read_array_size ((void *)p, sizeof (void *));
+ s2=read_array_size ((void*)p[0],sizeof (int));
+ /* s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 BITS*/
+ for ( a=0; a< s1; a++)
+ for ( b=0; b< s2; b++)
+ {
+ p[a][b]=max-p[a][b];
+ }
+ return p;
+}
+
+int **sim_array2dist_array ( int **p, int max)
+{
+ int s1, s2, a, b;
+ s1=read_array_size ((void *)p, sizeof (void *));
+ s2=read_array_size ((void*)p[0],sizeof (int));
+
+ /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/
+ for ( a=0; a< s1; a++)
+ for ( b=0; b< s2; b++)
+ {
+ p[a][b]=max-(int)p[a][b];
+ }
+ return p;
+}
+
+int **normalize_array (int **p, int max, int norm)
+{
+int s1, s2, a, b;
+ s1=read_array_size ((void *)p, sizeof (void *));
+ s2=read_array_size ((void*)p[0],sizeof (int));
+
+ /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/
+ for ( a=0; a< s1; a++)
+ for ( b=0; b< s2; b++)
+ {
+ p[a][b]=(p[a][b]*norm)/max;
+ }
+ return p;
+}
+
+int aln2most_similar_sequence ( Alignment *A, char *mode)
+{
+ int **w;
+ int a, b;
+ int avg, best_avg=0, best_seq=0;
+ char *buf;
+ int coverage;
+
+
+ if ( !A) return -1;
+ else if ( A->nseq==1)return 0;
+ else
+ {
+ buf=vcalloc ( A->len_aln+1, sizeof (char));
+ w=get_sim_aln_array ( A, mode);
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ sprintf ( buf, "%s", A->seq_al[a]);
+ ungap(buf);
+ coverage=(strlen(buf)*MAXID)/A->len_aln;
+
+ for ( avg=0,b=0; b< A->nseq; b++)avg+=w[a][b]*coverage;
+ if ( avg>best_avg){best_avg=avg; best_seq=a;}
+ }
+ free_int (w, -1);
+ vfree (buf);
+ return best_seq;
+ }
+
+}
+
+int aln2coverage ( Alignment *A, int ref_seq)
+{
+ int a,b;
+ int cov_pos=0, npos=0;
+
+
+ for ( a=0; a< A->len_aln; a++)
+ {
+ if ( !is_gap ( A->seq_al[ref_seq][a]))
+ {
+ npos++;
+ for ( b=0; b< A->nseq; b++)
+ {
+ if ( b!=ref_seq && !is_gap ( A->seq_al[b][a])){cov_pos++;break;}
+ }
+ }
+ }
+
+ return (int) (npos==0)?0:(( MAXID*cov_pos)/npos);
+}
+
+
+int sub_aln2sim ( Alignment *A, int *ns, int **ls, char *mode)
+{
+ int a, b, n;
+ float avg;
+
+ n=0; avg=0;
+ if (!A || (ns==NULL && A->nseq<2))return -1;
+ else if (ns==NULL)
+ {
+ for (a=0; a< A->nseq-1; a++)
+ for ( b=a+1; b< A->nseq;b++, n++)
+ avg+=generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode);
+ }
+ else
+ {
+ for (a=0; a<ns[0]; a++)
+ for (b=0; b< ns[1]; b++, n++)
+ {
+ avg+=generic_get_seq_sim (A->seq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode);
+ }
+ }
+ return (int)(n==0)?0:((float)avg/(float)n);
+}
+int sub_aln2max_sim ( Alignment *A, int *ns, int **ls, char *mode)
+{
+ int a, b, n;
+ float avg;
+
+ n=0; avg=0;
+ if (!A || (ns==NULL && A->nseq<2))return -1;
+ else if (ns==NULL)
+ {
+ for (a=0; a< A->nseq-1; a++)
+ for ( b=a+1; b< A->nseq;b++, n++)
+ avg=MAX(avg,generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode));
+ }
+ else
+ {
+ for (a=0; a<ns[0]; a++)
+ for (b=0; b< ns[1]; b++, n++)
+ {
+ avg=MAX(avg,generic_get_seq_sim (A->seq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode));
+ }
+ }
+ return avg;
+}
+
+
+double aln2entropy (Alignment *A, int *in_ls, int in_ns, float gap_threshold)
+{
+ int ns, a, s, col, r,ncol;
+ int *ls;
+ double *count;
+ double entropy=0;
+ float ng;
+
+ ls=vcalloc ( A->nseq, sizeof (int));
+ count=vcalloc ( 26, sizeof (double));
+
+
+ if ( in_ls)
+ {
+ ns=in_ns;
+ for ( a=0; a< ns; a++)ls[a]=in_ls[a];
+ }
+ else
+ {
+ ns=A->nseq;
+ for ( a=0; a< ns; a++)ls[a]=a;
+ }
+
+ if ( ns==0)
+ {
+ vfree(ls);vfree(count);return 0;
+ }
+ for (ncol=0,col=0; col<A->len_aln; col++)
+ {
+ for (ng=0,a=0; a< ns; a++)
+ {
+ s=ls[a];
+ ng+=is_gap(A->seq_al[s][col]);
+ }
+ ng/=ns;
+ if ( ng>gap_threshold)continue;
+
+ ncol++;
+
+ for ( a=0; a<ns; a++)
+ {
+ s=ls[a];
+ r=tolower(A->seq_al[s][col]);
+ if (!is_gap(r))count[r-'a']++;
+ }
+ for (a=0; a<26; a++)
+ {
+ if ( count[a]==0);
+ else
+ {
+ count[a]/=(double)ns;
+
+ entropy+=count[a]*log(count[a]);
+ count[a]=0;
+ }
+ }
+ }
+ entropy/=-ncol;
+ vfree (ls); vfree(count);
+
+ return entropy;
+}
+int aln2sim ( Alignment *A, char *mode)
+{
+ return sub_aln2sim ( A, NULL, NULL, mode);
+ /*
+ if ( !A || A->nseq<2) return -1;
+ w=get_sim_aln_array ( A, mode);
+
+ for (c=0, a=0; a< A->nseq-1; a++)
+ for ( b=a+1; b< A->nseq; b++, c++)
+ {
+ avg+=(float)w[a][b];
+ }
+ free_int (w, -1);
+ return (int)((float)avg/(float)c);
+ */
+}
+
+int aln_is_aligned ( Alignment *A)
+{
+ int a, b;
+
+ if ( !A)return 0;
+ for (a=0; a< A->nseq; a++)
+ for ( b=A->len_aln-1; b>0; b--)
+ {
+ if (!is_gap(A->seq_al[a][b]) && is_gap(A->seq_al[a][b-1]))return 1;
+ }
+ return 0;
+}
+
+
+int seq2aln2sim_old ( char *seq1, char *seq2, char *mode_aln, char *mode_id)
+{
+ Alignment *A;
+ int sim;
+
+ A=align_two_sequences (seq1, seq2, "pam250mt", -10, -1, mode_aln);
+ sim=aln2sim (A, mode_id);
+ free_aln (A);
+ return sim;
+}
+int seq2aln2sim ( char *seq1, char *seq2, char *mode_aln, char *mode_id)
+{
+ Alignment *A;
+ int sim;
+ static int gop;
+
+ if (!gop)
+ {
+ int **m;
+ m=read_matrice ("blosum62mt");
+ gop=get_avg_matrix_mm(m, AA_ALPHABET)*10;
+ free_int (m, -1);
+ }
+
+ A=align_two_sequences (seq1, seq2, "blosum62mt",gop,-1, mode_aln);
+ sim=aln2sim (A, mode_id);
+ free_aln (A);
+ return sim;
+}
+int* get_cdna_seq_winsim ( int *cache, char *string1, char *string2, char *ignore, char *mode,int *w )
+ {
+ int len1, len2;
+ int a, x;
+
+
+ len1=strlen (string1);
+ len2=strlen (string2);
+
+ if ( len1!=len2)
+ {
+ fatal_exit( stderr,EXIT_FAILURE, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM);
+ }
+
+ x=get_cdna_seq_sim(cache, string1, string2, ignore, "");
+ for ( a=0; a< len1; a++)
+ w[a]=x;
+
+ add_warning (stderr, "\nWARNING: winsim not implemented for cDNA");
+ return w;
+ }
+
+int get_cdna_seq_sim ( int *cache, char *string1, char *string2, char *ignore, char *mode)
+ {
+ int len1;
+ int len2;
+ int a;
+ int pos=0;
+ int sim=0;
+ char r1=0, r2=0;
+
+ len1=strlen (string1);
+ len2=strlen (string2);
+
+
+
+ if ( len1!=len2)
+ {
+ fprintf ( stderr, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM);
+ crash("");
+ }
+
+ for ( a=0; a< len1;)
+ {
+
+ if ( cache[a]==0){a++;continue;}
+ else if ( cache[a]==1)
+ {
+
+ r1=translate_dna_codon (string1+a, 'x');
+ r2=translate_dna_codon (string2+a, 'x');
+ a+=3;
+ }
+
+ if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
+ {
+ pos++;
+ if (is_in_same_group_aa(r1,r2,0, NULL,mode+4))
+ {
+ sim++;
+ }
+ }
+ }
+
+
+
+ if (pos==0)
+ return 0;
+ else
+ return (int) (sim*MAXID)/pos;
+
+ }
+
+int* get_seq_winsim ( char *string1, char *string2, char *ignore, char *mode, int*w)
+ {
+ int len1, len2, len;
+ int left, right;
+ int a,b;
+ int sim=0;
+ int window;
+ int r1, r2;
+
+ len1=strlen (string1);
+ len2=strlen (string2);
+ window=atoi(mode);
+ len=2*window+1;
+
+ if ( len1!=len2)return 0;
+ if (window==0 || (window*2+1)>=len1)
+ {
+ sim=get_seq_sim (string1, string2, ignore, "");
+ for (a=0; a<len1; a++)w[a]=sim;
+ return w;
+ }
+
+
+ for ( a=0; a< len1; a++)
+ {
+
+ left =MAX(0, a-window);
+ right=MIN(len1, left+len);
+ for (sim=0,b=left; b<right; b++)
+ {
+ r1=string1[b];
+ r2=string2[b];
+ if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
+ {
+ if (r1==r2)sim++;
+ }
+ }
+ w[a]=(sim*MAXID)/len;
+ }
+ return w;
+ }
+
+
+int get_seq_sim ( char *string1, char *string2, char *ignore, char *in_mode)
+ {
+ int len1;
+ int a;
+ int pos1, pos2, pos0,gap=0, sim;
+ int p1, p2;
+ int r=0,r1=0,r2=0;
+ char *p;
+ static char *mode;
+
+ if (!mode)mode=vcalloc (100, sizeof (char));
+ else mode[0]='\0';
+ if (in_mode)
+ {
+ while (in_mode[0]=='_')in_mode++;
+ sprintf ( mode, "%s", in_mode);
+ }
+
+ /*mode: <mat>__<sim_mode>
+ mat: idscore to get the alignment done
+ any legal cw matrix
+ sim_mode: sim1->identities/matches
+ sim2->identities/min len
+ */
+
+
+ if ( (p=strstr (mode, "_"))!=NULL)
+ {
+ p[0]='\0';
+ p++;
+ }
+
+
+ if (strstr (mode, "idscore"))
+ {
+ static int **mat;
+ if (!mat) mat=read_matrice ("blosum62mt");
+ return idscore_pairseq (string1, string2, -12, -1, mat,mode);
+
+ }
+ len1=strlen (string1);
+ for ( sim=pos1=pos2=pos0=0,a=0; a< len1; a++)
+ {
+ r1=string1[a];
+ r2=string2[a];
+ p1=1-is_in_set (r1, ignore);
+ p2=1-is_in_set (r2, ignore);
+
+ pos1+=p1; pos2+=p2;
+ if (p1 && p2)
+ {
+ pos0++;
+ if (is_in_same_group_aa(r1,r2,0, NULL, mode))
+ {
+ sim++;
+ }
+ }
+ else if (p1+p2==1)
+ {
+ gap++;
+ }
+ }
+
+ if ( strstr (mode, "cov"))
+ {
+ r=(pos0+gap==0)?0:(pos0*MAXID)/(pos0+gap);
+ }
+ else if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
+ {
+ r=(pos0==0)?0:(sim*MAXID)/pos0;
+ }
+ else if ( strm (p, "sim2"))
+ {
+ r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
+ }
+ else if ( strm (p, "sim3"))
+ {
+ r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
+ }
+ else if ( strm (p, "gap1"))
+ {
+ r=(len1==0)?MAXID:(gap*MAXID)/len1;
+ r=MAXID-r;
+ }
+ else if ( strm (p, "logid"))
+ {
+ r=logid_score (pos0, sim);
+ }
+ else if ( strstr (mode, "sim"))
+ {
+ r=(pos0==0)?0:(sim*MAXID)/pos0;
+ }
+
+
+ return r;
+
+ }
+int get_seq_sim_2 ( char *string1, char *string2, char *ignore, char **gr, int ng)
+ {
+ int len1;
+ int len2;
+ int a;
+ int pos=0;
+ int sim=0;
+ char r1, r2;
+
+
+ len1=strlen (string1);
+ len2=strlen (string2);
+
+ if ( len1!=len2)return 0;
+
+ for ( a=0; a< len1; a++)
+ {
+ r1=string1[a];
+ r2=string2[a];
+ if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
+ {
+ pos++;
+ if (is_in_same_group_aa(r1,r2,ng, gr, NULL))
+ {
+ sim++;
+ }
+ }
+ }
+
+ if (pos==0)
+ return 0;
+ else
+ return (int) (sim*MAXID)/pos;
+
+ }
+
+int get_seq_sim_3 ( char *string1, char *string2, char *ignore, int **mat)
+ {
+ int len1;
+ int len2;
+ int a;
+
+ int sim=0;
+ char r1, r2;
+
+
+ len1=strlen (string1);
+ len2=strlen (string2);
+
+ if ( len1!=len2)return 0;
+
+ for ( a=0; a< len1; a++)
+ {
+ r1=string1[a];
+ r2=string2[a];
+ if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
+ {
+ sim+=mat[r1-'A'][r2-'A'];
+ }
+ }
+ return sim;
+
+ }
+int * get_aln_col_weight ( Alignment *A, char *mode)
+ {
+ int a, b;
+ char *col;
+ int *weight;
+
+ col=vcalloc ( A->nseq, sizeof (int));
+ weight=vcalloc (A->len_aln, sizeof (int));
+
+ for (a=0; a< A->len_aln; a++)
+ {
+ for ( b=0; b< A->nseq; b++)
+ col[b]=A->seq_al[b][a];
+ weight[a]=(find_group_aa_distribution (col, A->nseq,0,NULL,NULL, mode )*MAXID)/A->nseq;
+ }
+ vfree (col);
+ return weight;
+
+ }
+
+int analyse_aln_column ( Alignment *B, int col)
+ {
+
+ char r=' ';
+ int a, b, c=0;
+ static char *mat;
+ static int ng_cw_star;
+ static char **cw_star;
+ int *cw_star_count;
+
+ static int ng_cw_col;
+ static char **cw_col;
+ int *cw_col_count;
+
+ static int ng_cw_dot;
+ static char **cw_dot;
+ int *cw_dot_count;
+
+
+
+
+
+
+ if ( !B->S || !(B->S)->type)B= get_aln_type (B);
+
+ if ( !mat)mat=vcalloc ( STRING, sizeof (char));
+
+ if ( !ng_cw_star)
+ {
+ cw_star=make_group_aa ( &ng_cw_star, strcpy ( mat,"idmat"));
+ cw_col=make_group_aa ( &ng_cw_col, strcpy (mat,"clustalw_col"));
+ cw_dot=make_group_aa ( &ng_cw_dot, strcpy (mat, "clustalw_dot"));
+ }
+
+ cw_star_count=vcalloc (ng_cw_star, sizeof (int));
+ cw_col_count=vcalloc ( ng_cw_col, sizeof (int));
+ cw_dot_count=vcalloc (ng_cw_dot, sizeof (int));
+
+ for ( a=0; a< B->nseq; a++)
+ {
+ c=tolower (B->seq_al[a][col]);
+ if (is_gap(c)){r=' ';break;}
+
+ for ( b=0; b< ng_cw_star; b++)
+ cw_star_count[b]+=is_in_set (c, cw_star[b]);
+ for ( b=0; b< ng_cw_col; b++)
+ cw_col_count[b]+=is_in_set (c, cw_col[b]);
+ for ( b=0; b< ng_cw_dot; b++)
+ cw_dot_count[b]+=is_in_set (c, cw_dot[b]);
+ }
+
+
+
+
+
+ if ( !is_gap(c) && r==' ')
+ for ( b=0; b< ng_cw_star; b++)if ( cw_star_count[b]==B->nseq){r='*'; break;}
+ if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA")))
+ for ( b=0; b< ng_cw_col ; b++)if ( cw_col_count [b]==B->nseq){r=':'; break;}
+ if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA")))
+ for ( b=0; b< ng_cw_dot ; b++)if ( cw_dot_count [b]==B->nseq){r='.'; break;}
+
+
+
+ vfree(cw_star_count);
+ vfree(cw_col_count);
+ vfree(cw_dot_count);
+
+ return r;
+ }
+
+
+int ** get_cov_aln_array ( Alignment *A, char *mode)
+{
+ int **w;
+ int a, b, c, t;
+
+ w=declare_int ( A->nseq, A->nseq);
+
+
+ for ( a=0; a< A->nseq-1; a++)
+ {
+ w[a][a]=100;
+ for ( t=0,b=a+1; b< A->nseq; b++)
+ {
+ for ( c=0; c< A->len_aln; c++)
+ {
+ t+=(!is_gap(A->seq_al[a][c]) &&!is_gap(A->seq_al[b][c]));
+ }
+ w[a][b]=w[b][a]=(t*100)/A->len_aln;
+ }
+ }
+ return w;
+}
+
+int ** get_cov_master_aln_array ( Alignment *A,int n, char *mode)
+{
+ int **w;
+ int b, c, t;
+
+ w=declare_int ( A->nseq, A->nseq);
+
+
+ for (b=0; b< A->nseq; b++)
+ {
+
+ for (t=0, c=0; c< A->len_aln; c++)
+ {
+ t+=(!is_gap(A->seq_al[n][c]) &&!is_gap(A->seq_al[n][c]));
+ }
+ w[n][b]=w[b][n]=(t*100)/A->len_aln;
+ }
+
+ return w;
+}
+int ** get_sim_master_aln_array ( Alignment *A,int n, char *mode)
+ {
+ int **w;
+ int a;
+
+ w=declare_int ( A->nseq, A->nseq);
+
+
+ for ( a=0; a< A->nseq; a++)
+ {
+ if ( strm (mode, "cdna"))
+ w[n][a]=w[a][n]=get_cdna_seq_sim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[n],GAP_LIST, mode);
+ else
+ w[n][a]=w[a][n]=get_seq_sim ( A->seq_al[n], A->seq_al[a],GAP_LIST, mode);
+ }
+ return w;
+ }
+int ** get_dist_aln_array ( Alignment *A, char *mode)
+{
+
+ int **w;
+
+ w=get_sim_aln_array ( A, mode);
+ return sim_array2dist_array(w,MAXID);
+}
+Sequence * seq2filter (Sequence *Sin, int min, int max)
+{
+ int *keep;
+ char *tmpfile;
+ Sequence *S, *Sout;
+ int a, b, sim;
+ int **M;
+ FILE *fp;
+ int n;
+
+ S=duplicate_sequence (Sin);
+ for (a=0; a<S->nseq; a++)ungap(S->seq[a]);
+ keep=vcalloc (S->nseq, sizeof (int));
+ M=read_matrice ("blossum62mt");
+ for (a=0; a<S->nseq; a++)
+ {
+ output_completion ( stderr, a, S->nseq, 100, "Distance Matrix Computation: ");
+ for ( b=a+1; b<S->nseq; b++)
+ {
+
+ sim=idscore_pairseq(S->seq[a], S->seq[b],-10, -2,M, "sim");
+ if ( sim>min && sim<max)keep[a]=keep[b]=1;
+ fprintf ( stderr, "\nSim %d Min %d Max %d", sim, min, max);
+ }
+ }
+
+ tmpfile=vtmpnam (NULL);
+ fp=vfopen (tmpfile, "w");
+ for (n=0,a=0; a< S->nseq; a++)
+ if ( keep[a])
+ {
+ fprintf ( fp, ">%s %s\n%s", S->name[a], S->seq_comment[a], S->seq[a]);
+ n++;
+ }
+ vfclose (fp);
+ if (n==0) return NULL;
+ Sout=main_read_seq(tmpfile);
+ free_int (M, -1); vfree (keep); free_sequence (S, -1);
+ return Sout;
+}
+
+Alignment * grep_seq (Alignment *S,char *field, char *mode, char *string)
+{
+ int a;
+ FILE *fp;
+ char *tmp;
+ int n=0;
+
+ tmp=vtmpnam (NULL);
+ fp=vfopen (tmp, "w");
+
+ if ( !strm(mode, "KEEP") && ! strm (mode, "REMOVE"))
+ {
+ add_warning ( stderr, "\nERROR: +grep <field> <KEEP|REMOVE> <string> [FATAL: %s]", PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+ else if ( !strm(field, "SEQ") && ! strm (field, "COMMENT") && ! strm(field, "NAME"))
+ {
+ add_warning ( stderr, "\nERROR: +grep <NAME|COMMENT|SEQ> <mode> <string> [FATAL: %s]", PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+
+ for (n=0, a=0; a< S->nseq; a++)
+ {
+ int found=0;
+
+ if (strm(field, "NAME") && perl_strstr (S->name[a], string))found=1;
+ else if (strm(field, "COMMENT") && S->seq_comment[a][0] && perl_strstr (S->seq_comment[a], string) )found=1;
+ else if (strm(field, "SEQ") && perl_strstr (S->seq_al[a], string))found=1;
+
+ if ( (strm (mode, "KEEP") && found) || (strm (mode, "REMOVE") && !found))
+ {
+ n++;
+ fprintf (fp, ">%s", S->name[a]);
+ if (S->seq_comment[a][0])fprintf (fp, " %s", S->seq_comment[a]);
+ fprintf (fp, "\n%s\n", S->seq_al[a]);
+ }
+ }
+
+ vfclose (fp);
+
+ free_aln (S);
+ if ( n==0) return NULL;
+ else
+ return main_read_aln (tmp, NULL);
+}
+
+Alignment * modify_seq (Alignment *S, char *field, char *string1, char *string2)
+{
+ int a;
+ FILE *fp;
+ char *tmp;
+
+ tmp=vtmpnam (NULL);
+ fp=vfopen (tmp, "w");
+ for ( a=0; a< S->nseq; a++)
+ {
+ if (strm(field, "NAME"))S->name[a]=substitute ( S->name[a], string1, string2);
+ else if (strm(field, "COMMENT"))S->seq_comment[a]=substitute ( S->seq_comment[a], string1, string2);
+ else if (strm(field, "SEQ"))S->seq_al[a]=substitute ( S->seq_al[a], string1, string2);
+ fprintf (fp, ">%s", S->name[a]);
+ if (S->aln_comment[a][0])fprintf (fp, " %s", S->aln_comment[a]);
+ fprintf (fp, "\n%s\n", S->seq_al[a]);
+ }
+ vfclose (fp);
+ free_aln (S);
+ S=main_read_aln (tmp, NULL);
+ return S;
+}
+
+int ** seq2sim_mat (Sequence *S, char *mode)
+{
+ return seq2comp_mat ( S,mode, "sim");
+}
+int ** seq2cov_mat (Sequence *S, char *mode)
+{
+ return seq2comp_mat ( S,mode, "cov");
+}
+
+int ** seq2comp_mat (Sequence *S, char *mode, char *comp_mode)
+{
+ int a, b;
+ int **sim;
+ char file[1000];
+ Alignment *A;
+ char *name;
+
+
+ /*Use pre_computed value if available in the current dir*/
+
+ name=path2filename(S->file[0]);
+ sprintf ( file, "%s%s.%s.%s_file", get_cache_dir(),name, mode, comp_mode);
+ A=seq2aln(S,NULL, RM_GAP);
+ if ( check_file_exists (file) && is_distance_matrix_file (file) && (sim=input_similarities(file, A, NULL))!=NULL)
+ {
+ display_input_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK);
+ fprintf ( stderr, "\n");
+ }
+ else
+ {
+ char mode2[1000];
+ int **M;
+
+ M=read_matrice (mode);
+ sim=declare_int ( S->nseq, S->nseq);
+ for ( a=0; a< S->nseq; a++)
+ {
+ ungap (S->seq[a]);
+ sim[a][a]=100;
+ }
+
+ for ( a=0; a<S->nseq-1; a++)
+ {
+
+ output_completion4halfmat ( stderr, a, S->nseq, 100, "Similarity Matrix Computation: ");
+ for ( b=a+1; b< S->nseq; b++)
+ {
+ sim[a][b]=sim[b][a]=idscore_pairseq(S->seq[a], S->seq[b],-12, -1,M, comp_mode);
+ }
+ }
+ free_int (M,-1);
+ sprintf ( mode2, "_memory_%ld", (long int)sim);
+ output_similarities( file, A, mode2);
+ display_output_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK);
+ fprintf ( stderr, "\n");
+ }
+ free_aln (A);
+ return sim;
+}
+
+int ** fast_aln2sim_list (Alignment *A, char *mode, int *ns, int **ls)
+{
+ int **simm;
+ int p1, p2, p3, r1, r2;
+ int gap,pos0,pos1,pos2,len,sim;
+ int a, b, c, m, s=0,s1, s2, n;
+ int free_ns=0;
+
+ if (ns==NULL)
+ {
+ free_ns=1;
+ ns=vcalloc (2, sizeof (int));
+ ns[0]=ns[1]=A->nseq;
+ ls=declare_int (2, A->nseq);
+ for ( a=0; a< 2; a++)
+ for (b=0; b<A->nseq; b++)
+ ls[a][b]=b;
+ }
+
+
+ simm=declare_int (ns[0]*ns[1]+1, 3);
+
+ if (strstr (mode, "sim1"))m=0;
+ else if (strstr (mode, "sim2"))m=1;
+ else if (strstr (mode, "sim3"))m=2;
+ else if (strstr (mode, "gap1"))m=3;
+ else if (strstr (mode, "cov1"))m=4;
+ else if (strstr (mode, "logid"))m=5;
+ else m=0;
+
+
+
+ for (n=0,a=0; a<ns[0]; a++)
+ {
+ s1=ls[0][a];
+ for ( b=0; b<ns[1]; b++, n++)
+ {
+ s2=ls[1][b];
+ gap=pos0=pos1=pos2=len=sim=0;
+
+ for ( c=0; c< A->len_aln; c++)
+ {
+ r1=tolower (A->seq_al[s1][c]);
+ r2=tolower (A->seq_al[s2][c]);
+ p1=(r1!='-')?1:0;
+ p2=(r2!='-')?1:0;
+ p3=p1+p2;
+ if ( p3==0)continue;
+ if ( p3==1)gap++;
+ if ( r1==r2)sim++;
+ pos1+=p1;
+ pos2+=p2;
+ pos0+=(p3==2)?1:0;
+ len++;
+ }
+
+ if (m==0)s=(pos0==0)?0:(sim*MAXID)/pos0; //sim1
+ else if (m==1) s=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2
+ else if (m==2) s=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3
+ else if (m==3) s=(len==0) ?0:((len-gap)*MAXID)/len;//gap1
+ else if (m==4) s=(len==0) ?0:((pos0)*MAXID)/len; //cov
+ else if (m==5)
+ {
+ s=logid_score ( sim, len);
+ }
+ simm[n][0]=s1;
+ simm[n][1]=s2;
+ simm[n][2]=s;
+ }
+ }
+
+ if ( free_ns) {vfree(ns); free_int (ls, -1);}
+ simm[n][0]=-1;
+ return simm;
+}
+
+int ** fast_aln2sim_mat (Alignment *A, char *mode)
+{
+ int **simm;
+ int p1, p2, p3, r1, r2;
+ int gap,pos0,pos1,pos2,len,sim;
+ int a, b, c, m;
+
+ simm=declare_int (A->nseq, A->nseq);
+
+
+
+ if (strstr (mode, "sim1"))m=0;
+ else if (strstr (mode, "sim2"))m=1;
+ else if (strstr (mode, "sim3"))m=2;
+ else if (strstr (mode, "gap1"))m=3;
+ else if (strstr (mode, "cov1"))m=4;
+ else if (strstr (mode, "logid"))m=5;
+ else m=0;
+
+
+
+ for ( a=0; a< A->nseq-1; a++)
+ {
+ simm[a][a]=MAXID;
+ for ( b=a+1; b< A->nseq; b++)
+ {
+ gap=pos0=pos1=pos2=len=sim=0;
+
+ for ( c=0; c< A->len_aln; c++)
+ {
+ r1=tolower (A->seq_al[a][c]);
+ r2=tolower (A->seq_al[b][c]);
+ p1=(r1!='-')?1:0;
+ p2=(r2!='-')?1:0;
+ p3=p1+p2;
+ if ( p3==0)continue;
+ if ( p3==1)gap++;
+ if ( r1==r2)sim++;
+ pos1+=p1;
+ pos2+=p2;
+ pos0+=(p3==2)?1:0;
+ len++;
+ }
+
+ if (m==0)simm[a][b]=simm[b][a]=(pos0==0)?0:(sim*MAXID)/pos0; //sim1
+ else if (m==1) simm[a][b]=simm[b][a]=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2
+ else if (m==2) simm[a][b]=simm[b][a]=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3
+ else if (m==3) simm[a][b]=simm[b][a]=(len==0) ?0:((len-gap)*MAXID)/len;//gap1
+ else if (m==4) simm[a][b]=simm[b][a]=(len==0) ?0:((pos0)*MAXID)/len; //cov
+ else if (m==5)
+ {
+
+ //Inspired from Muscle +mafft 5
+ simm[a][b]=simm[b][a]=logid_score ( sim, len);
+ }
+ }
+ }
+ return simm;
+}
+int logid_score ( int sim, int len)
+{
+ float score;
+
+ if ( len==0)return (int)(0.33*(float)MAXID);
+
+ score=(float)sim/(float)len;
+ if (score>0.9) score=1.0;
+ else score=-log10 (1.0-score);
+
+ score=(score*MAXID);
+ return score;
+}
+int ** aln2sim_mat (Alignment *A, char*mode)
+{
+
+
+ if ( strstr (mode, "idmat"))return fast_aln2sim_mat(A, mode);
+ return get_sim_aln_array(A, mode);
+}
+int ** aln2cov (Alignment *A)
+{
+ int a, b, c;
+ int r1, r2, gr1, gr2, pos0, gap;
+ int **cov;
+ cov=declare_int (A->nseq, A->nseq);
+
+ for (a=0; a< A->nseq-1; a++)
+ {
+ cov[a][a]=100;
+ for ( b=a+1; b<A->nseq; b++)
+ {
+ for (gap=0,pos0=0,c=0;c<A->len_aln; c++)
+ {
+ r1=A->seq_al[a][c];
+ r2=A->seq_al[b][c];
+ gr1=is_gap(r1); gr2=is_gap(r2);
+ if ( gr1+gr2==0)pos0++;
+ else if ( gr1+gr2<2)gap++;
+ }
+ cov[a][b]=cov[b][a]=((gap+pos0)==0)?0:((pos0*100)/(gap+pos0));
+ }
+ }
+ return cov;
+}
+int ** get_raw_sim_aln_array (Alignment *A, char *mode)
+{
+ int **w;
+ int **M;
+ int a, b, c, r1, r2, set, max, min;
+
+ w=declare_int (A->nseq, A->nseq);
+ if (strstr(mode, "sar"))M=NULL;
+ else M=read_matrice (mode);
+
+ HERE ("RAW STUFF");
+
+ for ( set=0,a=0; a< A->nseq; a++)
+ for (b=a; b<A->nseq; b++)
+ {
+ if (M)
+ {
+ for (c=0; c<A->len_aln; c++)
+ {
+ r1=A->seq_al[a][c];
+ r2=A->seq_al[b][c];
+
+ if ( !is_gap(r1) && !is_gap(r2))
+ w[a][b]+=M[r1-'A'][r2-'A'];
+ }
+ }
+ else if ( strm (mode, "sarmat2"))
+ {
+ w[a][b]=get_sar_sim2 (A->seq_al[a], A->seq_al[b]);
+ }
+ else
+ {
+ HERE ("ERROR: %s is an unknown mode of raw_sim\n", mode); myexit (EXIT_FAILURE);
+ }
+
+ w[b][a]=w[a][b];
+ if (!set){min=max=w[a][b];set=1;}
+ min=MIN(min,w[a][b]);
+ max=MAX(max,w[a][b]);
+ }
+ for (a=0; a<A->nseq; a++)
+ for (b=a; b<A->nseq; b++)
+ {
+ w[b][a]=((max-min)==0)?0:((w[b][a]-min)*100)/(max-min);
+ w[a][b]=w[b][a];
+ }
+ free_int (M, -1);
+ return w;
+}
+int ** get_sim_aln_array ( Alignment *A, char *mode)
+ {
+ int **w;
+ int a, b;
+
+
+ w=declare_int ( A->nseq, A->nseq);
+
+ for ( a=0; a< A->nseq-1; a++)
+ {
+ for ( b=a+1; b< A->nseq; b++)
+ {
+
+ w[a][b]=w[b][a]=generic_get_seq_sim ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode);
+ }
+ }
+ return w;
+ }
+int generic_get_seq_sim ( char *seq1, char *seq2, int*cache, char *mode)
+{
+
+
+ if ( strm (mode, "cdna"))
+ return get_cdna_seq_sim ( cache, seq1, seq2,GAP_LIST, mode);
+ else if ( strnm (mode, "ktup",4))
+ return ktup_comparison (seq1, seq2,atoi(mode+4));
+ else if ( strstr (mode, "sarmat2"))
+ {
+
+ return get_sar_sim2 (seq1, seq2);
+ }
+ else if ( strstr (mode, "sarmat"))
+ return (int) get_sar_sim (seq1,seq2);
+ else
+ {
+ return get_seq_sim ( seq1,seq2,GAP_LIST, mode);
+ }
+}
+int *** get_winsim_aln_array ( Alignment *A,char *mode, int ***w)
+ {
+ int a, b;
+ for ( a=0; a< A->nseq; a++)
+ for ( b=0; b< A->nseq; b++)
+ {
+ if ( strm (mode, "cdna"))
+ w[a][b]=get_cdna_seq_winsim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]);
+ else
+ w[a][b]=get_seq_winsim ( A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]);
+ }
+ return w;
+ }
+
+Alignment * seq2profile (Sequence *S, int i)
+{
+ Alignment *A;
+
+ if ((A=seq2R_template_profile (S, i)))
+ {
+ return A;
+ }
+ else
+ {
+ char *tmp;
+ FILE *fp;
+ tmp=vtmpnam (NULL);
+ fp=vfopen ( tmp, "w");
+ fprintf (fp, ">%s\n%s\n", S->name[i], S->seq[i]);
+ vfclose (fp);
+
+ (S->T[i])->R=fill_R_template (S->name[i], tmp, S);
+
+ return seq2R_template_profile (S, i);
+ }
+}
+Alignment* remove_seq_from_aln (Alignment *A, char *seq)
+{
+ int a, n;
+ for (n=0,a=0; a<A->nseq; a++)
+ {
+ if ( strm (seq, A->name[a]))continue;
+ else if ( n==a);
+ else
+ {
+ sprintf (A->name[n], "%s",A->name[a]);
+ sprintf (A->seq_al[n], "%s",A->seq_al[a]);
+ if (A->seq_comment[a])sprintf (A->seq_comment[n], "%s", A->seq_comment[a]);
+ if (A->aln_comment[a])sprintf (A->aln_comment[n], "%s", A->aln_comment[a]);
+ A->order[n][0]=A->order[a][0];
+ A->order[n][1]=A->order[a][1];
+ }
+ n++;
+ }
+ A->nseq=n;
+ return A;
+}
+
+
+Alignment* aln2sub_aln_file (Alignment *A, int n, char **string)
+{
+ char ***list;
+ int a;
+
+ list=vcalloc (A->nseq, sizeof (char***));
+ if ( n==0)return A;
+ else if (n>1)
+ {
+ int l;
+ char *buf;
+
+ for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
+ buf=vcalloc ( 2*n+l+1, sizeof (char));
+ for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
+ list[0]=string2list (buf);
+ vfree (buf);
+ }
+ else if ( file_exists (NULL,string[0]))
+ {
+ list=read_group (string[0]);
+
+ }
+ else
+ {
+ fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+
+ a=0;
+ while (list[a])
+ {
+ int i, b;
+ FILE *fp;
+ n=atoi (list[a][0]);
+ fp=vfopen (list[a][1], "w");
+ for (b=2; b<n; b++)
+ {
+ i=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
+ if (n==3)ungap (A->seq_al[i]);
+ fprintf (fp, ">%s\n%s\n", A->name[i], A->seq_al[i]);
+ }
+ vfclose (fp);
+ free_char (list[a], -1);
+ a++;
+ }
+ vfree(list);
+ return A;
+}
+Sequence *remove_empty_sequence (Sequence *S)
+{
+ int a, b;
+ char *c;
+ Sequence *NS;
+
+ c=vcalloc ( S->max_len+1, sizeof (char));
+
+ for (a=0, b=0; a< S->nseq; a++)
+ {
+ sprintf ( c, "%s",S->seq[a]);
+ ungap (c);
+ if ( strlen (c)==0)
+ {
+ //vfree (S->seq[a]);
+ S->seq[a]=NULL;
+ add_warning ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]",S->name[a], PROGRAM);
+ }
+ }
+ NS=duplicate_sequence (S);
+ free_sequence (S, S->nseq);
+ vfree (c);
+ return NS;
+}
+Alignment* aln2sub_seq (Alignment *A, int n, char **string)
+{
+ char ***list;
+ int a;
+ Sequence *S=NULL;
+
+ list=vcalloc (A->nseq, sizeof (char***));
+ if ( n==0)return A;
+ else if (n>1)
+ {
+ int l;
+ char *buf;
+
+ for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
+ buf=vcalloc ( 2*n+l+1, sizeof (char));
+ for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
+ list[0]=string2list (buf);
+ vfree (buf);
+ }
+ else if ( file_exists (NULL,string[0]))
+ {
+ list=read_group (string[0]);
+
+ }
+ else
+ {
+ fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+
+
+ a=0;
+ while (list[a])
+ {
+ int t;
+ Alignment *B;
+ Sequence *subS;
+
+
+ B=main_read_aln (list[a][1], NULL);
+ t=aln2most_similar_sequence(B, "idmat");
+ subS=extract_one_seq(B->name[t],0,0,B,KEEP_NAME);
+ S=add_sequence (subS,S,0);
+ free_aln (B);free_sequence (subS, -1);
+ vremove (list[a][1]);
+ a++;
+ }
+ vfree(list);
+ return seq2aln (S, NULL, RM_GAP);
+}
+
+Alignment * aln2collapsed_aln (Alignment * A, int n, char **string)
+{
+ Alignment *B;
+ char ***list;
+ char **list2;
+ char *buf=NULL;
+ FILE *fp;
+ int a, b,c, ns, m, l;
+ int *collapsed;
+
+ list=vcalloc (A->nseq, sizeof (char***));
+ ns=0;
+ if ( n==0)return A;
+ else if (n>1)
+ {
+ for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
+ buf=vcalloc ( 2*n+l+1, sizeof (char));
+ for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
+
+ list[0]=string2list (buf);ns=1;
+
+ }
+ else if ( file_exists (NULL,string[0]))
+ {
+ /*Format: Fasta like, the name fo the group followed with the name of the sequences
+ ><Group name> <First Seq> <second seq> ....
+ Groups must NOT be overlaping
+ */
+ l=measure_longest_line_in_file (string[0])+1;
+ buf=vcalloc (l, sizeof (char));
+ ns=0;
+ fp=vfopen (string[0], "r");
+ while ((c=fgetc(fp))!=EOF)
+ {
+ buf=fgets (buf,l-1, fp);
+ if ( c=='>')list[ns++]=string2list (buf);
+ }
+ vfclose (fp);
+ }
+ else
+ {
+ fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+ vfree (buf); buf=NULL;
+
+ /*Identify lost sequences*/
+ collapsed=vcalloc (A->nseq, sizeof (int));
+ for ( a=0; a< ns; a++)
+ {
+ m=atoi (list[a][0]);
+ for (b=2; b<m ; b++)
+ {
+ c=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
+ if ( c>=0)collapsed[c]=1;
+ }
+ }
+ for ( a=0; a< A->nseq; a++)
+ {
+ if ( collapsed[a]==0)
+ {
+ list[ns]=declare_char (3, MAXNAMES);
+ sprintf ( list[ns][0], "3");
+ sprintf ( list[ns][1], "%s", A->name[a]);
+ sprintf ( list[ns][2], "%s", A->name[a]);
+ ns++;
+ }
+ }
+ vfree (collapsed);
+
+
+
+
+
+ list2=declare_char (A->nseq, 100);
+ /*1 Collapse the alignment*/
+ for ( a=0; a< ns; a++)
+ {
+ sprintf ( list2[a], "%s", list[a][2]);
+ }
+ B=extract_sub_aln2 ( A, ns, list2);
+ /*2 Rename the sequences*/
+ for ( a=0; a< ns; a++)
+ {
+ sprintf ( B->name[a], "%s", list[a][1]);
+ }
+ /*replace sequence with consensus*/
+
+ for ( a=0; a< ns; a++)
+ {
+ m=atoi (list[a][0]);
+ for (c=0, b=2; b<m;c++, b++)
+ {
+ sprintf ( list2[c], "%s", list[a][b]);
+ }
+ buf=sub_aln2cons_seq_mat2 ( A,m-2,list2, "blosum62mt");
+ sprintf (B->seq_al[a], "%s", buf);
+ }
+ vfree (buf);
+
+ free_aln (A);
+ B->S=aln2seq(B);
+ return B;
+}
+Alignment * aln2profile (Alignment * A)
+ {
+ Alignment *B=NULL;
+ char *cons;
+
+ if (!A->P)
+ {
+ A->P=declare_profile (AA_ALPHABET,A->len_aln+1);
+ }
+ B=copy_aln (A, B);
+ free_int ((A->P)->count, -1);
+ free_int ((A->P)->count2, -1);
+ free_int ((A->P)->count3, -1);
+ (A->P)->count=aln2count_mat (A);
+ (A->P)->count2=aln2count_mat2 (A);
+
+ cons=aln2cons_seq_mat (A, "blosum62mt");
+
+ sprintf (B->seq_al[0], "%s", cons);
+ B->nseq=1;
+ (A->P)->count3=aln2count_mat2 (B);
+ vfree (cons);
+ free_aln (B);
+
+
+
+ return A;
+
+ }
+
+int** aln2count_mat2 ( Alignment *A)
+{
+ return sub_aln2count_mat2 (A, 0, NULL);
+}
+
+int sub_aln2nseq_prf ( Alignment *A, int ns, int *ls)
+{
+
+
+ int a, c, s;
+ Alignment *R;
+ int n;
+ int free_ls=0;
+
+
+ if ( ns==0)
+ {
+ n=ns=A->nseq;
+ ls=vcalloc (n, sizeof (int));
+ for ( a=0; a<A->nseq; a++)ls[a]=a;
+ free_ls=1;
+ }
+ else
+ {
+ n=ns;
+ }
+
+ for (c=0,a=0; a<ns; a++)
+ {
+ s=ls[a];
+ if ( A->S && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL)
+ {
+ n+=R->nseq;
+ }
+ else
+ {
+ ;
+ }
+ }
+
+ if ( free_ls) vfree (ls);
+ return n;
+}
+
+int** sub_aln2count_mat2 ( Alignment *A, int ns, int *ls)
+{
+ char **p;
+ int **count;
+ int a, b, c, s;
+ Alignment *R;
+ int n;
+ int free_ls=0;
+
+ if ( ns==0)
+ {
+ n=ns=A->nseq;
+ p=vcalloc ( n, sizeof (char*));
+ ls=vcalloc (n, sizeof (int));
+ for ( a=0; a<A->nseq; a++)ls[a]=a;
+ free_ls=1;
+ }
+ else
+ {
+ n=ns;
+ p=vcalloc (n, sizeof (char*));
+ }
+
+ for (c=0,a=0; a<ns; a++)
+ {
+ s=ls[a];
+ if ( A->S && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL)
+ {
+ n+=R->nseq;
+ p=vrealloc (p, n*sizeof (char*));
+ for (b=0; b<R->nseq; b++)
+ {
+ p[c++]=R->seq_al[b];
+ }
+ }
+ else
+ {
+ int w;
+ w=A->order[s][4]+1;
+
+ for (b=0; b<w; b++)
+ p[c++]=A->seq_al[s];
+ }
+ }
+ count=sub_aln2count_mat3 (p,c);
+ vfree (p);
+ if ( free_ls) vfree (ls);
+ return count;
+}
+int** sub_aln2count_mat3 (char **al, int ns)
+{
+ int **count;
+ int used[1000];
+ int a, b;
+ int r;
+
+ int len;
+ int us;
+
+
+ /*count[x][0]=n symbols in column
+ count[x][1]=total_size of line
+ count[x][2]=Gap frequency
+
+ count[x][n]=symbol n
+ count[x][n+1]=N occurence symbol n;
+ count[x][n+2]=N frequence symbol n*100;
+
+ special multi-channeling
+ count[x][count[x][1]]=Nseq
+ count[x][count[x][1]+s]=residue col x, sequence s
+ */
+
+
+ for (a=0; a< 1000; a++)used[a]=0;
+ len=strlen (al[0]);
+
+ count=declare_int (len+2,100+ns+2);
+ count[len][0]=END_ARRAY;
+ count[len][1]=ns;
+ count[len][2]=len;
+
+
+
+ for (a=0; a<len; a++)
+ {
+ for (us=ns, b=0; b<ns; b++)
+ {
+ r=tolower (al[b][a]);
+
+ if (is_gap(r))us--;
+ else if (used[r])
+ {
+ count[a][used[r]*3+1]++;
+ }
+ else
+ {
+ used[r]=++count[a][0];
+ count[a][used[r]*3]=r;
+ count[a][used[r]*3+1]++;
+ }
+ }
+ count[a][1]=count[a][0]*3+2;
+ /*count[a][2]=(A->nseq-us)*100/A->nseq;*/
+ count[a][2]=ns-us;
+
+ for (b=3; b<count[a][1]; b+=3)
+ {
+ count[a][b+2]=(count[a][b+1]*100)/us;
+ used[count[a][b]]=0;
+ }
+
+
+ /*Option for multi channeling*/
+
+ /*
+ count[a][count[a][1]]=A->nseq;
+ for (b=1; b<=A->nseq; b++)
+ count [a][count[a][1]+b]=(is_gap(A->seq_al[b-1][a]))?0:A->seq_al[b-1][a];
+ */
+ }
+#ifdef XXXXXX
+ HERE ("Display ");
+ for (a=0; a< 5; a++)
+ {
+ fprintf ( stderr, "\n");
+ for ( b=3; b< count[a][1]; b+=3)
+ {
+ fprintf ( stderr, "[%c %d]", count[a][b], count[a][b+1]);
+ }
+ fprintf ( stderr, "\n");
+ for ( b=0; b<ns; b++)
+ {
+ fprintf ( stderr, "%c", al[b][a]);
+ }
+ }
+ HERE ("End of Display");
+#endif
+ return count;
+}
+
+int** aln2count_mat ( Alignment *A)
+ { /*
+ function documentation: start
+
+ int output_freq_mat ( char *outfile, Aligmnent *A)
+
+ This function counts the number of residues in each column of an alignment (Prot/NA)
+ It outputs these values in the following format
+
+ This format can be piped into:
+ The routine used for computing the p-value gmat-inf-gc-v2c
+
+ function documentation: end
+ */
+
+ int a, b,x;
+ int **freq_mat;
+ int alp_size;
+
+ alp_size=sizeof (AA_ALPHABET);
+ freq_mat=declare_int (alp_size+2, A->len_aln);
+
+
+ for ( a=0; a<A->len_aln; a++)
+ {
+ for ( b=0; b< A->nseq; b++)
+ {
+ if ( is_gap ( A->seq_al[b][a]))freq_mat[alp_size][a]++;
+ else
+ {
+ x=tolower(A->seq_al[b][a]);
+ freq_mat[x-'a'][a]++;
+ freq_mat[alp_size+1][a]++;
+
+ }
+ }
+ }
+
+ return freq_mat;
+ }
+char *aln2random_seq (Alignment *A, int pn1, int pn2, int pn3, int gn)
+ {
+
+ /*
+
+
+ Given the frequencies in A ( read as total counts of each Residue in
+ freq[A->nseq][A->len_aln], and pn1, pn2 and pn3:
+
+ 1-Generate a new amino-acid at each position
+ 2-Insert Gaps, using a HMM.
+
+
+ pn3=Weight of the noise induced with sub mat.
+
+ pn1=% noise type 1 ( Varies with entropi)
+ n1=Ratio noise type 1
+
+ T =Nseq
+ t1=Noise 1 expressed in Nseq
+ al=alphabet size;
+ ncat=number of non 0 cat for a given position
+ ICi initial count for residue i
+
+ Ci=freq[seq][AA]
+ t1=T*n1*(1-1/ncat);
+ t2=T*n2;
+
+ Ci= ICi*(T-(t1+t2))/T +(t1)/al+(t2)/al
+
+ */
+
+ int **freq;
+ int **count;
+ float T, tot_t1, tot_t2,tot_t3, n1, n2, n3;
+ float ncat;
+
+ double gf;
+ double *init_freq;
+ double *blur_freq;
+ double *t1, *t2,*t3;
+ int a, b, c, x;
+ char *seq;
+ int tot;
+ /*Viterbi Parameters */
+
+ int p;
+ int AL=0; /*Allowed Transition*/
+ int F=-100000; /*Forbiden Transition*/
+
+ int GAP_TRANSITION;
+ int IGAP=0, IAA=1;
+
+ int state,best_state=0, score, best_score=0;
+ int p_state;
+ int e=0;
+ int **score_tab;
+ int **state_tab;
+ int nstate=2;
+ int **transitions;
+
+ int max;
+
+ seq=vcalloc ( A->len_aln+1, sizeof (char));
+ count=aln2count_mat(A);
+ freq=aln2count_mat(A);
+
+ T=100;
+
+ n1=(float)pn1/100;
+ n2=(float)pn2/100;
+ n3=(float)pn3/100;
+
+ for ( a=0; a< A->len_aln; a++)
+ {
+ for ( b=0; b<26; b++)
+ freq[b][a]=freq[b][a]*((T)/(A->nseq-freq[26][a]));
+ freq[26][a]= (freq[26][a]*T)/A->nseq;
+ }
+
+
+ init_freq=vcalloc ( 26, sizeof (double));
+ blur_freq=vcalloc ( 26, sizeof (double));
+
+ tot_t1=tot_t2=tot_t3=0;
+
+ t1=vcalloc ( 27, sizeof (double));
+ t2=vcalloc ( 27, sizeof (double));
+ t3=vcalloc ( 27, sizeof (double));
+ for (a=0; a< A->len_aln; a++)
+ {
+
+ /*Compute Frequencies*/
+ for (tot=0, b=0; b<26; b++)
+ {
+ if ( is_aa(b+'A'))
+ {
+ init_freq[b]=freq[b][a];
+ tot+=freq[b][a];
+ }
+ }
+ /*Count the number of different amino acids*/
+ for ( ncat=0, b=0; b<=26; b++)
+ {
+ ncat+=(freq[b][a]!=0)?1:0;
+ }
+ /*Blurr the distribution using */
+ blur_freq=compute_matrix_p (init_freq,tot);
+
+
+ /*compute noise 1: biased with blurred content * enthropy--> keeps prosite motifs*/
+ tot_t1=T*n1*(1-1/ncat);
+ for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t1[b]=blur_freq[b]*(1-1/ncat)*n1;}
+
+ /*Compute noise 2: completely random*/
+ tot_t2=T*n2;
+ for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t2[b]=tot_t2/21;}
+
+ /*compute noise 3: biased with the sole content(pam250mt)*/
+ tot_t3=T*n3;
+ for ( b=0; b<26; b++)if ( is_aa(b+'A')){t3[b]=blur_freq[b]*n3;}
+
+ for ( b=0; b<26; b++)
+ {
+ if ( is_aa('A'+b))
+ freq[b][a]=freq[b][a]*(T-(tot_t1+tot_t2+(tot_t3)))/T+t1[b]+t2[b]+t3[b];
+ }
+
+ /*end of the loop that mutates position a*/
+ }
+
+ vfree (blur_freq);
+ vfree (init_freq);
+ vfree ( t3);
+
+ /*1-Generate the amino acids of the new sequence new*/
+
+
+ vsrand (0);
+
+ for ( a=0; a< A->len_aln; a++)
+ {
+
+ for (T=0,b=0; b<26; b++)T+=freq[b][a];
+ x=rand ()%((int)T);
+ for (c=0,b=0; b<26; b++)
+ {
+ c+=freq[b][a];
+ if ( c>=x)
+ {
+ seq[a]='A'+b;
+ c=-1;
+ break;
+ }
+ }
+ if ( c!=-1)seq[a]='-';
+ }
+ seq[a]='\0';
+
+
+ /*2 Generate the gaps in the new sequence*/
+
+
+
+ if ( gn<0);
+ else
+ {
+
+ transitions=declare_int ( nstate, nstate);
+ score_tab=declare_int ( A->len_aln+2, nstate );
+ state_tab=declare_int ( A->len_aln+2, nstate );
+
+
+
+ for (a=0; a<nstate;a++)
+ for (b=0; b<nstate;b++)
+ {transitions[a][b]=F;}
+
+ GAP_TRANSITION=AL-gn;
+
+ transitions[IGAP ][IGAP ]=AL;
+ transitions[IAA][IAA]=AL;
+ transitions[IAA ][IGAP]=GAP_TRANSITION;
+ transitions[IGAP][IAA ]=GAP_TRANSITION;
+
+
+ for ( p=1; p<=A->len_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} }
+
+ for (p=1; p<= A->len_aln; p++)
+ {
+ for (max=0,a=0; a<26; a++)max=MAX(max, freq[a][p-1]);
+ max=(max*(A->nseq-count[26][p-1]))/A->nseq;
+
+ for (state=0; state< nstate; state++)
+ {
+
+
+ gf=freq[26][p-1];
+ if ( state==IGAP) e=gf-50;
+ else if ( state==IAA ) e=max-50;
+ for (p_state=0; p_state<nstate; p_state++)
+ {
+ score=(score_tab[p-1][p_state]==F)?F:(e+transitions[p_state][state]+score_tab[p-1][p_state]);
+ if(p_state==0 || score>best_score){ best_score=score;best_state=p_state;}
+ }
+ score_tab[p][state]=best_score;
+ state_tab[p][state]=best_state;
+ }
+ }
+
+ for (state=0; state<nstate; state++)
+ {
+ if (state==0 || score_tab[p-1][state]>best_score){best_score=score_tab[p-1][state]; best_state=state;}
+ }
+
+ for (p=A->len_aln; p>0;)
+ {
+ if ( best_state==IGAP)
+ {
+ seq[p-1]='-';
+ }
+ else if ( best_state==IAA)
+ {
+ seq[p-1]=seq[p-1];
+ }
+ best_state=state_tab[p][best_state];
+ p--;
+ }
+ }
+
+ free_int (freq, -1);
+ return seq;
+ }
+
+/********************************************************************/
+/* */
+/* Weighting functions */
+/* */
+/* */
+/* */
+/********************************************************************/
+Alignment * master_trimseq( Alignment *A, Sequence *S,char *mode)
+ {
+ Alignment *NA;
+ char *p;
+ int a, b;
+ int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0;
+ float f_upper_sim, f_lower_sim;
+ char weight_mode[1000];
+ char method[1000];
+ int statistics=0;
+ int trim_direction=TOP;
+ float **sim_weight;
+ int *seq_list;
+ int table=0;
+
+
+
+
+ /*
+ mode:
+ (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
+ */
+
+
+
+ seq_list=vcalloc ( S->nseq, sizeof (int));
+ for ( a=0; a< A->nseq; a++)
+ {
+ seq_list[a]=1;
+ }
+
+
+ use_aln=aln_is_aligned(A);
+
+ if ( mode[0]=='\0')
+ {
+
+ upper_sim=50;
+ lower_sim=0;
+ min_nseq=0;
+ sprintf (weight_mode, "pwsim");
+ sprintf ( method, "clustering2");
+ }
+ else
+ {
+
+ upper_sim=lower_sim=min_nseq;
+ sprintf (weight_mode, "pwsim");
+ sprintf ( method, "clustering2");
+ }
+
+ /*
+ U or % (deprecated) Upper bound for pairwise similarity
+ L or m (depercated) Lower bound for pairwise similarity
+ n max number of sequences
+ N max number of sequences as a fraction of thet total
+ S print Statistics
+ T print Table of distances
+ */
+
+
+
+ while ( (p=strtok(mode, "_")))
+ {
+ mode=NULL;
+ if (strm (p, "seq"))use_aln=0;
+ else if ( strm(p,"aln"))use_aln=1;
+ else if (p[0]=='s')statistics=1;
+ else if (p[0]=='t')table=1;
+ else if (p[0]=='U')upper_sim=atoi(p+1);
+ else if (p[0]=='L')lower_sim=atoi(p+1);
+ else if (p[0]=='n')min_nseq=atoi(p+1);
+ else if (p[0]=='N')min_nseq=atoi(p+1)*-1;
+ else if (p[0]=='B')trim_direction=BOTTOM;
+ else if (p[0]=='T')trim_direction=TOP;
+ else if (p[0]=='W')sprintf (weight_mode, "%s", p+1);
+ else if (p[0]=='M')sprintf (method, "%s", p+1);
+ else if (p[0]=='K')
+ {
+
+ while ((p=strtok(NULL, ":")))
+ {
+
+ if ( p[0]=='#')
+ {
+ seq_list[atoi(p+1)-1]=2;
+ }
+ else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
+
+ {
+ seq_list[a]=2;
+ }
+ }
+ }
+ }
+
+ if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50;
+
+
+
+ if (!S)
+ {
+ fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM);
+ crash("");
+ }
+
+ else if ( min_nseq> S->nseq)
+ {
+ min_nseq=S->nseq;
+ }
+ else if ( min_nseq<0)
+ {
+ if ( min_nseq<-100)
+ {
+ add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n");
+ min_nseq=-100;
+ }
+
+ min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1);
+ }
+
+
+ NA=seq2subseq3 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list );
+
+ if ( table)
+ {
+ fprintf ( stderr, "\nSIMILARITY MATRIX\n");
+ for ( a=0; a< A->nseq-1; a++)
+ for ( b=a+1; b< A->nseq; b++)
+ {
+ fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]);
+ }
+ }
+ if ( statistics)
+ {
+ f_upper_sim=(upper_sim>100)?((float)upper_sim/(float)100):upper_sim;
+ f_lower_sim=(upper_sim>100)?((float)lower_sim/(float)100):lower_sim;
+
+ fprintf ( stderr, "\nTRIM Informations:\n");
+ fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
+ fprintf ( stderr, "\tcluster_mode..: %s\n" ,method);
+ fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode);
+ fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim);
+ fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim);
+ fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq);
+ fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq);
+ fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top");
+ }
+
+ return NA;
+ }
+
+Alignment *sim_filter (Alignment *A, char *in_mode, char *seq)
+{
+ int **sim, **cov;
+ int *list;
+ int *keep;
+ int maxnseq, nseq_ratio, nc;
+ int new_nseq;
+ int a, s, n, k;
+ Alignment *R;
+ char *mode;
+ int outlayers;
+ int direction=1;//remove the higher than
+ int coverage=0; //remove based on coverage
+ static char *field;
+ int maxsim, minsim, maxcov, mincov;
+
+ if ( !field) field=vcalloc (1000, sizeof (char));
+
+ mode=vcalloc ( strlen (in_mode)+10, sizeof (char));
+ sprintf ( mode, "_%s_", in_mode);
+
+ strget_param ( mode, "_I", "100", "%d", &maxsim);
+ strget_param ( mode, "_i", "0", "%d", &minsim);
+ strget_param ( mode, "_C", "100", "%d", &maxcov);
+ strget_param ( mode, "_c", "0", "%d", &mincov);
+
+
+
+
+
+ keep=vcalloc ( A->nseq, sizeof (int));
+ list=vcalloc ( A->nseq, sizeof (int));
+
+
+
+
+
+
+ if (!seq)s=0;
+ else s=name_is_in_list (seq, A->name, A->nseq, 100);
+ if (s==-1)
+ {
+
+ if ( s==-1)printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is not a valid sequence", seq);
+ }
+ else
+ keep[s]=1;
+
+ //get the distances
+ if ( strstr (mode, "_seq_"))
+ {
+ char **seq;
+ int **M;
+
+ M=read_matrice ("blosum62mt");
+ seq=declare_char (A->nseq, A->len_aln+1);
+ for (a=0; a<A->nseq; a++)
+ {
+ sprintf ( seq[a], "%s", A->seq_al[a]);
+ ungap (seq[a]);
+ }
+
+ sim=declare_int (A->nseq, A->nseq);
+ cov=declare_int (A->nseq, A->nseq);
+
+ for (a=0; a<A->nseq; a++)
+ {
+ if ( s!=a)
+ {
+ sim[s][a]=sim[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"sim");
+ cov[s][a]=cov[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"cov");
+
+ }
+ }
+ free_char (seq, -1);
+ free_int (M,-1);
+ }
+ else
+ {
+ sim=aln2sim_mat (A, "idmat");
+ cov=aln2cov (A);
+ }
+
+ for (a=0; a< A->nseq; a++)
+ {
+ if (a==s)continue;
+ else
+ {
+ if ( sim[s][a]>maxsim || sim[s][a]<minsim|| cov[s][a]<mincov||cov[s][a]>maxcov)keep[a]=-1;
+ else keep[a]=1;
+ }
+ }
+
+ for ( n=0, a=0; a< A->nseq; a++)
+ {
+ if ( keep[a]!=-1)
+ {
+ list[n++]=a;
+ }
+ }
+
+ R=extract_sub_aln (A, n, list);
+ free_int (sim, -1); free_int (cov, -1);vfree (list);
+
+ return R;
+}
+
+
+static int find_worst_seq ( int **sim, int n, int *keep, int max, int direction);
+Alignment *simple_trimseq (Alignment *A, Alignment *K, char *in_mode, char *seq_list, int **sim)
+{
+ int *list;
+ int *keep;
+ int maxnseq, maxsim, nseq_ratio, nc;
+ int new_nseq;
+ int a,b, s, n, k;
+ Alignment *R;
+ char *mode;
+ int outlayers;
+ int direction=1;//remove the higher than
+ int coverage=0; //remove based on coverage
+ static char *field;
+ int *tot_avg;
+ int KeepN=0;
+ int Print=0;
+
+ if ( !field) field=vcalloc (1000, sizeof (char));
+
+ mode=vcalloc ( strlen (in_mode)+10, sizeof (char));
+ sprintf ( mode, "_%s_", in_mode);
+
+ strget_param ( mode, "_%%", "0", "%d", &maxsim);
+ strget_param ( mode, "_n", "0", "%d", &maxnseq);
+ strget_param ( mode, "_N", "0", "%d", &nseq_ratio);
+ strget_param ( mode, "_F", "0", "%d", &nc);
+ strget_param ( mode, "_O", "0", "%d", &outlayers);
+ strget_param ( mode, "_K", "0", "%d", &KeepN);
+
+ strget_param ( mode, "_f", "NAME", "%s", field);
+
+ if ( strstr (mode, "_P_"))Print=1;
+
+ if ( strstr (mode, "_min"))direction=-1;
+ else direction=1;
+
+ if ( strstr (mode, "_cov"))coverage=1;
+ else coverage=0;
+
+
+ if ( nseq_ratio)
+ {
+ maxnseq=(A->nseq*nseq_ratio)/100;
+ maxsim=0;
+ }
+ else if ( maxnseq)
+ {
+ maxsim=0;
+ }
+ else if ( !maxsim)
+ {
+ maxsim=100;
+ }
+
+
+ keep=vcalloc ( A->nseq, sizeof (int));
+ list=vcalloc ( A->nseq, sizeof (int));
+
+
+
+
+ /*Remove Sequences that do not have at least one residue in the first and last nc columns*/
+ if ( nc)
+ {
+ int left, right, full_n,x, y;
+ int *full_list;
+
+ Alignment *F;
+
+ full_list=vcalloc ( A->nseq, sizeof (int));
+ full_n=0;
+ for (x=0; x< A->nseq; x++)
+ {
+ for ( left=0,y=0; y<MIN(A->len_aln,nc); y++)
+ if (!is_gap(A->seq_al[x][y]))left=1;
+
+ for ( right=0,y=MAX(0,(A->len_aln-nc)); y<A->len_aln; y++)
+ if (!is_gap(A->seq_al[x][y]))right=1;
+
+ if ( left && right)full_list[full_n++]=x;
+ }
+ F=extract_sub_aln (A, full_n, full_list);
+ free_aln (A);
+ vfree (full_list);
+ A=F;
+ }
+
+ /*Reorder the sequences according to the tree order: hopefully better phylogenetic coverage after trim*/
+ if (strstr (mode, "_T"))
+ {
+ NT_node **T;
+ Sequence *O;
+
+ if (!sim)sim=sim_array2dist_array ( NULL, MAXID);
+ T=int_dist2nj_tree (sim, A->name, A->nseq, NULL);
+ O=tree2seq (T[3][0], NULL);
+ A=reorder_aln (A, O->name, O->nseq);
+
+ free_int (sim, -1);
+ free_sequence (O, -1);
+ }
+
+ if ( coverage==0)
+ {
+ if ( strstr (mode, "seq_") && !sim)sim=seq2comp_mat (aln2seq(A), "blosum62mt", "sim");
+ else sim=aln2sim_mat (A, "idmat");
+ }
+ else
+ {
+ int b;
+ if ( strstr (mode, "seq_") && !sim)sim=seq2comp_mat (aln2seq(A), "blosum62mt", "cov");
+ else sim=aln2cov (A);
+
+ }
+
+
+ if ( K && K->nseq>0)
+ {
+ for ( a=0; a< K->nseq; a++)
+ if ( (k=name_is_in_list (K->name[a], A->name, A->nseq, MAXNAMES+1))!=-1)
+ {
+
+ keep[k]=1;
+ }
+ }
+ if ( seq_list)
+ {
+ for ( a=0; a< A->nseq; a++)
+ {
+ if (strstr (field, "NAME") && perl_strstr (A->name[a], seq_list)){keep[a]=1;}
+ else if (strstr (field, "COMMENT") && A->seq_comment && perl_strstr(A->seq_comment[a], seq_list)){keep[a]=1;}
+ else if (strstr (field, "SEQ") && perl_strstr((A->S)->seq[a], seq_list)){keep[a]=1;}
+ }
+
+ }
+ for (a=0; a<KeepN; a++)keep[a]=1;
+
+ if (Print)
+ {
+ for ( a=0; a< A->nseq; a++)
+ if ( keep[a]) fprintf ( stderr, "\nFORCED KEEP %s", A->name[a]);
+ }
+
+ new_nseq=A->nseq;
+
+
+ while ( (s=find_worst_seq (sim, A->nseq, keep, maxsim, direction))!=-1 && new_nseq>maxnseq)
+ {
+ for ( a=0; a< A->nseq; a++)sim[a][s]=sim[s][a]=-1;
+ keep[s]=-1;
+ new_nseq--;
+ }
+
+ /*Trim Outlayers*/
+ if (outlayers!=0)
+ {
+ int nn, b;
+ tot_avg=vcalloc ( A->nseq, sizeof (int));
+
+ for (a=0; a<A->nseq; a++)
+ {
+ if ( keep[a]==-1)tot_avg[a]=-1;
+ else
+ {
+ for (nn=0, b=0; b< A->nseq; b++)
+ {
+ if (a==b || keep[b]==-1)continue;
+ else
+ {
+ tot_avg[a]+=sim[a][b];
+ nn++;
+ }
+ }
+ tot_avg[a]=(nn==0)?-1:(tot_avg[a])/nn;
+ }
+ }
+ for ( a=0; a<A->nseq; a++)
+ {
+ if (tot_avg[a]!=-1 && tot_avg[a]<outlayers)
+ {
+ fprintf ( stderr, "\nREMOVED OUTLAYER: %3d %% avg similarity with remaining sequences [Seq %s]", tot_avg[a],A->name[a]);
+ keep[a]=-1;
+ }
+ }
+ vfree ( tot_avg);
+ }
+
+ for ( n=0, a=0; a< A->nseq; a++)
+ {
+ if ( keep[a]!=-1)
+ {
+ list[n++]=a;
+ }
+ }
+
+ R=extract_sub_aln (A, n, list);
+ free_int (sim, -1); vfree (list);
+
+ return R;
+}
+
+int find_worst_seq ( int **sim, int n, int *keep,int max,int direction)
+{
+ int **sc;
+ int a, b, r=0;
+ int si;
+
+ sc=declare_int (n, 2);
+ if (direction==-1)max=100-max;
+
+ for ( a=0; a< n; a++) sc[a][0]=a;
+ for ( a=0; a< n-1; a++)
+ {
+ for ( b=a+1; b<n; b++)
+ {
+
+ if (sim[a][b]>=0)si=(direction==-1)?100-sim[a][b]:sim[a][b];
+ else si=sim[a][b];
+ if ( si>max)
+ {
+ if ( keep[a]!=1)sc[a][1]+=si;
+ if ( keep[b]!=1)sc[b][1]+=si;
+ }
+ }
+ }
+
+ sort_int_inv ( sc, 2, 1, 0, n-1);
+ if ( sc[0][1]>0)r=sc[0][0];
+ else r=-1;
+
+ free_int (sc, -1);
+ if (r!=-1 && keep && keep[r])return -1;
+ else return r;
+}
+
+int find_worst_seq_old ( int **sim, int n, int *keep,int max,int direction)
+{
+ int **sc;
+ int a, b, r=0;
+
+ sc=declare_int (n, 2);
+
+ for ( a=0; a< n; a++) sc[a][0]=a;
+ for ( a=0; a< n-1; a++)
+ {
+ for ( b=a+1; b<n; b++)
+ {
+ if ( direction==1)
+ {
+ if ( sim[a][b]>max)
+ {
+ if ( keep[a]!=1)sc[a][1]+=sim[a][b];
+ if ( keep[b]!=1)sc[b][1]+=sim[a][b];
+ }
+ }
+ else if ( direction == -1)
+ {
+ if ( sim[a][b]<max && sim[a][b]>=0)
+ {
+ if ( keep[a]!=1)sc[a][1]+=sim[a][b];
+ if ( keep[b]!=1)sc[b][1]+=sim[a][b];
+ }
+ }
+ }
+ }
+
+ if ( direction ==1) //remove max
+ {
+ sort_int_inv ( sc, 2, 1, 0, n-1);
+ if ( sc[0][1]>0)r=sc[0][0];
+ else r=-1;
+
+ }
+ else if ( direction ==-1)//remove min
+ {
+ sort_int_inv ( sc, 2, 1, 0, n-1);
+ if ( sc[0][1]>=0)r=sc[0][0];
+ else r=-1;
+ HERE ("** %d %d\n", r,sc[0][1]);
+ }
+ free_int (sc, -1);
+ if (r!=-1 && keep && keep[r])return -1;
+ else return r;
+}
+
+
+Alignment * trimseq( Alignment *A, Sequence *S,char *mode)
+ {
+ Alignment *NA;
+ char *p;
+ int a, b;
+ int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0;
+ char weight_mode[1000];
+ char method[1000];
+ int statistics=0;
+ int trim_direction=TOP;
+ float **sim_weight;
+ int *seq_list;
+ int table=0;
+ int print_name=0;
+ float f_lower_sim, f_upper_sim;
+
+
+
+ /*
+ mode:
+ (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
+ */
+
+
+
+ seq_list=vcalloc ( S->nseq, sizeof (int));
+ for ( a=0; a< A->nseq; a++)
+ {
+ seq_list[a]=1;
+ }
+
+
+ use_aln=aln_is_aligned(A);
+
+
+ if ( mode[0]=='\0')
+ {
+
+ upper_sim=50;
+ lower_sim=0;
+ min_nseq=0;
+ sprintf (weight_mode, "pwsim_fragment");
+ sprintf ( method, "clustering2");
+ }
+ else
+ {
+
+ upper_sim=lower_sim=min_nseq;
+ sprintf (weight_mode, "pwsim_fragment");
+ sprintf ( method, "clustering2");
+ }
+
+ /*
+ U or % (deprecated) Upper bound for pairwise similarity
+ L or m (depercated) Lower bound for pairwise similarity
+ n max number of sequences
+ N max number of sequences as a fraction of thet total
+ S print Statistics
+ T print Table of distances
+ */
+
+
+
+ while ( (p=strtok(mode, "_")))
+ {
+ mode=NULL;
+ if (strm (p, "seq"))use_aln=0;
+ else if ( strm(p,"aln"))use_aln=1;
+ else if (p[0]=='s')statistics=1;
+ else if (p[0]=='t')table=1;
+ else if (p[0]=='p')print_name=1;
+ else if (p[0]=='U')upper_sim=atoi(p+1);
+ else if (p[0]=='L')lower_sim=atoi(p+1);
+ else if (p[0]=='n')min_nseq=atoi(p+1);
+ else if (p[0]=='N')min_nseq=atoi(p+1)*-1;
+ else if (p[0]=='B')trim_direction=BOTTOM;
+ else if (p[0]=='T')trim_direction=TOP;
+ else if (p[0]=='W')sprintf (weight_mode, "%s", p+1);
+ else if (p[0]=='M')sprintf (method, "%s", p+1);
+ else if (p[0]=='K')
+ {
+
+ while ((p=strtok(NULL, ":")))
+ {
+
+ if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
+ {
+ seq_list[a]=2;
+ }
+ }
+ }
+ }
+
+ if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50;
+
+
+
+ if (!S)
+ {
+ fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM);
+ crash("");
+ }
+
+ else if ( min_nseq> S->nseq)
+ {
+ min_nseq=S->nseq;
+ }
+ else if ( min_nseq<0)
+ {
+ if ( min_nseq<-100)
+ {
+ add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n");
+ min_nseq=-100;
+ }
+
+ min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1);
+ }
+
+
+ NA=seq2subseq2 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list );
+
+ if ( table)
+ {
+ fprintf ( stderr, "\nSIMILARITY MATRIX\n");
+ for ( a=0; a< A->nseq-1; a++)
+ for ( b=a+1; b< A->nseq; b++)
+ {
+ fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]);
+ }
+ }
+
+ NA=seq_name2removed_seq_name(S, NA,sim_weight);
+
+ if ( print_name)
+ {
+ fprintf ( stderr, "\nList of sequences with their closest removed neighbors\n");
+ for ( a=0; a< NA->nseq; a++)fprintf ( stderr, "\n%s: %s\n", NA->name[a], NA->seq_comment[a]);
+ }
+
+ if ( statistics)
+ {
+ f_lower_sim=(lower_sim>100)?(float)lower_sim/100:lower_sim;
+ f_upper_sim=(upper_sim>100)?(float)upper_sim/100:upper_sim;
+
+ fprintf ( stderr, "\nTRIM seq Informations:\n");
+ fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
+ fprintf ( stderr, "\tcluster_mode..: %s\n" ,method);
+ fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode);
+ fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim);
+ fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim);
+ fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq);
+ fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq);
+ fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top");
+ }
+
+ return NA;
+ }
+
+Alignment * tc_trimseq( Alignment *A, Sequence *S,char *mode)
+ {
+ Alignment *NA;
+ Sequence *TS;
+ char *trimfile, *alnfile;
+ int *seq_list;
+ int a, nseq=0, sim=0;
+ char *p;
+ char command[100000];
+ char keep_list[10000];
+
+ int top, bottom, middle, pmiddle;
+
+ keep_list[0]='\0';
+
+ seq_list=vcalloc ( S->nseq, sizeof (int));
+ for ( a=0; a< A->nseq; a++)
+ {
+ seq_list[a]=1;
+ }
+
+ trimfile=vtmpnam (NULL);
+ alnfile=vtmpnam (NULL);
+ if ( !aln_is_aligned (A))
+ {
+ fprintf ( stderr, "\ntrimTC: computation of an Approximate MSA [");
+ A=compute_tcoffee_aln_quick ( A, NULL);
+ fprintf ( stderr, "DONE]\n");
+ }
+ output_clustal_aln (alnfile, A);
+
+
+ while ( (p=strtok(mode, "#")))
+ {
+ mode=NULL;
+
+
+ if (p[0]=='%' || p[0]=='S')sim=(p[1]=='%')?atoi(p+2):atoi(p+1);
+ else if (p[0]=='n' || p[0]=='N')nseq=atoi(p+1);
+ else if (p[0]=='K')
+ {
+ if ( (a=name_is_in_list (p+1, A->name, A->nseq, 100))!=-1)
+ {
+ seq_list[a]=2;
+ }
+
+ }
+ }
+ if ( nseq ==0 && sim ==0)
+ {
+ fprintf ( stderr, "\nERROR: trimTC\nIndicate the maximum number of sequences Nnseq\nOR the maximum average similarity of the chosen sequencesSx\nEX: +trimTC S20 OR +trimTC N5");
+ fprintf ( stderr, "\n[FATAL:%s]", PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+ for ( a=0; a<A->nseq; a++)if (seq_list[a]==2){strcat ( keep_list, A->name[a]);strcat ( keep_list," ");}
+
+ if ( sim)
+ {
+ sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,sim);
+ if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
+ my_system ( command);
+ TS=read_sequences (trimfile);
+ }
+ else if ( nseq && A->nseq>nseq)
+ {
+
+ top=100;bottom=0;
+ pmiddle=0;middle=50;
+
+ sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0",get_string_variable("t_coffee"), alnfile, trimfile,middle);
+ if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
+ my_system ( command);
+
+ TS=read_sequences (trimfile);
+ fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t",middle, TS->nseq);
+
+ if ( TS->nseq>nseq)top=middle;
+ else if ( TS->nseq<nseq)bottom=middle;
+ pmiddle=middle;
+ middle=(top-bottom)/2+bottom;
+
+ while (TS->nseq!=nseq && pmiddle!=middle)
+ {
+
+ sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,middle);
+ if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
+ my_system ( command);
+ free_sequence (TS, -1);
+ TS=read_sequences (trimfile);
+ fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t", middle, TS->nseq);
+
+ if ( TS->nseq>nseq)top=middle;
+ else if ( TS->nseq<nseq)bottom=middle;
+ pmiddle=middle;
+ middle=(top-bottom)/2+bottom;
+ }
+ }
+ else
+ {
+ TS=aln2seq (A);
+ }
+ NA=seq2aln (TS, NULL, 1);
+ vremove ( alnfile);
+ fprintf ( stderr, "\n");
+
+ return NA;
+ }
+
+Alignment* seq2subseq3( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list)
+{
+ int a, b;
+ int new_nseq;
+
+ /*OUTPUT*/
+ char **seq, **name;
+ Sequence *NS;
+ Alignment *NA;
+ float sim, lower_sim, upper_sim;
+
+ lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim;
+ upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim;
+
+ sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode);
+
+ name=declare_char (S->nseq, (MAXNAMES+1));
+ seq= declare_char (S->nseq, S->max_len+1);
+
+ /*
+ Remove every sequence that is more than upper_sim and less than lower_sim similar to the master sequences
+ the master sequence(s) are those for which seq_list[x]==2
+ */
+
+
+
+
+ new_nseq=A->nseq;
+
+
+ for (a=0; a< A->nseq; a++)
+ {
+ if ( seq_list[a]==2)
+ {
+
+ for ( b=0; b< A->nseq;b++)
+ {
+ sim=100-sim_weight[0][a][b];
+ if (seq_list[b]==1 && (sim>upper_sim || sim<lower_sim))
+ {
+ seq_list[b]=0;
+ new_nseq--;
+ }
+ }
+
+ }
+ }
+
+ /*Prepare the new sequence List*/
+
+ for (b=0, a=0; a<S->nseq; a++)
+ {
+ if ( seq_list[a])
+ {
+ sprintf ( name[b], "%s", S->name[a]);
+ sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] );
+ b++;
+ }
+ }
+
+
+ NS=fill_sequence_struc (new_nseq,seq,name);
+ NA=seq2aln(NS,NULL,1);
+
+ if ( use_aln && A)
+ {
+ NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1);
+
+ for (b=0, a=0; a<S->nseq; a++)
+ {
+ if ( seq_list[a])
+ {
+ sprintf ( NA->seq_al[b] , "%s",A->seq_al[a]);
+ b++;
+ }
+ }
+
+ NA->len_aln=A->len_aln;
+ ungap_aln(NA);
+ }
+
+
+ return NA;
+}
+Alignment* seq2subseq2( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list)
+{
+ int a, b;
+ int new_nseq;
+ int seq_index=0;
+ /*OUTPUT*/
+ char **seq, **name;
+ Sequence *NS;
+ Alignment *NA;
+ float lower_sim, upper_sim;
+
+ lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim;
+ upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim;
+
+
+ sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode);
+
+ name=declare_char (S->nseq, (MAXNAMES+1));
+ seq= declare_char (S->nseq, S->max_len+1);
+
+ /*
+ 1 REMOVE OUTLAYERS
+ 2 REMOVE CLOSELY RELATED SEQUENCES
+ 3 IF STILL TOO MANY SEQUENCES:
+ REMOVE THE MOST CLOSELY RELATED ONES
+ */
+
+
+ /*1 Remove outlayers*/
+
+ new_nseq=A->nseq;
+
+
+ /*1 Remove outlayers*/
+ while ( lower_sim && (extreme_seq(BOTTOM,A,sim_weight[0],seq_list, &seq_index) <lower_sim) && ((new_nseq)>min_nseq) && seq_index!=-1)
+ {
+
+ if ( seq_list[seq_index]==1)
+ {
+ seq_list[seq_index]=0;
+ new_nseq--;
+ }
+ }
+ /*2 Remove close relative*/
+
+
+ while ( upper_sim && (extreme_seq(TOP, A,sim_weight[0],seq_list, &seq_index)>upper_sim) && ((new_nseq)>min_nseq)&& seq_index!=-1)
+ {
+
+ if ( seq_list[seq_index]==1)
+ {
+ seq_list[seq_index]=0;
+ new_nseq--;
+ }
+ }
+
+
+ /*Remove extra sequences*/
+
+ while ( min_nseq>0 && new_nseq>min_nseq && seq_index!=-1)
+ {
+
+ extreme_seq(trim_direction, A,sim_weight[0],seq_list, &seq_index);
+
+ if ( seq_index==-1)break;
+ if ( seq_list[seq_index]==1)
+ {
+ seq_list[seq_index]=0;
+ new_nseq--;
+ }
+ }
+
+
+ /*Prepare the new sequence List*/
+
+ for (b=0, a=0; a<S->nseq; a++)
+ {
+ if ( seq_list[a])
+ {
+ sprintf ( name[b], "%s", S->name[a]);
+ sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] );
+ b++;
+ }
+ }
+
+
+ NS=fill_sequence_struc (new_nseq,seq,name);
+ NA=seq2aln(NS,NULL,1);
+
+ if ( use_aln && A)
+ {
+ NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1);
+
+ for (b=0, a=0; a<S->nseq; a++)
+ {
+ if ( seq_list[a])
+ {
+ sprintf ( NA->seq_al[b],"%s",A->seq_al[a]);
+ b++;
+ }
+ }
+
+ NA->len_aln=A->len_aln;
+ ungap_aln(NA);
+ }
+
+
+ return NA;
+}
+
+float extreme_seq (int direction, Alignment *A,float **sim_weight,int *seq_list, int *seq_index)
+{
+
+ /*find the closest relative of each sequence
+ Return:
+ Direction= BOTTOM: the sequence whose closest relative is the most distant
+ Direction= TOP: the sequence whose closest relative is the closest
+ weight: different sequences=100
+ similar sequences =0
+ */
+ int a, b;
+
+ float top_sim,bottom_sim, best_sim, sim;
+ int top_seq, bottom_seq;
+
+ bottom_seq=top_seq=seq_index[0]=-1;
+ top_sim=-1;
+ bottom_sim=101;
+
+ for (a=0; a< A->nseq; a++)
+ {
+ if (seq_list[a]!=1)continue;
+
+ for ( best_sim=0, b=0; b< A->nseq; b++)
+ {
+ if ( a==b || !seq_list[b])continue;
+
+ sim=100-sim_weight[a][b];
+ if (sim>best_sim)
+ {
+ best_sim=sim;
+ }
+ }
+
+ if ( best_sim>top_sim)
+ {
+ top_seq=a;
+ top_sim=best_sim;
+ }
+
+ if ( best_sim<bottom_sim)
+ {
+ bottom_seq=a;
+ bottom_sim=best_sim;
+ }
+
+ }
+ if ( direction==BOTTOM ){seq_index[0]= bottom_seq; return bottom_sim;}
+ else if ( direction==TOP){seq_index[0]= top_seq; return top_sim;}
+ else
+ {
+ seq_index[0]=-1;
+ return -1;
+ }
+}
+
+
+
+
+Alignment* seq2subseq1( Alignment *A, Sequence *S,int use_aln, int percent,int max_nseq, int ms,char *weight_mode)
+ {
+ float **pw_weight,**sim_weight, **seq_weight;
+ int a,b,c,d;
+ float sum, chosen,last_chosen, last_nchosen,nchosen;
+ int condition1, condition2;
+ Sequence *NS;
+ Alignment *NA;
+ char **name, **seq;
+ float score, best_score;
+ int best_seq=0;
+ int *seq_list, *used_seq_list;
+
+ /*
+ mode:
+ (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
+ */
+
+ sim_weight=get_weight ((use_aln)?A:NULL, S, weight_mode);
+ pw_weight=declare_float (S->nseq, S->nseq);
+ seq_weight=declare_float ( S->nseq, 2);
+
+
+ for (best_score=0,a=0; a<S->nseq; a++)
+ {
+ for ( b=0; b<S->nseq; b++)
+ {
+ if ( a==b)continue;
+ seq_weight[a][0]+=sim_weight[a][b];
+ }
+ seq_weight[a][0]=seq_weight[a][0]/(S->nseq-1);
+ score=seq_weight[a][0]=100-seq_weight[a][0];
+
+ if ( score>best_score)
+ {
+ best_seq=a;
+ best_score=score;
+ }
+
+ }
+ for (a=0; a<S->nseq; a++)
+ {
+ for ( b=0; b<S->nseq; b++)
+ {
+ if ( a==b)continue;
+ pw_weight[a][b]=sim_weight[a][b]*seq_weight[a][0]*seq_weight[b][0]/(100*100);
+
+ }
+ }
+
+
+ seq_list=vcalloc ( S->nseq, sizeof (int));
+ used_seq_list=vcalloc ( S->nseq, sizeof (int));
+
+
+
+ name=declare_char (S->nseq, (MAXNAMES+1));
+ seq= declare_char (S->nseq, S->max_len+1);
+
+ /*compute the normalization factor*/
+ for (sum=0,d=0; d< S->nseq; d++)
+ {
+ for (score=0,c=0; c<S->nseq; c++)
+ {
+ if ( c!=d)
+ score=MAX(score, 100-sim_weight[c][d]);
+ }
+ sum+=score;
+ }
+ sum=sum/S->nseq;
+ /*chose the first sequence */
+ for ( best_score=0,a=0; a< S->nseq; a++)
+ {
+ for (score=0, b=0; b< S->nseq; b++)
+ {
+ score+=100-sim_weight[a][b];
+ }
+ if ( score>best_score)
+ {
+ best_seq=a;
+ best_score=score;
+ }
+
+ }
+
+
+ last_chosen=chosen=((best_score/S->nseq)*100)/sum;
+ nchosen=last_nchosen=1;
+ seq_list[0]=best_seq;
+ used_seq_list[best_seq]=1;
+
+ sprintf ( name[0],"%s", S->name[seq_list[0]]);
+ sprintf ( seq[0],"%s", S->seq[seq_list[0]]);
+ nchosen=last_nchosen=1;
+
+
+ fprintf ( stderr, "\nTRIM:\n");
+ fprintf ( stderr, "\n1-Chosen Sequences\n");
+ /*Assemble the list of sequences*/
+ for (a=1; a< S->nseq; a++)
+ {
+ for (best_score=0,b=0; b< S->nseq; b++)
+ {
+ if (used_seq_list[b]);
+ else
+ {
+ score=pw_weight[seq_list[0]][b]+1;
+ for (c=0; c<a; c++)
+ score=MIN(score,pw_weight[seq_list[c]][b]);
+
+ if ( score>=best_score)
+ {
+ best_seq=b;
+ best_score=score;
+ }
+
+ }
+ }
+ seq_list[a]=best_seq;
+ used_seq_list[best_seq]=1;
+
+
+
+ for ( chosen=0,d=0; d< S->nseq; d++)
+ {
+ for (score=0, c=0; c<=a; c++)
+ {
+ if ( seq_list[c]!=d)
+ score=MAX(score, 100-sim_weight[seq_list[c]][d]);
+ }
+ chosen+=score;
+
+ }
+
+ chosen=((chosen/S->nseq)*100)/sum;
+ nchosen=a+1;
+
+ condition1= (int)chosen<=(int)percent || !percent;
+ condition2=(nchosen)<=max_nseq || !max_nseq;
+
+ if (condition1 && condition2)
+ {
+ fprintf ( stderr, "\tADD %s (set score: %.2f %%)\n", S->name[seq_list[a]], chosen);
+ sprintf ( name[a],"%s", S->name[seq_list[a]]);
+ sprintf ( seq[a],"%s", S->seq[seq_list[a]]);
+
+ }
+ else
+ {
+ break;
+ }
+ last_chosen=chosen;
+ last_nchosen=nchosen;
+ }
+
+ NS=fill_sequence_struc (last_nchosen,seq,name);
+ NA=seq2aln(NS,NULL,1);
+ fprintf ( stderr, "\n2-Informations:\n");
+ fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
+ fprintf ( stderr, "\tweight_mode...: %s\n" ,weight_mode);
+ fprintf ( stderr, "\tpercent_weight: %.2f%% (max=%d%%)\n",last_chosen,percent);
+ fprintf ( stderr, "\tn_seq.........: %d\n" ,NS->nseq);
+ fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NS->nseq*100)/S->nseq);
+
+ return NA;
+ }
+Sequence * seq_weight2species_weight (Alignment *A, Sequence *S)
+{
+ float *wsp;
+ float *wseq;
+ int a,b;
+
+ S->W=declare_weights(S->nseq);
+ if (!A->S || !(A->S)->W)aln2voronoi_weights (A);
+
+ wseq=((A->S)->W)->SEQ_W;
+ wsp=(S->W)->SEQ_W;
+ for ( a=0; a< S->nseq; a++)
+ {
+ for (b=0; b<A->nseq; b++)
+ if ( strstr (A->name[b], S->name[a]))wsp[a]+=wseq[b];
+ }
+ for (a=0; a<S->nseq; a++)
+ fprintf ( stderr, "\nVoronoi Weights: Species %s ---> %.2f\n", S->name[a], wsp[a]);
+ return S;
+}
+Alignment * aln2voronoi_weights (Alignment *A)
+{
+ int a, b, c;
+ float t=0;
+ int **tab;
+ float *w;
+
+ tab=declare_int (256, A->nseq+1);
+ if (A->S)free_sequence (A->S, (A->S)->nseq);
+ A->S=aln2seq(A);
+ (A->S)->W=declare_weights (A->nseq);
+ w=((A->S)->W)->SEQ_W;
+
+ for (a=0; a<A->len_aln; a++)
+ {
+ for ( b=0; b<A->nseq; b++)
+ {
+ c= A->seq_al[b][a];
+ if (!is_gap(c))
+ {
+ c=tolower(c);
+ tab[c][++tab[c][0]]=b;
+ }
+ }
+ for (c=0; c<256; c++)
+ {
+ if (tab[c][0])
+ {
+ for (b=1; b<=tab[c][0]; b++)
+ {
+ w[tab[c][b]]+=(float)1/(float)tab[c][0];
+ t+=(float)1/(float)tab[c][0];
+ }
+ }
+ tab[c][0]=0;
+ }
+ }
+ for (a=0; a<A->nseq; a++)
+ {
+ w[a]=(w[a]/t)*A->nseq;
+ }
+
+ return A;
+}
+
+float ** get_weight ( Alignment *A, Sequence *S, char *mode)
+{
+ char *aln_name;
+ char *weight_name;
+ char *seq_name;
+ char command[LONG_STRING];
+ char program[LONG_STRING];
+ float **weight;
+ FILE *fp;
+ int c;
+
+ if ( !mode || !mode[0] || strm (mode, "msa"))
+ {
+ if ( getenv ( "SEQ2MSA_WEIGHT")==NULL)sprintf (program, "%s",SEQ2MSA_WEIGHT);
+ else sprintf ( program, "%s", (getenv ( "SEQ2MSA_WEIGHT")));
+ }
+ else if ( strm(mode, "pwsim") ||strm(mode, "pwsim_fragment") )
+ {
+ return seq2pwsim (A, S, mode);
+ }
+ else
+ {
+ if (getenv (mode))sprintf ( program, "%s", (getenv (mode)));
+ else fprintf ( stderr, "\nERROR: %s is not a valid mode for weight computation [FATAL:%s]", mode, PROGRAM);
+ }
+
+ /*MSA weights*/
+ seq_name=vtmpnam(NULL);
+ aln_name=vtmpnam(NULL);
+ weight_name=vtmpnam(NULL);
+ weight=declare_float (S->nseq+1, 2);
+
+
+
+ if (A)
+ {
+ output_clustal_aln (seq_name,A);
+ output_fasta_seq (aln_name,A);
+ sprintf ( command, "%s %s -i %s -w %s", program, seq_name, aln_name, weight_name);
+ }
+ else
+ {
+ A=seq2aln(S,A,1);
+ output_fasta_seq (seq_name,A);
+ sprintf ( command, "%s %s -w %s", program, seq_name, weight_name);
+ }
+
+
+ my_system ( command);
+
+ fp=vfopen( weight_name, "r");
+ while ( (c=fgetc(fp))!='$');
+ c=fgetc(fp);
+ c=0;
+ while ( (fscanf (fp, "%*s %f\n",&(weight[c][1])))==1)
+ {weight[c][0]=c;c++;}
+ vfclose (fp);
+
+
+ return weight;
+}
+
+float **seq2pwsim ( Alignment *A, Sequence *S, char *mode)
+{
+ int a, b, c;
+ float d,t;
+ float **W;
+ Alignment *B;
+ W=declare_float (S->nseq, S->nseq);
+
+
+
+ for (a=0; a< S->nseq; a++)
+ for ( b=a; b<S->nseq; b++)
+ {
+ if ( a==b){d=1;}
+ else if (!A)
+ {
+
+ B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise");
+ for (t=0,d=0,c=0; c<B->len_aln; c++)
+ {
+ d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c]));
+ t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c]));
+ }
+ t=(strm ( mode, "pwsim_fragment"))?B->len_aln:t;
+
+ d=d/((t==0)?1:t);
+ free_aln(B);
+ }
+ else
+ {
+ for (t=0,d=0,c=0; c<A->len_aln; c++)
+ {
+ d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c]));
+ t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c]));
+ }
+ d=d/((t==0)?1:t);
+ }
+
+
+ W[a][b]=W[b][a]=(1-d)*100;
+ }
+
+
+ return W;
+
+}
+
+float **seq2pwsim_fragment ( Alignment *A, Sequence *S, char *mode)
+{
+
+
+ int a, b, c;
+ float d,t;
+ float **W;
+ Alignment *B;
+ W=declare_float (S->nseq, S->nseq);
+
+
+
+
+ for (a=0; a< S->nseq; a++)
+ for ( b=a; b<S->nseq; b++)
+ {
+ if ( a==b){d=1;}
+ else if (!A)
+ {
+
+ B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise");
+ for (t=0,d=0,c=0; c<B->len_aln; c++)
+ {
+ d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c]));
+ t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c]));
+ }
+
+ d=d/((t==0)?1:t);
+ free_aln(B);
+ }
+ else
+ {
+ for (t=0,d=0,c=0; c<A->len_aln; c++)
+ {
+ d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c]));
+ t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c]));
+ }
+ d=d/((t==0)?1:t);
+ }
+
+
+ W[a][b]=W[b][a]=(1-d)*100;
+ }
+
+
+ return W;
+
+}
+
+/********************************************************************/
+/* */
+/* AMINO ACID FUNCTIONS */
+/* */
+/* */
+/* */
+/********************************************************************/
+//Builds an extended alphabet from a string
+char** string2alphabet (char *string, int depth, int *falp_size)
+{
+ int max_s;
+ int a, b,c, l, n;
+ char buf[1000];
+ char **alp;
+ int alp_size;
+
+ char ***alp2;
+ int *alp2_size;
+
+ int *array;
+ char **falp;
+
+
+ l=strlen (string);
+ array=vcalloc ( 256, sizeof (int));
+
+
+ max_s=l+1;
+ falp_size[0]=0;
+ falp=declare_char (l+1, 2);
+
+ alp=declare_char(l,2);
+ alp_size=0;
+
+ array=vcalloc ( 256, sizeof (int));
+ for (a=0;a<l; a++)
+ {
+ if (!array[(int)string[a]])
+ {
+ array[(int)string[a]]=1;
+ sprintf (alp[alp_size++], "%c", string[a]);
+ sprintf (falp[falp_size[0]++], "%c", string[a]);
+ }
+ }
+ sprintf ( falp[falp_size[0]++], "*");
+ vfree (array);
+
+ if ( depth==1)
+ {
+ free_char (alp, -1);
+ return falp;
+ }
+ alp2=vcalloc ( depth, sizeof (char**));
+ alp2_size=vcalloc (depth, sizeof (int));
+
+ for (a=0; a<depth; a++)
+ {
+ alp2[a]=alp;
+ alp2_size[a]=alp_size;
+ }
+
+
+ for (a=2; a<=depth; a++)
+ {
+ char ***result_array;
+
+ result_array=generate_array_string_list (a, alp2, alp2_size, &n, NULL, NO_OVERLAP);
+ max_s+=n+1;
+ falp=vrealloc (falp, sizeof (char**)*max_s);
+ for (b=0; b<n; b++)
+ {
+ buf[0]='\0';
+ for (c=0; c<a; c++)
+ {
+ strcat (buf, result_array[b][c]);
+ }
+ falp[falp_size[0]]=vcalloc (strlen (buf)+1, sizeof (char));
+ sprintf ( falp[falp_size[0]++], "%s", buf);
+ vfree ( result_array[b]);
+ }
+ vfree (result_array);
+
+ }
+
+ falp[falp_size[0]]=vcalloc (2, sizeof (char));
+ sprintf ( falp[falp_size[0]++], "*");
+ free_char (alp, -1);
+ return falp;
+}
+
+char** make_group_aa (int *ngroup, char *mode)
+ {
+/*mode: indicates which matrix will be used for the grouping*/
+/*n_group: pointer to the number of groups */
+/*return value: an array of strings containing the AA of each group */
+
+
+ int **matrix;
+ int a, b,c,is_in;
+ char buf[28];
+ char **group_list;
+ char *matrix_name;
+ int extend=0;
+ matrix_name=vcalloc ( 100, sizeof (char));
+
+ if (ngroup[0]==-1)extend=1;
+
+ ngroup[0]=0;
+ group_list=declare_char ( 100, 27);
+
+ if (extend)
+ {
+ sprintf ( group_list[ngroup[0]++], "gG");
+ sprintf ( group_list[ngroup[0]++], "pP");
+ sprintf ( group_list[ngroup[0]++], "aA");
+ sprintf ( group_list[ngroup[0]++], "cC");
+ sprintf ( group_list[ngroup[0]++], "dD");
+ sprintf ( group_list[ngroup[0]++], "eE");
+
+ sprintf ( group_list[ngroup[0]++], "fF");
+ sprintf ( group_list[ngroup[0]++], "hH");
+ sprintf ( group_list[ngroup[0]++], "iI");
+ sprintf ( group_list[ngroup[0]++], "kK");
+ sprintf ( group_list[ngroup[0]++], "lL");
+ sprintf ( group_list[ngroup[0]++], "mM");
+ sprintf ( group_list[ngroup[0]++], "nN");
+ sprintf ( group_list[ngroup[0]++], "qQ");
+ sprintf ( group_list[ngroup[0]++], "rR");
+
+ sprintf ( group_list[ngroup[0]++], "sS");
+ sprintf ( group_list[ngroup[0]++], "tT");
+ sprintf ( group_list[ngroup[0]++], "vV");
+ sprintf ( group_list[ngroup[0]++], "wW");
+ sprintf ( group_list[ngroup[0]++], "*");
+ }
+
+ if ( mode && mode[0]=='_'){mode++;sprintf ( matrix_name, "%s", mode);}
+
+ if (mode==NULL || mode[0]=='\0')sprintf ( matrix_name, "idmat");
+ else if ( strstr (mode, "sim") || strm (mode, "idmat") || mode==NULL)
+ {
+ sprintf ( group_list[ngroup[0]++], "aA");
+ sprintf ( group_list[ngroup[0]++], "bB");
+ sprintf ( group_list[ngroup[0]++], "cC");
+ sprintf ( group_list[ngroup[0]++], "dD");
+ sprintf ( group_list[ngroup[0]++], "eE");
+ sprintf ( group_list[ngroup[0]++], "fF");
+ sprintf ( group_list[ngroup[0]++], "gG");
+ sprintf ( group_list[ngroup[0]++], "hH");
+ sprintf ( group_list[ngroup[0]++], "iI");
+ sprintf ( group_list[ngroup[0]++], "jJ");
+ sprintf ( group_list[ngroup[0]++], "kK");
+ sprintf ( group_list[ngroup[0]++], "lL");
+ sprintf ( group_list[ngroup[0]++], "mM");
+ sprintf ( group_list[ngroup[0]++], "nN");
+ sprintf ( group_list[ngroup[0]++], "oO");
+ sprintf ( group_list[ngroup[0]++], "pP");
+ sprintf ( group_list[ngroup[0]++], "qQ");
+ sprintf ( group_list[ngroup[0]++], "rR");
+ sprintf ( group_list[ngroup[0]++], "sS");
+ sprintf ( group_list[ngroup[0]++], "tT");
+ sprintf ( group_list[ngroup[0]++], "uU");
+ sprintf ( group_list[ngroup[0]++], "vV");
+ sprintf ( group_list[ngroup[0]++], "wW");
+ sprintf ( group_list[ngroup[0]++], "xX");
+ sprintf ( group_list[ngroup[0]++], "yY");
+ sprintf ( group_list[ngroup[0]++], "zZ");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else if ( strm (mode, "simple"))
+ {
+ sprintf ( group_list[ngroup[0]++], "avilmAVILM");
+ sprintf ( group_list[ngroup[0]++], "dekrDEKR");
+ sprintf ( group_list[ngroup[0]++], "stcnqhSTCNQH");
+ sprintf ( group_list[ngroup[0]++], "wfyWFY");
+ sprintf ( group_list[ngroup[0]++], "gG");
+ sprintf ( group_list[ngroup[0]++], "pP");
+ vfree (matrix_name);
+ return group_list;
+ }
+
+ else if ( strm (mode, "mafft"))
+ {
+
+
+ sprintf ( group_list[ngroup[0]++],"agjopstAGJOPST");
+ sprintf ( group_list[ngroup[0]++],"ilmvILMV");
+ sprintf ( group_list[ngroup[0]++],"bdenqzBDENQZ");
+ sprintf ( group_list[ngroup[0]++],"hkrHKR");
+ sprintf ( group_list[ngroup[0]++],"fwyFWY");
+ sprintf ( group_list[ngroup[0]++],"cC");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else if ( strm (mode, "clustalw"))
+ {
+
+ sprintf ( group_list[ngroup[0]++],"astaASTA");
+ sprintf ( group_list[ngroup[0]++],"bneqkBNEQK");
+ sprintf ( group_list[ngroup[0]++],"cnhqkCNHQK");
+ sprintf ( group_list[ngroup[0]++],"dndeqDNDEQ");
+ sprintf ( group_list[ngroup[0]++],"eqhrkEQHRK");
+ sprintf ( group_list[ngroup[0]++],"fmilvFMILV");
+ sprintf ( group_list[ngroup[0]++],"gmilfGMILF");
+ sprintf ( group_list[ngroup[0]++],"hhyHHY");
+ sprintf ( group_list[ngroup[0]++],"ifywIFYW");
+ sprintf ( group_list[ngroup[0]++],"jcJC");
+ sprintf ( group_list[ngroup[0]++],"kpKP");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else if ( strm (mode, "polarity"))
+ {
+
+ sprintf ( group_list[ngroup[0]++],"eqrsdnkhtEQRSDNKHT");
+ sprintf ( group_list[ngroup[0]++],"pP");
+ sprintf ( group_list[ngroup[0]++],"gG");
+ sprintf ( group_list[ngroup[0]++],"cC");
+ sprintf ( group_list[ngroup[0]++],"fywFYW");
+ sprintf ( group_list[ngroup[0]++],"iavlmIAVLM");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else if ( strm (mode, "vasiliky"))
+ {
+ ngroup[0]=0;
+ sprintf ( group_list[ngroup[0]++], "rkRK");
+ sprintf ( group_list[ngroup[0]++], "deDE");
+ sprintf ( group_list[ngroup[0]++], "qhQH");
+ sprintf ( group_list[ngroup[0]++], "vilmVILM");
+ sprintf ( group_list[ngroup[0]++], "fyFY");
+ sprintf ( group_list[ngroup[0]++], "sS");
+ sprintf ( group_list[ngroup[0]++], "wW");
+ sprintf ( group_list[ngroup[0]++], "aA");
+ sprintf ( group_list[ngroup[0]++], "cC");
+ sprintf ( group_list[ngroup[0]++], "gG");
+ sprintf ( group_list[ngroup[0]++], "nN");
+ sprintf ( group_list[ngroup[0]++], "pP");
+ sprintf ( group_list[ngroup[0]++], "tT");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else if ( strm (mode, "clustalw_col"))
+ {
+ sprintf ( group_list[ngroup[0]++], "staSTA");
+ sprintf ( group_list[ngroup[0]++], "neqkNEQK");
+ sprintf ( group_list[ngroup[0]++], "nhqkNHQK");
+ sprintf ( group_list[ngroup[0]++], "ndeqNDEQ");
+ sprintf ( group_list[ngroup[0]++], "qhrkQHRK");
+ sprintf ( group_list[ngroup[0]++], "milvMILV");
+ sprintf ( group_list[ngroup[0]++], "milfMILF");
+ sprintf ( group_list[ngroup[0]++], "hyHY");
+ sprintf ( group_list[ngroup[0]++], "fywFYW");
+ sprintf ( group_list[ngroup[0]++], "gG");
+ sprintf ( group_list[ngroup[0]++], "pP");
+ sprintf ( group_list[ngroup[0]++], "cC");
+ vfree (matrix_name);
+
+ return group_list;
+ }
+ else if ( strm (mode, "clustalw_dot"))
+ {
+ sprintf ( group_list[ngroup[0]++], "csaCSA");
+ sprintf ( group_list[ngroup[0]++], "atvATV");
+ sprintf ( group_list[ngroup[0]++], "sagSAG");
+ sprintf ( group_list[ngroup[0]++], "stnkSTNK");
+ sprintf ( group_list[ngroup[0]++], "stpaSTPA");
+ sprintf ( group_list[ngroup[0]++], "sgndSGND");
+ sprintf ( group_list[ngroup[0]++], "sndeqkSNDEQK");
+ sprintf ( group_list[ngroup[0]++], "ndeqhkNDEQHK");
+ sprintf ( group_list[ngroup[0]++], "neqhrkNEQHRK");
+ sprintf ( group_list[ngroup[0]++], "fvlimFVLIM");
+ sprintf ( group_list[ngroup[0]++], "hfyHFY");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else if ( strm (mode, "make_all"))
+ {
+ ngroup[0]=1;
+ sprintf ( group_list[0], "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+ vfree (matrix_name);
+ return group_list;
+ }
+ else sprintf ( matrix_name, "%s", mode);
+
+ matrix=read_matrice ( matrix_name);
+
+ for ( a=0;a< 26; a++)
+ {
+ if ( matrix[a][a]>0)
+ {
+ for ( c=0,b=0;b< 26; b++)
+ {
+
+ if ( matrix[a][b]>0 && matrix[b][b]>0)
+ {
+ buf[c++]=b+'A';
+ buf[c++]=b+'a';
+ }
+ }
+ buf[c]='\0';
+ for ( is_in=0,b=0; b< ngroup[0]; b++)if ( strcmp (buf, group_list[b])==0)is_in=1;
+ if (is_in==0)sprintf ( group_list[ngroup[0]++], "%s", buf);
+
+ }
+ }
+ free_int (matrix, -1);
+ vfree (matrix_name);
+
+ return group_list;
+ }
+char** make_group_aa_upgma (char*matrix, int max_n)
+ {
+ char **group_list;
+ int **mat;
+ int *used;
+ int a, b, ba, bb, best, set, l, n;
+ l=26;
+
+ group_list=declare_char (l+1, l+1);
+ for (a=0; a<l; a++)group_list[a][0]='a'+a;
+ mat=read_matrice(matrix);
+ used=vcalloc ( l, sizeof (int));
+ n=l;
+
+ while (n>max_n)
+ {
+ for (set=0,a=0; a<l-1; a++)
+ for (b=a+1; b<l; b++)
+ {
+ if (used[a]||used[b])continue;
+
+ if (set==0 || mat[a][b]>best)
+ {
+ best=mat[a][b];
+ ba=a;
+ bb=b;
+ set=1;
+ }
+ }
+
+ for (a=0; a<l; a++)
+ {
+ mat[ba][a]=mat[a][ba]=(mat [ba][a]+mat[bb][a])/2;
+ used[bb]=1;
+ }
+ strcat (group_list[ba], group_list[bb]);
+ vfree (group_list[bb]);
+ group_list[bb]=NULL;
+
+ n--;
+ }
+
+ for (n=0,a=0; a<l; a++)
+ {
+ if ( group_list[a])
+ group_list[n++]=group_list[a];
+ }
+ vfree (used); free_int (mat, -1);
+ return group_list;
+ }
+
+int find_group_aa_distribution (char *col, int nseq,int n_group, char **gl, int *distrib, char *mode )
+ {
+ static int *distribution;
+ static char **lgl;
+ static int ln_group;
+ int a, b, c;
+ int *d;
+ char **gl2;
+ int n_group2;
+
+
+
+ if ( lgl==NULL)
+ lgl=make_group_aa ( &ln_group, mode);
+
+ if ( gl==NULL)
+ {
+ gl2=lgl;
+ n_group2=ln_group;
+ }
+ else
+ {
+ gl2=gl;
+ n_group2=n_group;
+ }
+
+ if ( distribution==NULL || ln_group<n_group)distribution=vcalloc ( n_group2, sizeof (int));
+ if ( distrib==NULL)d=distribution;
+ else d=distrib;
+
+
+ for ( a=0; a< n_group2; a++)d[a]=0;
+
+ for ( a=0; a< nseq; a++)
+ {
+ for ( b=0; b< n_group2; b++)
+ d[b]+=is_in_set (col[a], gl2[b]);
+ }
+ c=d[0];
+ for ( a=0; a< n_group2; a++)
+ c=(d[a]>c)?d[a]:c;
+ return c;
+ }
+
+
+
+int is_in_same_group_aa ( char r1, char r2, int n_group, char **gl, char *mode)
+ {
+ int a;
+ static char **lgl;
+ static int ln_group;
+
+ char **gl2;
+ int n_group2;
+
+ /*use mode=idmat for similarity based on id*/
+
+ r1=toupper(r1);
+ r2=toupper(r2);
+ if (mode==NULL)return (r1==r2)?1:0;
+
+ if ( strm (mode, "clean"))
+ {
+ free_char (lgl, -1);
+ lgl=NULL;
+ ln_group=0;
+ return 0;
+ }
+ else if ( strstr (mode, "cov"))
+ {
+ return 1;
+ }
+
+ if ( lgl==NULL)
+ {
+ lgl=make_group_aa ( &ln_group, mode);
+ }
+
+ if ( gl==NULL)
+ {
+ gl2=lgl;
+ n_group2=ln_group;
+ }
+ else
+ {
+ gl2=gl;
+ n_group2=n_group;
+ }
+
+ for ( a=0; a< n_group2; a++)
+ if ( is_in_set ( r1, gl2[a]) && is_in_set ( r2, gl2[a]))return 1;
+ return 0;
+ }
+
+
+Alignment * gene2prot (Alignment *A){return A; }
+char * test_gene2prot (Constraint_list *CL, int s1)
+ {
+ int a, b,q, nal;
+ int F=-10000000; /*FORBIDEN STATE*/
+ int AL=0; /*ALLOWED STATE*/
+ int SPLICE_PENALTY=1000;
+ int FRAME_PENALTY=1000;
+
+
+ int START, ORF1, ORF2, ORF3, s5NC;
+ int s3NC,ORF3_G1, ORF3_T2, ORF3_NC, ORF3_A3, ORF3_T4;
+ int U1_G1, U1_T2, U1_NC, U1_A3, U1_T4;
+ int U2_G1, U2_T2, U2_NC, U2_A3, U2_T4;
+ int U1, U2, U3, U4, U5, END;
+
+ int nstate=0;
+ int **transitions;
+ int **v_tab;
+ int **v_tab_p;
+ int **last_coding;
+ int **last_t4;
+ int *potential;
+ int v;
+
+ int orf1, orf2, orf3, ncp, p, state, pstate, e, best_state_p=0, best_state_v=0, best_pstate_p=0, best_pstate_v;
+ char *seq, *seq2, *seq3;
+ int l;
+ int *is_coding;
+ int *is_t4;
+ char *codon;
+ int s, r, s2, r2, w2;
+
+ static int *entry;
+ int tot=0;
+
+ seq=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
+ seq2=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
+ seq3=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
+ sprintf ( seq, "%s", (CL->S)->seq[s1]);
+ ungap (seq);
+
+ l=strlen (seq);
+ for ( a=0; a< l; a++) seq[a]=tolower ( seq[a]);
+ for ( a=0; a< l; a++) seq[a]=(seq[a]=='t')?'u': seq[a];
+
+
+ potential=vcalloc (l+1, sizeof (int));
+
+ for (nal=0, s=0; s<(CL->S)->nseq; s++)
+ {
+ for ( r=1; r<=(CL->S)->len[s]; r++)
+ {
+ for ( b=1; b<CL->residue_index[s1][r][0]; b++)
+ {
+
+ s2=CL->residue_index[s][r][b+SEQ2];
+ r2=CL->residue_index[s][r][b+R2];
+ w2=CL->residue_index[s][r][b+WE];
+ if (s==s1)potential[r-1]+=w2;
+ else if ( s2==s1)potential[r2-1]+=w2;
+ tot+=w2;
+ nal++;
+ }
+ }
+ }
+
+
+ SPLICE_PENALTY=10000;
+ FRAME_PENALTY=1000;
+
+
+ nstate=0;
+ START=nstate++; ORF1=nstate++; ORF2=nstate++; ORF3=nstate++; s5NC=nstate++;
+ s3NC=nstate++;
+ ORF3_G1=nstate++;U1_G1=nstate++;U2_G1=nstate++;
+ ORF3_T2=nstate++;U1_T2=nstate++;U2_T2=nstate++;
+ ORF3_NC=nstate++;U1_NC=nstate++;U2_NC=nstate++;
+ ORF3_A3=nstate++;U1_A3=nstate++;U2_A3=nstate++;
+ ORF3_T4=nstate++;U1_T4=nstate++;U2_T4=nstate++;
+
+
+ U1=nstate++; U2=nstate++; U3=nstate++; U4=nstate++; U5=nstate++;
+ END=nstate++;
+
+ is_coding=vcalloc ( nstate, sizeof (int));
+ is_coding[ORF1]=is_coding[ORF2]=is_coding[ORF3]=is_coding[U1]=is_coding[U2]=1;
+ is_coding[U3]=is_coding[U4]=is_coding[U5]=1;
+
+ is_t4=vcalloc ( nstate, sizeof (int));
+ is_t4[ORF3_T4]=is_t4[U1_T4]=is_t4[U2_T4]=1;
+ transitions=declare_int ( nstate, nstate);
+ for (a=0; a< nstate; a++)
+ for ( b=0; b< nstate; b++)transitions[a][b]=F;
+
+ transitions[START][ORF1]=AL;
+ transitions[START][s5NC]=AL-FRAME_PENALTY;
+ transitions[s5NC][s5NC]=AL;
+
+ transitions[s5NC][ORF1]=AL-FRAME_PENALTY;
+
+ transitions[ORF1][ORF2]=AL;
+ transitions[ORF2][ORF3]=AL;
+ transitions[ORF3][U1]=AL;
+ transitions[ORF3][ORF1]=AL;
+ transitions[ORF3][ORF3_G1]=AL-SPLICE_PENALTY;
+
+
+ transitions[ORF3_G1][ORF3_T2]=AL;
+ transitions[ORF3_T2][ORF3_NC]=AL;
+ transitions[ORF3_NC][ORF3_NC]=AL;
+ transitions[ORF3_NC][ORF3_A3]=AL;
+ transitions[ORF3_A3][ORF3_T4]=AL;
+ transitions[ORF3_T4][ORF1]=AL-SPLICE_PENALTY;
+
+ transitions[U1][U2]=AL;
+ transitions[U1][U1_G1]=AL-SPLICE_PENALTY;
+ transitions[U1_G1][U1_T2]=AL;
+ transitions[U1_T2][U1_NC]=AL;
+ transitions[U1_NC][U1_NC]=AL;
+ transitions[U1_NC][U1_A3]=AL;
+ transitions[U1_A3][U1_T4]=AL;
+ transitions[U1_T4][U3]=AL-SPLICE_PENALTY;
+ transitions[U3][U4]=AL;
+ transitions[U4][ORF1]=AL;
+
+ transitions[U2][U2_G1]=AL-SPLICE_PENALTY;
+ transitions[U2_G1][U2_T2]=AL;
+ transitions[U2_T2][U2_NC]=AL;
+ transitions[U2_NC][U2_NC]=AL;
+ transitions[U2_NC][U2_A3]=AL;
+ transitions[U2_A3][U2_T4]=AL;
+ transitions[U2_T4][U5]=AL-SPLICE_PENALTY;
+ transitions[U5][ORF1]=AL;
+
+ transitions[ORF3][s3NC]=AL-FRAME_PENALTY;
+ transitions[ORF3][END]=AL;
+ transitions[s3NC][END]=AL;
+
+
+ v_tab=declare_int ( l+1,nstate);
+ v_tab_p=declare_int ( l+1,nstate);
+ last_coding=declare_int ( l+1,nstate);
+ last_t4=declare_int ( l+1,nstate);
+
+ for (a=0; a< l; a++) potential[a]-=200;
+
+ codon=vcalloc ( 4, sizeof (char));
+ best_pstate_p=START;
+ best_pstate_v=0;
+ nal=0;
+ for ( p=1; p<=l; p++)
+ {
+ if (translate_dna_codon (seq+(p-1), 'x')=='x' || p>(l-2))orf1=F;
+ else orf1=potential[p-1];
+
+ if (p<2 || translate_dna_codon (seq+(p-2), 'x')=='x' || p>(l-1))orf2=F;
+ else orf2=potential[p-1];
+
+
+ if (p<3 || translate_dna_codon (seq+(p-3), 'x')=='x' || p>l)orf3=F;
+ else orf3=potential[p-1];
+
+ if ( best_int (3, 1, &a, orf1, orf2, orf3)!=F)ncp=-best_int (3, 1, &a, orf1, orf2, orf3);
+ else ncp=1000;
+
+ for ( state=0; state< nstate; state++)
+ {
+
+ if ( state==ORF1)e=orf1;
+ else if ( state==ORF2)e=orf2;
+ else if ( state==ORF3)e=orf3;
+ else if ( state>=U1 && state<=U3)
+ {
+ e=0;
+ }
+ else if ( state==U4)
+ {
+ codon[2]=seq[p-1];
+ codon[1]=seq[last_coding[p-1][U3]-1];
+ codon[0]=seq[last_coding[p-2][U1_T4]-1];
+ if ( translate_dna_codon (codon, 'x')=='x')e=F;
+ else e=0;
+ }
+ else if ( state==U5)
+ {
+ codon[2]=seq[p-1];
+ codon[1]=seq[last_coding[p-1][U2_T4]-1];
+ q=seq[last_coding[p-1][U2_T4]];
+ codon[0]=seq[last_coding[q-1][U1]-1];
+ if ( translate_dna_codon (codon, 'x')=='x')e=F;
+ else e=0;
+ }
+
+ else if (state>=ORF3_G1 && state<=U2_G1)e=(p<l-1 && seq[p-1]=='g' && seq[p]=='u')?ncp:F;
+ else if ( state>=ORF3_T2 && state<=U2_T2)
+ {
+ e=(p>1 && seq[p-2]=='g' && seq[p-1]=='u')?ncp:F;
+ }
+ else if ( state>=ORF3_A3 && state<=U2_A3)e=(seq[p-1]=='a')?ncp:F;
+ else if ( state>=ORF3_T4 && state<=U2_T4)e=(seq[p-1]=='u')?ncp:F;
+ else e=ncp;
+
+ for ( pstate=0; pstate<nstate; pstate++)
+ {
+ if (e==F || transitions[pstate][state]==F || v_tab[p-1][pstate]==F)v=F;
+ else v=e+transitions[pstate][state]+v_tab[p-1][pstate];
+
+ if ( pstate==0 || v>best_pstate_v)
+ {best_pstate_v=v;best_pstate_p=pstate;}
+ }
+ v_tab[p][state]=best_pstate_v;
+ v_tab_p[p][state]=best_pstate_p;
+
+ if (!is_coding[state])last_coding[p][state]=last_coding[p-1][best_pstate_p];
+ else if (is_coding[state])last_coding[p][state]=p;
+
+ if (!is_t4[state])
+ {
+ if (is_coding[state] && last_t4[p-1][best_pstate_p]==0)last_t4[p][state]=p;
+ else last_t4[p][state]=last_t4[p-1][best_pstate_p];
+ }
+ else if (is_t4[state])last_t4[p][state]=p;
+
+ if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;}
+ }
+ }
+ tot=0;
+ for ( p=l; p>0; p--)
+ {
+ if ( best_state_p>=ORF1 && best_state_p<=ORF3){seq2[tot++]=tolower (seq[p-1]);}
+ else if ( best_state_p>=U1 && best_state_p<=U5){seq2[tot++]=tolower (seq[p-1]);}
+ if (best_state_p==ORF1)seq[p-1]=toupper (seq[p-1]);
+ else if (best_state_p==ORF2 || best_state_p==ORF3)seq[p-1]=tolower (seq[p-1]);
+ else if ( best_state_p==ORF3_NC || best_state_p==U1_NC || best_state_p==U2_NC) seq[p-1]='.';
+ else if ( best_state_p==U1 || best_state_p==U2 || best_state_p==U3 || best_state_p==U4 || best_state_p==U5) seq[p-1]=best_state_p-U1+'1';
+ else seq[p-1]=toupper (seq[p-1]);
+ best_state_p=v_tab_p[p][best_state_p];
+ }
+
+ for ( a=0, b=tot-1; b>=0; b--, a++)
+ seq3[a]=seq2[b];
+
+ fprintf ( stderr, "\n%s\n", seq);
+ fprintf ( stderr, "\nN coding=%d\n", tot);
+ for ( a=0; a< tot; a+=3)
+ {
+ b=translate_dna_codon (seq3+a, 'x');
+ fprintf ( stderr, "%c",b);
+ if ( b=='x'){fprintf ( stderr, "\n");myexit (EXIT_SUCCESS);}
+ }
+
+ fprintf ( stderr, "\n");
+ myexit (EXIT_SUCCESS);
+ return 0;
+
+
+
+ }
+Alignment * dna_aln2_3frame_cdna_aln(Alignment *A,int *ns,int **l_s)
+{
+ Alignment *B;
+ int a;
+ B=realloc_aln2 (NULL,6,strlen(A->seq_al[l_s[0][0]])+strlen(A->seq_al[l_s[1][0]]));
+ for ( a=0; a< 3; a++)
+ {
+ B->seq_al[a]=translate_dna_seq (A->seq_al[l_s[0][0]]+a, 0, 'o',B->seq_al[a]);
+ B->seq_al[a+3]=translate_dna_seq (A->seq_al[l_s[1][0]]+a, 0, 'o',B->seq_al[a+3]);
+ }
+ for ( a=1; a<3; a++)
+ {
+ if ( strlen(B->seq_al[a])<strlen(B->seq_al[0])) B->seq_al[a]=strcat ( B->seq_al[a], "x");
+ if ( strlen(B->seq_al[a+3])<strlen(B->seq_al[3])) B->seq_al[a+3]=strcat ( B->seq_al[a+3], "x");
+ }
+
+ B->nseq=6;
+ B->len_aln=strlen (B->seq_al[0]);
+ return B;
+}
+
+//JM_ADD
+//For normal distribution scan
+#ifndef PI
+#define PI 3.141592653589793238462643
+#endif
+
+double normal(double x, double mean, double std)
+{
+ return (1/(std*sqrt(2.0*PI)))*exp((-0.5*(x-mean)*(x-mean))/(std*std));
+}
+
+int ** get_sim_aln_array_normal_distribution ( Alignment *A, char *mode, int *STD, int *CENTER)
+ {
+ int **w;
+ int a, b;
+
+
+ w=declare_int ( A->nseq, A->nseq);
+
+ for ( a=0; a< A->nseq-1; a++)
+ {
+ for ( b=a+1; b< A->nseq; b++)
+ {
+
+ w[a][b]=w[b][a]=generic_get_seq_sim_normal_distribution ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode, STD, CENTER);
+ }
+ }
+ return w;
+ }
+int generic_get_seq_sim_normal_distribution ( char *seq1, char *seq2, int*cache, char *mode, int *STD, int *CENTER)
+{
+ return get_seq_sim_distribution ( seq1,seq2,GAP_LIST, mode, STD, CENTER);
+}
+
+int get_seq_sim_distribution ( char *string1, char *string2, char *ignore, char *in_mode, int *STD, int *CENTER)
+ {
+ int len1;
+ int a;
+ int pos0, gap=0;
+ int p1, p2;
+ int r=0,r1=0,r2=0;
+ char *p;
+ char mode[1000];
+
+ double sim;
+
+
+ sprintf ( mode, "%s", in_mode);
+
+ /*mode: <mat>__<sim_mode>
+ mat: idscore to get the alignment done
+ any legal cw matrix
+ sim_mode: sim1->identities/matches
+ sim2->identities/min len
+ */
+
+
+ if ( (p=strstr (mode, "_"))!=NULL)
+ {
+ p[0]='\0';
+ p++;
+ }
+
+
+ if (strstr (mode, "idscore"))
+ {
+ static int **mat;
+ if (!mat) mat=read_matrice ("blosum62mt");
+ return idscore_pairseq (string1, string2, -12, -1, mat,mode);
+ }
+
+ len1=strlen (string1);
+ for ( sim=pos0=0,a=0; a< len1; a++)
+ {
+ r1=string1[a];
+ r2=string2[a];
+ p1=1-is_in_set (r1, ignore);
+ p2=1-is_in_set (r2, ignore);
+ if (p1 && p2)
+ {
+ pos0++;
+ if (is_in_same_group_aa(r1,r2,0, NULL, mode))
+ {
+ sim += normal(a, *CENTER, *STD);
+ }
+ }
+ else if (p1+p2==1)
+ {
+ gap++;
+ }
+ }
+
+ if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
+ {
+ r=(pos0==0)?0:(sim*MAXID);
+ }
+/* else if ( strm (p, "sim2"))
+ {
+ r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
+ }
+ else if ( strm (p, "sim3"))
+ {
+ r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
+ }
+ else if ( strm (p, "gap1"))
+ {
+ r=(len1==0)?MAXID:(gap*MAXID)/len1;
+ r=MAXID-r;
+ }
+ else if ( strm (p, "logid"))
+ {
+ r=logid_score (pos0, sim);
+ }*/
+ return r;
+
+ }
+
+
+
+Alignment *aln2clean_pw_aln (Alignment *A, OveralnP *F)// char *mode, int t, int f, int p1,int p2, int p3, char *fsa_mode)
+{
+ int **C, **T;
+ int a, b, c;
+ Alignment *B;
+
+
+ if (F->t==0)F->t=2;
+
+ C=declare_int ( A->nseq, A->len_aln);
+ T=declare_int ( A->nseq, A->len_aln);
+ B=copy_aln (A, NULL);
+
+ for (a=0; a< A->nseq;a++)
+ {
+ for (b=0; b<A->nseq; b++)
+ {
+ int *w;
+ w=pw_aln2clean_aln_weight (A->seq_al[a], A->seq_al[b], 1,F);//f,p1, p2, p3, fsa_mode);
+ for (c=0; c<A->len_aln; c++)
+ {
+ if (A->seq_al[a][c]=='-')continue;
+ C[a][c]+=w[c];
+ T[a][c]++;
+ }
+ vfree (w);
+ }
+ }
+
+
+
+ for (a=0; a<A->nseq; a++)
+ {
+ for (b=0; b<A->len_aln; b++)
+ {
+ int c;
+ c=A->seq_al[a][b];
+ if ( c=='-');
+ else if (T[a][b]==0);
+ else
+ {
+ int r;
+ r=(C[a][b]*10)/T[a][b];
+ r=(r==10)?9:r;
+ if (!F->mode || strm (F->mode, "number"))
+ B->seq_al[a][b]='0'+r;
+ else if ( F->mode && (strm (F->mode, "unalign") ||strm (F->mode, "unalign2")))
+ B->seq_al[a][b]='0'+r;
+ else if ( F->mode && strm (F->mode, "lower") )
+ {
+ if (r<=F->t)B->seq_al[a][b]=tolower (B->seq_al[a][b]);
+ else B->seq_al[a][b]=toupper (B->seq_al[a][b]);
+ }
+ }
+ }
+ }
+
+ if (F->mode && strm (F->mode, "unalign"))
+ {
+ A=unalign_aln (A, B, F->t);
+ free_aln (B);
+ B=copy_aln (A, NULL);
+ }
+ else if (F->mode && strm (F->mode, "unalign2"))
+ {
+ A=unalign_aln_2 (A, B, F->t);
+ free_aln (B);
+ B=copy_aln (A, NULL);
+ }
+
+
+
+ free_int (C, -1);
+ free_int (T, -1);
+
+ return B;
+}
+
+char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *F);
+char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *F);
+
+int * pw_aln2clean_aln_weight ( char *seq1, char *seq2, int w, OveralnP *F)
+{
+ char **aln;
+ int *weight;
+ int l, a;
+
+ if ( (l=strlen (seq1)) !=strlen (seq2))
+ {
+ HERE ("\n%s\n%s\n", seq1, seq2);
+ printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Comparing unaligned sequences [FATAL:%s]", PROGRAM);
+
+ }
+
+ aln=declare_char (2, l+1);
+ sprintf ( aln[0], "%s", seq1);
+ sprintf ( aln[1], "%s", seq2);
+
+
+ aln=pw_aln2clean_pw_aln (aln, F);
+
+ weight=vcalloc (l+1, sizeof (int));
+ for (a=0; a<l; a++)
+ {
+ if ( aln[0][a] || seq1[a]=='x' || seq1[a]=='X' || seq2[a]=='x' || seq2[a]=='X')weight[a]=w;
+ }
+ free_char (aln, -1);
+
+ return weight;
+}
+
+
+char **pw_aln2clean_pw_aln (char ** aln, OveralnP *F)
+{
+
+ if ( strm (F->model, "fsa2"))return pw_aln2clean_pw_aln_fsa2 (aln,F);
+ else if ( strm (F->model, "fsa1"))return pw_aln2clean_pw_aln_fsa1 (aln,F);
+ else return pw_aln2clean_pw_aln_fsa1 (aln,F);
+}
+
+char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *FO)
+{
+ int a, b, c, d, l, id;
+ int c1, c2, e0, e1,tb, obs;
+ int T0, T1,T2;
+ int **mat, **tran, **p, **t, *s, *ids;
+ int ns, ps, cs;
+ int S, M1, M2, m1, m2,B1, B2,G1,G2, K;
+ int F=-9999999;
+ int MID_EXON_FACTOR=50;
+ int best;
+ static int **smat;
+ int model_type=1;
+ int *translate;
+
+ if ( getenv ("MID_EXON_FACTOR"))MID_EXON_FACTOR=atoi (getenv ("MID_EXON_FACTOR"));
+
+
+
+ if (!smat)smat=read_matrice ( "blosum62mt");
+
+ l=strlen (aln[0]);
+
+ if ( l!=strlen (aln[1]))
+ {
+ printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings");
+ }
+
+
+
+ s=vcalloc (l, sizeof (int));
+ ids=vcalloc (l, sizeof (int));
+
+ //record the id level of each posotion
+ for (b=0; b<l; b++)
+ {
+ c1=tolower(aln[0][b]);c2=tolower(c2=aln[1][b]);
+
+ if (c1=='-' || c2=='-' || c1=='X' || c2=='X' || c1!=c2)ids[b]=0;
+ else ids[b]=1;
+ }
+
+ //record the state of each position: M, m, T, gap
+ for (id=0,b=0,a=0;a<l; a++)
+ {
+ c1=aln[0][a];c2=aln[1][a];
+ if (islower (c1))s[a]=3;
+ else if (c1=='-' || c2=='-' || c1=='X' || c2=='X')s[a]=2;
+ else
+ {
+ int sc;
+ sc=smat[c1-'A'][c2-'A'];
+ if (sc>=2){id++; s[a]=1;}
+ else {s[a]=0;}
+ b++;
+ }
+ }
+
+ if (b==0)
+ {
+ vfree(s);vfree (ids);
+ return aln;
+ }
+
+
+
+ FO->p1=(FO->p1==0)?5:FO->p1;
+ FO->p2=(FO->p2==0)?15:FO->p2;
+ FO->p3=(FO->p3==0)?0:FO->p3;
+ FO->p4=(FO->p4==0)?100:FO->p4;
+
+
+ T1=100*(float)id/(float)b;
+ T2=(FO->f==0)?30:T1*(float)((float)FO->f/(float)100);
+ T2=MAX(T2,20);
+
+ //0: unaligned
+ //1: aligned
+ //2: gap
+ //3: exon boundary
+
+ ns=0;
+ S=ns++;
+ M1=ns++;//1 matched aligned
+ m1=ns++;//2 mmatched aligned
+ M2=ns++;//3 matched unaligned
+ m2=ns++;//4 mmatched unaligned
+ B1=ns++;//5 transition aligned
+ B2=ns++;//6 transition unaligned
+
+ mat=declare_int (ns, 4);
+ tran=declare_int (ns, ns);
+ p=declare_int (l+1, ns);
+ t=declare_int (l+1, ns);
+
+ //emission Values
+ mat[M1][0]=F; //non id
+ mat[M1][1]=T1;//id
+ mat[M1][2]=0; //gap
+ mat[M1][3]=F; //transition
+
+ mat[M2][0]=F;
+ mat[M2][1]=T2;
+ mat[M2][2]=0;
+ mat[M2][3]=F;
+
+ mat[m1][0]=100-T1;
+ mat[m1][1]=F;
+ mat[m1][2]=0;
+ mat[m1][3]=F;
+
+ mat[m2][0]=100-T2;
+ mat[m2][1]=F;
+ mat[m2][2]=0;
+ mat[m1][3]=F;
+
+ mat[B1][0]=F;
+ mat[B1][1]=F;
+ mat[B1][2]=F;
+ mat[B1][3]=0;
+
+ mat[B2][0]=F;
+ mat[B2][1]=F;
+ mat[B2][2]=F;
+ mat[B2][3]=0;
+
+ //transition values
+ tran[S][m1]=0;
+ tran[S][m2]=0;
+ tran[S][M1]=0;
+ tran[S][M2]=0;
+ tran[S][B1]=0;
+ tran[S][B2]=0;
+
+
+ tran[M1][m1]= 0;
+ tran[M1][m2]=-FO->p4;
+ tran[M1][M1]=+FO->p2;
+ tran[M1][M2]= F;
+ tran[M1][S ]= F;
+ tran[M1][B1]= 0;
+ tran[M1][B2]=-FO->p1;
+
+ tran[M2][m1]= F;
+ tran[M2][m2]=+FO->p3;
+ tran[M2][M1]= F;
+ tran[M2][M2]= 0;
+ tran[M2][S] = F;
+ tran[M2][B1]= F;
+ tran[M2][B2]= 0;
+
+
+ tran[m1][m1]= 0;
+ tran[m1][m2]= F;
+ tran[m1][M1]= 0;
+ tran[m1][M2]= F;
+ tran[m1][S] = F;
+ tran[m1][B1]= 0;
+ tran[m1][B2]=-FO->p1;
+
+ tran[m2][m1]= F;
+ tran[m2][m2]= 0;
+ tran[m2][M1]= -FO->p4;
+ tran[m2][M2]= +FO->p3;
+ tran[m2][S] = F;
+ tran[m2][B1]= F;
+ tran[m2][B2]= 0;
+
+ tran[B1][m1]= 0;
+ tran[B1][m2]= F;
+ tran[B1][M1]= 0;
+ tran[B1][M2]= F;
+ tran[B1][S]= F;
+ tran[B1][B1]= F;
+ tran[B1][B2]= F;
+
+ tran[B2][m1]= -FO->p1;
+ tran[B2][m2]= 0;
+ tran[B2][M1]= -FO->p1;
+ tran[B2][M2]= 0;
+ tran[B2][S]= F;
+ tran[B2][B1]= F;
+ tran[B2][B2]= F;
+
+ translate=vcalloc (ns, sizeof (int));
+ translate[M1]=1;
+ translate[m1]=1;
+ translate[M2]=0;
+ translate[m2]=0;
+ translate[B1]=1;
+ translate[B2]=0;
+
+ for (a=1;a<=l; a++)
+ {
+ obs=s[a-1];
+
+ for (cs=0; cs<ns; cs++)
+ {
+ for (ps=0; ps<ns; ps++)
+ {
+ c=p[a-1][ps]+mat[cs][obs]+tran[ps][cs];
+ if (ps==0 || c>=best){t[a][cs]=ps;best=p[a][cs]=c;}
+ }
+
+ }
+ }
+
+
+ for (a=0; a<ns; a++)
+ {
+ if (a==0 || p[l][a]>=best){tb=a;best=p[l][a];}
+ }
+
+ for (a=l; a>0; a--)
+ {
+ int v;
+ int p2;
+
+ p2=a-1;
+ aln[0][p2]=aln[1][p2]=translate[tb];
+ tb=t[a][tb];
+
+ }
+
+ free_int (p, -1);
+ vfree(s);
+ free_int (t, -1);
+ free_int (mat, -1);
+ free_int (tran, -1);
+ vfree (translate);
+ return aln;
+}
+char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *FO)
+{
+ int a, b, c, d, l, id;
+ int c1, c2, e0, e1,tb, obs;
+ int T0, T1,T2;
+ int **mat, **tran, **p, **t, **s;
+ int ns, ps, cs;
+ int S, M1, M2, m1, m2, K;
+ int F=-9999999;
+ int best;
+ static int **smat;
+ int *translate;
+
+
+ if (!smat)smat=read_matrice ( "blosum62mt");
+
+ l=strlen (aln[0]);
+
+ if ( l!=strlen (aln[1]))
+ {
+ printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings");
+ }
+
+
+ s=declare_int (l+1, 2);
+ for (id=0,b=0,a=0;a<l; a++)
+ {
+ c1=aln[0][a];c2=aln[1][a];
+
+ if ( c1=='-' || c2=='-' || c1=='x' || c1=='X' || c2=='x' || c2=='X')continue;
+ else
+ {
+ int sc;
+ sc=smat[c1-'A'][c2-'A'];
+ if (sc>=2){id++; s[b][0]=1;}
+ else {s[b][0]=0;}
+ s[b][1]=a;
+ b++;
+
+ }
+ }
+ if (b==0)
+ {
+ free_int (s, -1);
+ return aln;
+ }
+ FO->f=(FO->f==0)?30:FO->f;
+ FO->p1=(FO->p1==0)?90:FO->p1;
+ FO->p2=(FO->p2==0)?15:FO->p2;
+ FO->p3=(FO->p3==0)?0:FO->p3;
+
+ l=b;//length of the ungapped aln
+ T1=100*(float)id/(float)b;
+ T2=FO->f;//T1*f;
+
+
+
+ //0: unaligned
+ //1: aligned
+
+
+ ns=0;
+ S=ns++;
+ M1=ns++;//1 matched aligned
+ m1=ns++;//2 mmatched aligned
+ M2=ns++;//3 matched unaligned
+ m2=ns++;//4 mmatched unaligned
+
+ mat=declare_int (ns, 2);
+ tran=declare_int (ns, ns);
+ p=declare_int (l+1, ns);
+ t=declare_int (l+1, ns);
+
+
+ mat[M1][0]=F;
+ mat[M1][1]=T1;
+
+ mat[M2][0]=F;
+ mat[M2][1]=T2;
+
+ mat[m1][0]=100-T1;
+ mat[m1][1]=F;
+
+ mat[m2][0]=100-T2;
+ mat[m2][1]=F;
+
+
+ tran[S][m1]=0;
+ tran[S][m2]=0;
+ tran[S][M1]=0;
+ tran[S][M2]=0;
+
+
+ tran[M1][m1]= 0;
+ tran[M1][m2]=-FO->p1;// -P;
+ tran[M1][M1]=+FO->p2;
+ tran[M1][M2]= F;
+ tran[M1][S] = F;
+
+ tran[M2][m1]= F;
+ tran[M2][m2]=+FO->p3;
+ tran[M2][M1]= F;
+ tran[M2][M2]= 0;
+ tran[M2][S]= F;
+
+ tran[m1][m1]= 0;
+ tran[m1][m2]= F;
+ tran[m1][M1]= 0;
+ tran[m1][M2]= F;
+ tran[m1][S]= F;
+
+ tran[m2][m1]= F;
+ tran[m2][m2]= 0;
+ tran[m2][M1]=-FO->p1;
+ tran[m2][M2]=+FO->p3;
+ tran[m2][S]= F;
+
+ translate=vcalloc (ns, sizeof (int));
+ translate[M1]=1;
+ translate[m1]=1;
+ translate[M2]=0;
+ translate[m2]=0;
+ translate[S]=1;
+
+
+ for (a=1;a<=l; a++)
+ {
+ obs=s[a-1][0];
+
+ for (cs=0; cs<ns; cs++)
+ {
+ for (ps=0; ps<ns; ps++)
+ {
+ c=p[a-1][ps]+mat[cs][obs]+tran[ps][cs];
+ if (ps==0 || c>=best){t[a][cs]=ps;best=p[a][cs]=c;}
+ }
+
+ }
+ }
+
+
+ for (a=0; a<ns; a++)
+ {
+ if (a==0 || p[l][a]>=best){tb=a;best=p[l][a];}
+ }
+ for (a=l; a>0; a--)
+ {
+ int p2=s[a-1][1];
+ aln[0][p2]=aln[1][p2]=translate[tb];
+
+ tb=t[a][tb];
+ }
+
+
+ free_int (p, -1);
+ free_int (s, -1);
+ free_int (t, -1);
+ free_int (mat, -1);
+ free_int (tran, -1);
+ vfree (translate);
+ return aln;
+}
+float* analyze_overaln ( Alignment *iA, Alignment *iB, char *mode, int filter, int f, int p1,int p2, int p3)
+{
+ Alignment *C, *D;
+ Alignment *A, *B;
+ OveralnP *F;
+
+ F=vcalloc (1, sizeof (OveralnP));
+ F->p1=p1;
+ F->p2=p2;
+ F->p3=p3;
+ F->f=f;
+ F->t=filter;
+ sprintf (F->mode, "%s", mode);
+
+
+ float *r;
+ A=copy_aln (iA, NULL);
+ B=copy_aln (iB, NULL);
+
+ C=aln2gap_cache (A,0);
+ A=filter_aln_upper_lower (A, C, 0, 0);
+ D=aln2clean_pw_aln (B, F);
+ r=aln2pred (A,D,mode);
+ free_aln (C);
+ free_aln (D);
+ free_aln (A);
+ free_aln (B);
+ return r;
+}
+float* aln2pred ( Alignment *A, Alignment*B, char *mode)
+{
+ int a, b, c, d, i, l, salp, s, n;
+ static char **list, *buf1, *buf2, *alp, *alp_lu;
+ static int ***r;
+ int T, N;
+ int fp, fn, tn, tp;
+ int tfp, tfn, ttn, ttp;
+ float sp, sn, sen2, best, result;
+ int print=1;
+ float *fresult;
+
+ fresult=vcalloc ( 3, sizeof (float));
+
+ if ( mode && strstr (mode, "case"))
+ {
+ A=aln2case_aln (A,"u","l");
+ B=aln2case_aln (B,"u","l");
+ }
+
+ if (mode && strstr (mode, "printaln"))
+ {
+ Sequence *S;
+ Alignment *C;
+ S=aln2seq (A);
+ C=copy_aln (B, NULL);
+ for (a=0; a<B->nseq; a++)
+ {
+ i=name_is_in_list (C->name[a], S->name, S->nseq, 100);
+ if ( i==-1)
+ for (b=0; b<C->len_aln; b++) C->seq_al[a][b]='-';
+ else
+ for (d=0,b=0; b<C->len_aln; b++)
+ {
+ if ( !is_gap (C->seq_al[a][b]))
+ {
+ if (C->seq_al[a][b]==S->seq[i][d])C->seq_al[a][b]=toupper(C->seq_al[a][b]);
+ d++;
+ }
+ }
+ }
+ print_aln (C);
+ }
+
+ vfree (alp);vfree (alp_lu);
+ alp=vcalloc ( 256, sizeof (char));
+ alp_lu=vcalloc ( 256, sizeof (char));
+
+ for (c=0; c<2; c++)
+ {
+ Alignment *AL;
+ AL=(c==0)?A:B;
+ for (salp=0,a=0; a<AL->nseq; a++)
+ {
+ for (b=0; b<AL->len_aln; b++)
+ {
+ c=AL->seq_al[a][b];
+ if (!is_gap(c) && !alp[c])
+ {
+ salp++;
+ alp_lu[salp]=c;
+ alp[c]=salp;
+ }
+ }
+ }
+ }
+
+ vfree (buf1); vfree(buf2);
+ buf1=vcalloc ( A->len_aln+1, sizeof (char));
+ buf2=vcalloc ( B->len_aln+1, sizeof (char));
+
+ free_arrayN ((void **)r, 3);
+ r=declare_arrayN(3, sizeof (int),A->nseq,salp+1,salp+1);
+ free_char ( list, -1);
+ list=declare_char ( A->nseq, 100);
+ for (n=0,a=0; a< A->nseq; a++)
+ {
+ for ( b=0; b<B->nseq; b++)
+ {
+ if ( strm (A->name[a], B->name[b]))
+ {
+ sprintf ( buf1, "%s", A->seq_al[a]);
+ sprintf ( buf2, "%s", B->seq_al[b]);
+ ungap (buf1); ungap (buf2);
+ if ((l=strlen (buf1))!=strlen (buf2))continue;
+ else
+ {
+ sprintf ( list[n], "%s", A->name[a]);
+ for (c=0; c<l; c++)
+ {
+ int c1, c2;
+ c1=buf1[c];
+ c2=buf2[c];
+ r[n][alp[c1]][alp[c2]]++;
+ }
+ n++;
+ }
+ }
+ }
+ }
+
+
+
+ for ( s=1; s<=salp; s++)
+ {
+ char type[4];
+ sprintf (type, "_%c_", alp_lu[s]);
+ ttp=ttn=tfp=tfn=0;
+ for (a=0; a<n; a++)
+ {
+ tp=tn=fp=fn=0;
+ for (b=1; b<=salp; b++)
+ {
+ for (c=1; c<=salp; c++)
+ {
+ if ( b==s && c==s) tp+=r[a][b][c];
+ else if ( b==s && c!=s)fn+=r[a][b][c];
+ else if ( b!=s && c==s)fp+=r[a][b][c];
+ else if ( b!=s && b!=s)tn+=r[a][b][c];
+ }
+
+ }
+
+ ttp+=tp;
+ ttn+=tn;
+ tfp+=fp;
+ tfn+=fn;
+ rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
+ if ( mode && strstr (mode, "printstat"))fprintf ( stdout, ">%s S=%c sp=%6.2f sn=%6.2f sen2=%6.2f best=%6.2f\n", list[a],alp_lu[s],sp, sn, sen2, best);
+ }
+
+ rates2sensitivity (ttp, ttn, tfp, tfn, &sp, &sn, &sen2, &best);
+ if (mode && strstr (mode, "printstat"))fprintf ( stdout, ">TOT S=%c sp=%6.2f sn=%6.2f re=%6.2f best=%6.2f\n", alp_lu[s],sp, sn, sen2, best);
+
+ if ( mode && strstr (mode, type))
+ {
+ fresult[0]=sn;
+ fresult[1]=sp;
+ fresult[2]=sen2;
+ }
+ }
+ return fresult;
+}
+
+Alignment * mark_exon_boundaries (Alignment *A, Alignment *E)
+{
+ char *buf, *buf2;
+ int a, b, c, i, l;
+
+ buf2=vcalloc ( E->len_aln+1, sizeof (char));
+ buf =vcalloc ( E->len_aln+1, sizeof (char));
+
+ for (a=0; a< A->nseq; a++)
+ {
+ i=name_is_in_list (A->name[a], E->name, E->nseq, 100);
+ if ( i==-1) continue;
+ sprintf (buf, "%s", E->seq_al[i]);
+ ungap (buf);
+ l=strlen (buf);
+ //clean buf2
+ for (c=0, b=0; b<l; b++)if (buf[b]!='o' && buf[b]!='b' && buf[b]!='j')buf2[c++]=toupper(buf[b]);
+ buf2[c]='\0';
+
+ //lowercase the boundaries of buf2;
+ for ( c=0,b=0; b<l; b++)
+ {
+ //ENSEMBL: o: 0, b:1 j:2
+ if (buf[b]=='b' || buf[b]=='o' && c>=1)buf2[c-1]=tolower(buf2[c-1]);
+ else if (buf[b]=='j' &&c<l)buf2[c+1]=tolower(buf2[c+1]);
+ else c++;
+ }
+
+ for (c=0,b=0; b<A->len_aln; b++)
+ {
+ if (!is_gap(A->seq_al[a][b]))
+ {
+ A->seq_al[a][b]=buf2[c++];
+ }
+ }
+ }
+ vfree (buf);
+ vfree (buf2);
+ return A;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/******************************COPYRIGHT NOTICE*******************************/
+/*© Centro de Regulacio Genomica */
+/*and */
+/*Cedric Notredame */
+/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
+/*All rights reserved.*/
+/*This file is part of T-COFFEE.*/
+/**/
+/* T-COFFEE is free software; you can redistribute it and/or modify*/
+/* it under the terms of the GNU General Public License as published by*/
+/* the Free Software Foundation; either version 2 of the License, or*/
+/* (at your option) any later version.*/
+/**/
+/* T-COFFEE is distributed in the hope that it will be useful,*/
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
+/* GNU General Public License for more details.*/
+/**/
+/* You should have received a copy of the GNU General Public License*/
+/* along with Foobar; if not, write to the Free Software*/
+/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
+/*............................................... |*/
+/* If you need some more information*/
+/* cedric.notredame@europe.com*/
+/*............................................... |*/
+/**/
+/**/
+/* */
+/******************************COPYRIGHT NOTICE*******************************/