#include #include #include #include #include #include #include "io_lib_header.h" #include "util_lib_header.h" #include "dp_lib_header.h" #include "define_header.h" int aln_has_stockholm_structure (Alignment *A) { return name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100); } int get_aln_stockholm_structure (Alignment *A) { int i; if ((i=aln_has_stockholm_structure(A))==-1) A=add_alifold2aln (A, NULL); return aln_has_stockholm_structure(A); } int ** update_RNAfold_list (Alignment *A, int **pos, int s, int **l) { int a=0; while (l[a]) { if (!is_gap(A->seq_al[s][l[a][0]]) && !is_gap (A->seq_al[s][l[a][1]])) { l[a][2]=pos[s][l[a][0]]; l[a][3]=pos[s][l[a][1]]; } else { l[a][2]=l[a][3]=-1; } a++; } return l; } Alignment *compare_RNA_fold ( Alignment *A, Alignment *B) { int i1, i2, i; int **l1, **l2; int **pos1, **pos2; int a, b; int tot_ol=0, tot_l=0; i1=get_aln_stockholm_structure (A); i2=get_aln_stockholm_structure (B); l1=vienna2list (A->seq_al[i1]); l2=vienna2list (B->seq_al[i2]); pos1=aln2pos_simple(A, A->nseq); pos2=aln2pos_simple(B, B->nseq); for (a=0; a< A->nseq; a++) { char **lu; int ol=0, ll1=0, ll2=0; if ( A->name[a][0]=='#')continue; i=name_is_in_list (A->name[a], B->name, B->nseq, 100); if (i!=-1) { l1=update_RNAfold_list (A,pos1,a, l1); l2=update_RNAfold_list (B,pos2,i, l2); lu=declare_char (A->len_aln, B->len_aln); b=0; while (l2[b]) { if (l2[b][2]==-1 || l2[b][3]==-1); else { ll2++; lu[l2[b][2]][l2[b][3]]=1; } b++; } b=0; while (l1[b]) { if (l1[b][2]==-1 || l1[b][3]==-1); else { ll1++; if (lu[l1[b][2]][l1[b][3]]==1) { A->seq_al[a][l1[b][0]]='6'; A->seq_al[a][l1[b][1]]='6'; ol++; } else { A->seq_al[a][l1[b][0]]='0'; A->seq_al[a][l1[b][1]]='0'; } } b++; } free_char (lu, -1); } tot_ol+=ol; tot_l+=ll1; tot_l+=ll2; fprintf ( stdout, "@@ Seq: %s Overalp: %.2f Al1: %.2f Al2: %.2f \n", A->name[a], (float)(ol*200)/(ll1+ll2), (float)(ol*100)/ll1,(float)(ol*100)/ll2); } fprintf ( stdout, "@@ Seq: Tot Overalp: %.2f \n", (float)(tot_ol*200)/(tot_l)); return A; } int is_neutral(char c1, char c2); int is_watson (char c1, char c2); int is_watson2 (char c1, char c2); int is_watson (char c1, char c2) { c1=tolower (c1); c2=tolower (c2); if ( is_watson2 (c1, c2)) return 1; else return is_watson2 (c2, c1); } int is_watson2 (char c1, char c2) { if ( c1=='g' && c2=='c')return 1; else if (c1=='a' && (c2=='t' || c2=='u'))return 1; return 0; } int is_neutral (char c1, char c2) { c1=tolower (c1); c2=tolower (c2); if (is_watson (c1, c2)) return 1; else if (c1=='g' && (c2=='t' || c2=='u'))return 1; else if ((c1=='t' || c1=='u') && c2=='g')return 1; return 0; } int ** vienna2list ( char *seq) { int a, b, i, i2,l; int **list; l=strlen (seq); list=declare_int (l+1, 8); for (i=0,a=0; a=0; b++) { if (seq[b]=='(')i2++; else if (seq[b]==')')i2--; } list[i][1]=b-1; i++; } } list[i]=NULL; return list; } Alignment *aln2alifold(Alignment *A) { char *tmp1; char *tmp2; tmp1=vtmpnam (NULL); tmp2=vtmpnam (NULL); output_clustal_aln (tmp1,A); printf_system ("RNAalifold %s >%s 2>/dev/null", tmp1, tmp2); return alifold2aln (tmp2); } Alignment *add_alifold2aln (Alignment *A, Alignment *ST) { int a,b,c,d,p1,p2; int r1, rr1, r2, rr2; int watson, comp,tot; int **compmat; int max, p,k; int minseq=3; int **list; int ncomp=0, nwatson=0; int cons_l, fold_l; int i,l; if (!ST) { char *tmp1, *tmp2; int f; Alignment *T; T=copy_aln (A, NULL); tmp1=vtmpnam (NULL); tmp2=vtmpnam (NULL); cons_l=A->len_aln; for (a=0; alen_aln; a++) { for (f=0,b=0; bnseq && f==0; b++) { if (is_gap (A->seq_al[b][a]))f=1; } if (f) { cons_l--; for (b=0; bnseq; b++)T->seq_al[b][a]='-'; } } ST=aln2alifold (T); } //add or Replace the structure l=strlen (ST->seq_al[0]); if ( l!=A->len_aln) { HERE ("\n%s\n%s\n", ST->seq_al[0], A->seq_al[0]); printf_exit ( EXIT_FAILURE, stderr, "ERROR the predicted structure and the multiple alignment do not have the same length [FATAL:%s]\n", PROGRAM); } for (a=0; a< l; a++)if (ST->seq_al[0][a]==STOCKHOLM_CHAR)ST->seq_al[0][a]='.'; if ((i=name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100))!=-1) { sprintf (A->seq_al[i], "%s", ST->seq_al[0]); } else { A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1); sprintf (A->name[A->nseq], "#=GC SS_cons"); sprintf (A->seq_al[A->nseq], "%s", ST->seq_al[0]); A->nseq++; } return A; } Alignment * alifold2analyze (Alignment *A, Alignment *ST, char *mode) { int s; int **list; int usegap; s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100); if (s==-1) { A=add_alifold2aln (A,ST); s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100); } list=vienna2list (A->seq_al[s]); list=alifold_list2cov_list (A, list); usegap=0; //do not use gaped positions by default if (mode && strstr (mode, "usegap"))usegap=1;//count positions with gaps if (!mode) { A=alifold2cov_stat (A, list,usegap); } else { if ( strstr (mode, "stat")) A=alifold2cov_stat (A, list, usegap); if ( strstr (mode, "list")) A=alifold2cov_list (A, list, usegap); if ( strstr (mode, "aln")) A=alifold2cov_aln (A, list, usegap); if ( strstr (mode, "color") ) { Alignment *C; C=copy_aln (A, NULL); C=alifold2cov_cache (C, list, usegap); A=alifold2cov_aln (A, list, usegap); if ( strstr ( mode, "ps")) output_color_ps (A, C, "stdout"); else output_color_html (A, C, "stdout"); exit (EXIT_SUCCESS); } } return A; } int ** alifold_list2cov_list (Alignment *A, int **list) { int a,b,c,d,p1,p2,s; int r1, rr1, r2, rr2; int neutral,watson, comp,tot, occupancy; int **compmat; int max, p,k; int minseq=3; int ncomp=0, nwatson=0, nneutral=0, ncomp_wc=0; int cons_l, fold_l; int nseq; for (nseq=0,a=0; a< A->nseq; a++)if ( A->name[a][0]!='#')nseq++; max=((nseq*(nseq-1))/2); a=0; while (list[a]) { p1=list[a][0]; p2=list[a][1]; watson=0; comp=0; neutral=0; tot=0; occupancy=0; for (c=0; cnseq-1; c++) { if (A->name[c][0]=='#')continue; r1=tolower(A->seq_al[c][p1]); r2=tolower(A->seq_al[c][p2]); if (is_gap(r1) || is_gap(r2))continue; for (d=c+1; dnseq; d++) { if (A->name[d][0]=='#')continue; rr1=tolower(A->seq_al[d][p1]); rr2=tolower(A->seq_al[d][p2]); if (is_gap(rr1) || is_gap(rr2))continue; if (is_watson (r1, r2))watson++; if (is_watson (rr1, rr2))watson++; if (is_neutral (r1, r2))neutral++; if (is_neutral (rr1, rr2))neutral++; if (r1!=rr1 && r2!=rr2)comp++; occupancy++; } } watson=(watson*100)/(occupancy*2); comp=(comp*100)/occupancy; neutral=(neutral*100)/(occupancy*2); occupancy=(occupancy*100)/max; list[a][3]=neutral; list[a][4]=watson; list[a][5]=comp; list[a][6]=occupancy; if (list[a][3]<100) { if (list[a][5]>0)list[a][7]='I';//compensated incompatible pair else list[a][7]='i'; //non compensated incompatible pair } else { list[a][7]='N';//Neutral pair if (list[a][4]==100) { list[a][7]='W';//Watson and Crick if ( list[a][5]>0)list[a][7]='C'; //Watson and crick compensated } else if ( list[a][5]>0) { list[a][7]='c';//compensated } } a++; } return list; } Alignment *alifold2cov_aln (Alignment *inA,int **list, int ug) { int a=0; a=0; Alignment *A; A=copy_aln (inA, NULL); A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1); sprintf (A->name[A->nseq], "#=GC SS_analyze"); sprintf (A->seq_al[A->nseq], "%s", A->seq_al[A->nseq-1]); A->nseq++; while (list[a]) { char s; if (list[a][6]<100 && !ug); else { s=list[a][7]; A->seq_al[A->nseq-1][list[a][0]]=s; A->seq_al[A->nseq-1][list[a][1]]=s; } a++; } return A; } Alignment *alifold2cov_stat (Alignment *A,int **list, int ug) { int fold=0,watson=0, comp=0, compwc=0, incomp=0, neutral=0; int a; a=0; while (list[a]) { int s; fold++; if (list[a][6]<100 && !ug); else { s=list[a][7]; watson +=(s=='W')?1:0; compwc +=(s=='C')?1:0; comp +=(s=='c')?1:0; neutral+=(s=='N')?1:0; incomp +=(s=='I')?1:0; } a++; } fprintf ( stdout, "@@ TOT Nseq:%d tot_len: %d fold: %d neutral: %d watson: %d CorWC: %d cor: %d CorIncompatible: %d\n",A->nseq-1, A->len_aln,fold, neutral,watson, compwc,comp,incomp); return A; } Alignment *alifold2cov_cache (Alignment *inA, int **list, int ug) { int a,b, c; Alignment *A; A=copy_aln (inA, NULL); a=0; while (list[a]) { int v, s; if (list[a][6]<100 && !ug); else { s=list[a][7]; if (s=='C')v=9; //red else if ( s=='c')v=7; //orange else if ( s=='W')v=5; //Yellow else if ( s=='N')v=2; //green else if ( s=='I')v=0; //blue; for (b=0;bnseq; b++) { if (A->name[b][0]=='#'); else { for (c=0; c<2; c++) { A->seq_al[b][list[a][c]]='0'+v; } } } } a++; } return A; } Alignment *alifold2cov_list (Alignment *A,int **list, int ug) { int a,b, s; a=0; while (list[a]) { s=list[a][7]; if (list[a][6]<100 && !ug); else if (s=='C') { fprintf ( stdout, "@@ WC Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1); for (b=0; bnseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]])); fprintf (stdout,"\n"); } else if (s=='c') { fprintf ( stdout, "@@ Neural Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1); for (b=0; bnseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]])); fprintf (stdout,"\n"); } else if (s=='W') { fprintf ( stdout, "@@ WC pair: %4d %4d =>", list[a][0]+1, list [a][1]+1); for (b=0; bnseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]])); fprintf (stdout,"\n"); } else if (s=='N') { fprintf ( stdout, "@@ Neutral pair: %4d %4d =>", list[a][0]+1, list [a][1]+1); for (b=0; bnseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]])); fprintf (stdout,"\n"); } else if (s=='I') { fprintf ( stdout, "@@ incompatible pair: %4d %4d =>", list[a][0]+1, list [a][1]+1); for (b=0; bnseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]])); fprintf (stdout,"\n"); } a++; } return A; } Alignment *aln2sample (Alignment *A, int n) { Alignment *B; int a, b, p; int **pos; B=copy_aln (A, NULL); vsrand(0); pos=declare_int (A->len_aln, 2); for (a=0; alen_aln; a++){pos[a][0]=a;pos[a][1]=rand()%(1000*A->len_aln);} sort_int (pos, 2, 1, 0, A->len_aln-1); n=(n==0)?A->len_aln:(MIN (n, (A->len_aln))); for (a=0; anseq; b++) A->seq_al[b][a]=B->seq_al[b][pos[a][0]]; for (b=0; bnseq; b++) A->seq_al[b][n]='\0'; A->len_aln=n; free_aln (B); free_int (pos, -1); return A; } Alignment *aln2bootstrap (Alignment *A, int n) { Alignment *B; int a, b, p; if (n==0)n=A->len_aln; else A=realloc_aln (A, n+1); vsrand(0); B=copy_aln (A, NULL); for (a=0; alen_aln; for (b=0; bnseq; b++) A->seq_al[b][a]=B->seq_al[b][p]; } for ( b=0; bnseq; b++)A->seq_al[b][n]='\0'; A->len_aln=n; free_aln (B); return A; } Alignment * aln2random_aln (Alignment *A, char *smode) { int a, b, n, **res; int max; if ( smode==NULL) { smode=vcalloc (4, sizeof (char)); sprintf ( smode, "SCR");//Sequences, Column Residues } else if ( strm (smode, "NO"))return A; vsrand(0); max=A->nseq*1000; if ( strstr ( smode, "S")) { A=aln2scramble_seq (A); } if ( strstr ( smode, "C")) { res=declare_int (A->nseq, 2); for (a=0; a< A->len_aln; a++) { for (n=0,b=0;bnseq; b++) { if ( !is_gap(A->seq_al[b][a])) { res[n][0]=A->seq_al[b][a]; res[n][1]=rand()%max; n++; } sort_int (res, 2, 1, 0, n-1); } for (n=0,b=0;bnseq; b++) { if ( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]=res[n++][0]; } } free_int (res, -a); } //Redistributes the residues randomly without changing the gap pattern if ( strstr ( smode, "R")) { max=A->len_aln*A->nseq; res=declare_int (max, 2); for (n=0,a=0; a< A->len_aln; a++) { for (b=0;bnseq; b++) { if ( !is_gap(A->seq_al[b][a])) { res[n][0]=A->seq_al[b][a]; res[n][1]=rand()%max; n++; } } } sort_int (res, 2, 1, 0, n-1); for (n=0,a=0; a< A->len_aln; a++) { for (b=0;bnseq; b++) { if ( !is_gap(A->seq_al[b][a])) { A->seq_al[b][a]=res[n++][0]; } } } free_int (res, -1); } return A; } Alignment *score_aln2score_ascii_aln (Alignment *A, Alignment *C) { //Convert the output of T-Coffee evaluate into a printable score_ascii alignment*/ //A and C must be sorted //sets to 0 lone residues int a, b; for (a=0; anseq; a++) for (b=0; blen_aln; b++) { int rC=C->seq_al[a][b]; int rA=A->seq_al[a][b]; if ( !strm (A->name[a], C->name[a])){HERE ("Unsorted aln in score_aln2score_ascii"); exit (EXIT_FAILURE);} if ( rA=='x' || rA=='X')C->seq_al[a][b]='9'; else if ( rC >='0' && rC<='9'); else if ( rC<10)C->seq_al[a][b]='0'+rC; else if ( rC==NO_COLOR_RESIDUE && !is_gap(rA)) C->seq_al[a][b]='0'; else if ( rC==NO_COLOR_RESIDUE && is_gap(rA))C->seq_al[a][b]='-'; } return C; } Alignment*aln2gap_cache (Alignment *A, int val) { Alignment *B; int a, b, c, nr; B=copy_aln (A, NULL); for (b=0; blen_aln; b++) { for (nr=0,a=0; anseq; a++)nr+=!is_gap (A->seq_al[a][b]); for (a=0; anseq; a++)if (!is_gap(A->seq_al[a][b]))B->seq_al[a][b]=(nr==1)?'0'+val:'1'; } return B; } Alignment* aln2case_aln (Alignment *B, char *upper, char *lower) { int a, b, c, up, lo; Alignment *A; A=copy_aln (B, NULL); up=(upper)?upper[0]:'u'; lo=(lower)?lower[0]:'l'; for (a=0; anseq; a++) for (b=0; blen_aln; b++) { c=A->seq_al[a][b]; if ( is_gap(c)); else A->seq_al[a][b]=(isupper (c))?up:lo; } return A; } Alignment *aln2scale (Alignment *A, char *coffset) { int a, b, t, v, n; char *s1, *s2; char s[1000]; int offset; if (coffset)offset=atoi(coffset); else offset=0; sprintf (s, "%d", A->len_aln+offset); n=strlen (s); A=realloc_aln2 (A, A->nseq+n, A->len_aln+1); s1=vcalloc ( n+1, sizeof (char)); s2=vcalloc ( n+1, sizeof (char)); for (a=0; aname[A->nseq+a], "%s", s2); } for (a=0; alen_aln; a++) { sprintf (s1, "%d", a+1+offset); s2=invert_string (s1); t=strlen (s2); for (b=0; b<=n; b++) { if (b>=t) v='0'; else v=s2[b]; A->seq_al[A->nseq+b][a]=v; } } A->nseq+=n; return A; } int * pos2list (int * pos, int len, int *nl) { int *list; int a; nl[0]=0; list=vcalloc (len, sizeof (int)); for (a=0; anseq+((B)?B->nseq:0), sizeof (int)); pos=aln2pos_simple_2 (A); if (B) { n=B->nseq; for ( a=0; anseq; a++) { list[a]=name_is_in_list(B->name[a], A->name, A->nseq, 100); } } else { for ( a=0; anseq; a++) list[a]=a; n=A->nseq; } fprintf ( fp, "#"); for ( b=0; bname[s]); } fprintf (fp, "\n"); for ( a=0; alen_aln; a++) { for ( b=0; bx, where x is the position of residue z of seq1/S1 in S2->seq[index[Seq1/S1]] */ int a; int **index; char *seq1=NULL, *seq2=NULL; Alignment *Profile; index=vcalloc ( S1->nseq, sizeof (int*)); for (a=0; a< S1->nseq; a++) { int len1, len2, b, c; seq1=S1->seq[a]; if (name_index[a][0]==-1) seq2=NULL; else if (name_index[a][1]==-1) { seq2=S2->seq[name_index[a][0]]; } else if ((Profile=seq2R_template_profile (S2, name_index[a][0])) !=NULL) { seq2=Profile->seq_al[name_index[a][1]]; } len1=strlen (seq1);len2=strlen (seq2); index[a]=vcalloc (len2, sizeof(int)); for (c=0,b=0; bx if seq1 is the xth sequence of S2 ->-1 if seq1 is nowhere to be found index[seq1 of S1][1]->z if seq1 is the zth sequence within the xth profile of S2 */ int **index; int a, b, x, z; Alignment *Profile; index=declare_int (S1->nseq, 2); for ( a=0; anseq; a++) { index[a][0]=index[a][1]=-1; x=name_is_in_list (S1->name[a],S2->name,S2->nseq,100); if ( x!=-1){index[a][0]=x;index[a][1]=-1;} for ( b=0; bnseq; b++) { if ((Profile=seq2R_template_profile (S2,b))) { z=name_is_in_list (S1->name[a],Profile->name,Profile->nseq,100); if ( z!=-1){index[a][0]=b;index[a][1]=z;b=S2->nseq;} } } } return index; } int *get_name_index (char **l1, int n1, char **l2, int n2) { int *r; int a; /*return Array[Index_L1]=Index_L2 */ r=vcalloc ( n1, sizeof (int)); for ( a=0; a< n1; a++) r[a]=name_is_in_list (l1[a],l2,n2,100); return r; } int* get_res_index (char *seq0, char *seq1) { int *coor, a; if ( !seq0 || !seq1) return NULL; coor=vcalloc ( strlen (seq0)+1, sizeof (int)); if (!strm (seq0, seq1)) { int r0, r1 , isr0, isr1; int l0=0, l1=0; Alignment *A; A=align_two_sequences (seq0,seq1,"pam250mt",-5,-1, "myers_miller_pair_wise"); for ( a=0; a< A->len_aln; a++) { r0=A->seq_al[0][a];r1=A->seq_al[1][a]; isr0=!is_gap(r0); isr1=!is_gap(r1); l0+= isr0; l1+= isr1; if (isr0 && isr1)coor[l0-1]=l1-1; else if (isr0) coor[l0-1]=-1; } free_aln (A); } else { int l0; l0=strlen (seq0); for ( a=0;a< l0; a++) coor[a]=a; } return coor; } int change_residue_coordinate ( char *in_seq1, char *in_seq2, int v) { /*Expresses the coordinate of a residue in seq1, in the coordinate system of seq2*/ static char *seq1, *seq2; static int *coor; if ( seq1 !=in_seq1 || seq2 !=in_seq2) { int r0, r1 , isr0, isr1; int l0=0, l1=0; Alignment *A; int a; vfree (coor); seq1=in_seq1, seq2=in_seq2; A=align_two_sequences (seq1,seq2,"pam250mt", -14, -2, "myers_miller_pair_wise"); coor=vcalloc ( A->len_aln, sizeof (int)); for ( a=0; a< A->len_aln; a++) { r0=A->seq_al[0][a];r1=A->seq_al[1][a]; isr0=!is_gap(r0); isr1=!is_gap(r1); l0+= isr0; l1+= isr1; if (isr0 && isr1)coor[l0-1]=l1-1; else if (isr0) coor[l0-1]=-1; } free_aln (A); } return coor[v]; } int ** minimise_repeat_coor (int **coor, int nseq, Sequence *S) { int **new_coor; int a, min; new_coor=declare_int ( nseq, 3); min=return_min_int (coor, nseq, 2); for ( a=0; a< nseq; a++) { new_coor[a][0]=coor[a][0]; new_coor[a][1]=coor[a][1]; new_coor[a][2]=min; } return new_coor; } int ** get_nol_seq ( Constraint_list *CL, int **coor, int nseq, Sequence *S) { int a, s, p, l, nl; int **buf; int **new_coor; new_coor=declare_int ( nseq+1, 3); buf=get_undefined_list ( CL); for ( a=0; a< nseq; a++)buf[coor[a][0]][coor[a][1]]=1; for ( a=0; a< nseq; a++) { s=coor[a][0]; p=coor[a][1]+1; l=strlen(S->seq[s]); nl=0; while ( p<=l && !buf[s][p++])nl++; new_coor[a][0]=s; new_coor[a][1]=coor[a][1]; new_coor[a][2]=nl; } free_int ( buf, -1); return new_coor; } int compare_pos_column( int **pos1,int p1, int **pos2,int p2, int nseq) { int a,v1, v2; int identical=0; for ( a=0; a< nseq; a++) { v1=pos1[a][p1]; v2=pos2[a][p2]; if (v1>0 || v2>0) { if ( v1!=v2)return 0; else identical=1; } } return identical; } char *seq2alphabet (Sequence *S) { return array2alphabet (S->seq, S->nseq, ""); } char *aln2alphabet (Alignment *A) { return array2alphabet (A->seq_al, A->nseq, ""); } char *array2alphabet (char **array, int n, char *forbiden) { int a, b, l; int *hasch; char *alphabet; hasch=vcalloc (256, sizeof (int)); alphabet=vcalloc ( 257, sizeof (char)); for ( a=0; anseq; a++) { char s; s=Pred->seq_al[a][pos]; if (!is_gap(s)) { score[tolower(s)]++; tot++; } } if ( score['h']>score['i'] && score['h']>score['o'])cons='h'; else if ( score['i']>score['o'])cons='i'; else cons='o'; if (tot==0) return ""; if (mode==VERBOSE)sprintf (result, " H: %3d I: %3d O: %3d P: %c", (score['h']*100)/tot, (score['i']*100)/tot, (score['o']*100)/tot, cons); else if (mode == SHORT)sprintf ( result, "%c", cons); score['h']=score['o']=score['i']=0; return result; } Alignment * aln2hmmtop_pred (Alignment *A) { int a, b, c; char *buf, *pred; Alignment *PA; PA=copy_aln (A, NULL); buf=vcalloc ( A->len_aln+1, sizeof (char)); for ( a=0; a< A->nseq; a++) { sprintf (buf, "%s", A->seq_al[a]); pred=seq2tmstruc (buf); for (c=0,b=0; blen_aln; b++) { if (!is_gap (PA->seq_al[a][b]))PA->seq_al[a][b]=pred[c++]; } vfree (pred); } vfree (buf); return PA; } char * seq2tmstruc ( char *seq) { static Sequence *S; char *seqfile, *predfile, *buf; FILE *fp; seqfile=vtmpnam (NULL); predfile=vtmpnam (NULL); fp=vfopen (seqfile, "w"); fprintf ( fp, ">seq1\n%s", seq); vfclose (fp); printf_system ( "fasta_seq2hmmtop_fasta.pl -in=%s -out=%s -arch=%s/%s -psv=%s/%s", seqfile, predfile, get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv"); S=get_fasta_sequence (predfile, NULL); buf=vcalloc ( strlen (S->seq[0])+1, sizeof (char)); sprintf ( buf, "%s", S->seq[0]); free_sequence (S, S->nseq); return buf; } char * set_blast_default_values() { set_string_variable ("blast_server", (getenv ("blast_server_4_TCOFFEE"))?getenv ("blast_server_4_TCOFFEE"):"EBI"); set_string_variable ("pdb_db", (getenv ("pdb_db_4_TCOFFEE"))?getenv ("pdb_db_4_TCOFFEE"):"pdb"); set_string_variable ("prot_db", (getenv ("prot_db_4_TCOFFEE"))?getenv ("prot_db_4_TCOFFEE"):"uniprot"); set_int_variable ("prot_min_sim", 0); set_int_variable ("prot_max_sim", 100); set_int_variable ("prot_min_cov", 0); set_int_variable ("prot_max_cov", 100); set_int_variable ("pdb_min_sim", 0); set_int_variable ("pdb_max_sim", 100); set_int_variable ("pdb_min_cov", 0); set_int_variable ("pdb_max_cov", 100); return; } char * seq2pdb (Sequence *S) { set_blast_default_values(); S->nseq=1; S=seq2template_seq (S, "PDB", NULL); return seq2P_pdb_id(S,0); } Alignment * seq2blast ( Sequence *S) { Alignment *A; set_blast_default_values(); if (S->nseq==1) { S=seq2template_seq (S, "BLAST", NULL); A=seq2R_template_profile(S,0); sprintf ( A->name[0], "%s", S->name[0]); } else { int a; for (a=0; a< S->nseq; a++) { Sequence *NS; char name[1000]; NS=fill_sequence_struc(1, &(S->seq[a]), &(S->name[a])); NS=seq2template_seq (NS, "BLAST", NULL); A=seq2R_template_profile(NS,0); sprintf ( name, "%s.prf", S->name[a]); output_fasta_aln (name,A); fprintf (stdout, "\nOUTPUT %s\n", name); } exit (EXIT_SUCCESS); } return A; } Sequence * seq2unique_name_seq ( Sequence *S) { int a; if ((a=name_list2unique_name_list (S->nseq, S->name))) { add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]); } return S; } Alignment * aln2unique_name_aln ( Alignment *S) { int a; if ((a=name_list2unique_name_list (S->nseq, S->name))) { add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]); } return S; } int name_list2unique_name_list (int n, char **name) { int duplicate=0; int a, b; for (a=0; a%s\naggggg\n", name[a]); vfclose (fp); printf_system ("fasta_aln2fasta_aln_unique_name.pl %s > %s", tmp1, tmp2); S=get_fasta_sequence (tmp2, NULL); for (a=0; aname[a])+1)); sprintf ( name[a], "%s", S->name [a]); } free_sequence(S, -1); } return duplicate; } Sequence* seq2clean_seq (Sequence *S, char *alp) { int a, b, c, d, l; for (a=0; a< S->nseq; a++) { l=strlen (S->seq[a]); for (d=0,b=0; bseq[a][b]; if ( alp==NULL && !strchr (AA_ALPHABET, c) && !strchr (DNA_ALPHABET, c)); else if (alp && strchr (alp, c)); else S->seq[a][d++]=c; } S->seq[a][d]='\0'; S->len[a]=strlen (S->seq[a]); } return S; } int ** seq2aln_pos (Alignment *A, int *ns, int **l_s) { int **code; int a, b,c, d,l, p , g; l=MAX(strlen (A->seq_al[l_s[0][0]]), strlen (A->seq_al[l_s[1][0]])); code=declare_int ((A->S)->nseq,l+1); for (c=0; c<2; c++) { l=strlen (A->seq_al[l_s[c][0]]); for (d=0; dorder[l_s[c][d]][0]; for (p=0, b=0; bseq_al[l_s[c][d]][b]); if (!g){p++; code[a][p]=b+1;} } } } return code; } Alignment *local_maln2global_maln (char *seq, Alignment *A) { /*inputs a BLAST alignmnent where the master sequence may be partila outputs the same alignment, while amkeing sure the profile is perfectly in sink with its master sequence */ int a, b, c; int start, end, rend; char qname[100], *p; Alignment *B=NULL; sprintf ( qname, "%s", A->name[0]); p=strtok (qname, "_"); if ( !strm (p, "QUERY")) { fprintf ( stderr, "\nUnappropriate format for the alignment [%s:FATAL]", PROGRAM); myexit (EXIT_FAILURE); } start=atoi(strtok (NULL, "_")); end=atoi(strtok (NULL, "_")); rend=strlen (seq); B=copy_aln (A,NULL); if ( start>1 || endseq_al[0][a]=seq[a]; for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-'; } for (c=0,a=start-1; a< end; a++, c++) { A->seq_al[0][a]=seq[a]; for ( b=1; b< A->nseq; b++) { A->seq_al[b][a]=B->seq_al[b][c]; } } for ( a=end; aseq_al[0][a]=seq[a]; for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-'; } for ( a=0; a< A->nseq; a++) A->seq_al[a][rend]='\0'; free_aln (B); A->len_aln=rend; return A; } int ** aln2inv_pos ( Alignment *A) { int **pos,a; pos=vcalloc (A->nseq, sizeof (char*)); for (a=0; a< A->nseq; a++)pos[a]=seq2inv_pos (A->seq_al[a]); return pos; } int * seq2inv_pos ( char *seq) { /*returns a list where each value gives the index of the corresponding residue in seq*/ /*Numbering: 1 to L : Analogy to the aln2pos*/ int a,l1, l2; int *pos; l1=strlen ( seq); for ( l2=a=0; a< l1; a++)l2+=1-is_gap(seq[a]); pos=vcalloc (l2+1, sizeof (int)); for ( l2=a=0; a< l1; a++)if (!is_gap(seq[a]))pos[++l2]=a+1; return pos; } int ** aln2pos_simple_2 (Alignment *A) { int **pos1; int **pos2; pos1=aln2pos_simple (A, A->nseq); pos2=duplicate_int (pos1, A->nseq,read_size_int (pos1[0],sizeof (int))); pos1=aln2pos_simple (NULL, 0); return pos2; } int ** aln2pos_simple (Alignment *A, int n_nseq, ...) { /* function documentation: start int ** aln2pos_simple (Alignment *A, int n_nseq, ...) ####with two parameter only: Alignment *A, int n_nseq this function turns A into pos, a matrix where each residue is replace by its index according to the complete sequence. the indices in pos are computed using A->order[x][1] that contains the indice of the first residue of seq x of A n_nseq MUST not be null ####with more than two param: int ** aln2pos_simple (Alignment *A, int n_nseq, int *ns, int **ls) n_nseq must be set to 0 for the param 3 and four to be read ns[x]=number seq in group ls[x]=list of the sequences in group x ( size=ns[x]) The computation of the indices is only carried out on the scpecified residues ####IMPORTANT in pos, the numbering of the residues goes from 1 to L: pos[0][0]=3, means that the first position of the first sequence in the alignmnet contains residue #3 from sequence A->order[0][0]; function documentation: end */ int a, b,c, p, g,l; int **T; int max_nseq; int n_len=0; int *list=NULL; int *ns=NULL; int **ls=NULL; va_list ap; if ( A==NULL) { return NULL; } else { if ( n_nseq>0) { list=vcalloc(n_nseq, sizeof (int)); for ( a=0; a< n_nseq; a++)list[a]=a; } else { va_start (ap, n_nseq); ns=va_arg(ap, int * ); ls=va_arg(ap, int **); va_end(ap); list=vcalloc ( ns[0]+ns[1], sizeof (int)); n_nseq=0; for ( a=0; a< ns[0]; a++)list[n_nseq++]=ls[0][a]; for ( a=0; a< ns[1]; a++)list[n_nseq++]=ls[1][a]; } max_nseq=MAX(read_size_int(A->order,sizeof (int*)),return_max_int (A->order, read_size_int(A->order,sizeof (int*)),0))+1; n_len=get_longest_string ( A->seq_al,A->max_n_seq, NULL, NULL)+1; T=declare_int (max_nseq, n_len); for ( c=0; c< n_nseq; c++) { a=list[c]; l=strlen ( A->seq_al[a]); for ( p=A->order[a][1],b=0; bseq_al[a][b]); p+=g; T[a][b]=(g==1)?p:-(1+p); if ( A->seq_al[a][b]==UNDEFINED_RESIDUE)T[a][b]=0; if ( A->seq_cache && T[a][b]>0)T[a][b]=A->seq_cache[A->order[a][0]][T[a][b]]; } } vfree (list); } return T; } Alignment ** split_seq_in_aln_list ( Alignment **aln, Sequence *S, int n_seq, char **seq_list) { int a, b, c; char * long_seq=NULL; int len,l; int **translation; int **table; if ( aln==NULL)return NULL; translation=declare_int ( S->nseq,2); for (len=0,a=0; a< S->nseq; a++) { if((b=name_is_in_list (S->name[a],seq_list, n_seq, 100))!=-1) { l=strlen(S->seq[a])+1; long_seq=vrealloc(long_seq,(len+l+1)*sizeof(char)); long_seq=strcat(long_seq, S->seq[a]); long_seq=strcat(long_seq, "*"); translation[a][0]=b; translation[a][1]=len; len+=l; } else translation[a][0]=-1; } long_seq[len-1]='\0'; len--; table=declare_int ( len+1, 2); for ( b=0,a=0; a< S->nseq; a++) { if ( translation[a][0]!=-1) { c=1; while (long_seq[b]!='\0' && long_seq[b]!='*') { table[b+1][1]=c++; table[b+1][0]=translation[a][0]; b++; } table[b][1]=c++; table[b][0]=translation[a][0]; b++; } } for ( a=0; a< (aln[-1])->nseq; a++) { for ( b=0; b< (aln[a])->nseq; b++) { (aln[a])->order[b][0]=table[(aln[a])->order[b][1]][0]; (aln[a])->order[b][1]=table[(aln[a])->order[b][1]][1]; sprintf ( (aln[a])->name[b],"%s_%d_%d", S->name[(aln[a])->order[b][0]],a+1,b+1); } } free_int (translation, -1); free_int (table, -1); return aln; } Sequence * fill_sequence_struc ( int nseq, char **sequences, char **seq_name) { int a; Sequence *S; int shortest, longuest; if (!sequences) { shortest=longuest=0; } else if ( nseq>1) { shortest=get_shortest_string( sequences, nseq, NULL, NULL); longuest=get_longest_string (sequences, nseq, NULL, NULL); } else if ( nseq==1) { shortest=longuest=strlen (sequences[0]); } else { return NULL; } S=declare_sequence (shortest, longuest,nseq); S->nseq=nseq; if (sequences)S->seq=copy_char ( sequences, S->seq, nseq, -1); else S->seq=declare_char (S->nseq, 1); S->name=copy_char ( seq_name, S->name,nseq, -1); ungap_array (S->seq,nseq); for ( a=0; a< S->nseq; a++)S->len[a]=strlen(S->seq[a]); return S; } Alignment * thread_profile_files2aln (Alignment *A, char *template_file, Fname *F) { Alignment *P; int a; if (!A->S)A->S=aln2seq (A); if (template_file)A->S=seq2template_seq (A->S, template_file,F); for ( a=0; a< A->nseq; a++) { P=seq2R_template_profile (A->S, a); if ( P) { P->expand=1; sprintf ( P->name[0], "%s", A->name[a]); } } return expand_aln (A); } Alignment * expand_aln (Alignment *A) { /*This function expands the profiles within an alignment*/ int a, b, d, e; Alignment *MAIN=NULL, *SUB=NULL; int n_sub_seq=0; int new_nseq=0; int *list; Alignment *Profile; if ( !A)return A; list=vcalloc (A->nseq, sizeof (int)); for ( a=0; a< A->nseq; a++) { Profile=seq2R_template_profile (A->S, A->order[a][0]); if (Profile && Profile->expand) { new_nseq+=Profile->nseq; } else { new_nseq++; list[n_sub_seq++]=a; } } if ( n_sub_seq==A->nseq){vfree(list);return A;} else if (n_sub_seq==0){MAIN=copy_aln (A, MAIN);MAIN->nseq=0;} else { MAIN=extract_sub_aln (A, n_sub_seq, list); } vfree(list); for ( a=0; a< A->nseq; a++) { Profile=seq2R_template_profile (A->S, A->order[a][0]); if ( Profile && Profile->expand) { SUB=copy_aln (Profile,SUB); SUB=realloc_aln2(SUB, SUB->nseq, A->len_aln+1); for ( e=0,b=0; b< A->len_aln; b++) { if ( is_gap(A->seq_al[a][b])) {for (d=0; d< SUB->nseq; d++)SUB->seq_al[d][b]='-';} else { for(d=0; dnseq; d++)SUB->seq_al[d][b]=Profile->seq_al[d][e]; e++; } } MAIN=stack_aln(MAIN, SUB); } } free_aln (A); free_aln (SUB); return MAIN; } Alignment * expand_number_aln (Alignment *A,Alignment *EA) { /*This function expands the profiles within an alignment*/ int a, b, d, e; Alignment *MAIN=NULL, *SUB=NULL, *C=NULL; int n_sub_seq=0; int new_nseq=0; int *list; Alignment *Profile; if ( !EA || !A)return EA; if ( EA->nseqnseq) { fprintf (stderr, "\n[ERROR:expand_number_aln] Using as a master an expanded aln (%d %d) [FATAL:%s]", EA->nseq, A->nseq,PROGRAM); EA->A=A->A=NULL; print_aln (EA); print_aln (A); myexit (EXIT_FAILURE); } list=vcalloc (EA->nseq, sizeof (int)); for ( a=0; a< EA->nseq; a++) { Profile=seq2R_template_profile (EA->S, EA->order[a][0]); if (Profile && Profile->expand)new_nseq+=Profile->nseq; else { new_nseq++; list[n_sub_seq++]=a; } } if ( n_sub_seq==EA->nseq){vfree(list);return EA;} else if (n_sub_seq==0){MAIN=copy_aln (EA, MAIN);MAIN->nseq=0;} else { MAIN=extract_sub_aln (EA, n_sub_seq, list); } list[0]=EA->nseq; C=extract_sub_aln (EA,1, list); vfree(list); for ( a=0; a< EA->nseq; a++) { Profile=seq2R_template_profile (EA->S, EA->order[a][0]); if ( Profile && Profile->expand) { SUB=copy_aln (Profile,SUB); SUB=realloc_aln2(SUB, SUB->nseq, EA->len_aln+1); for ( e=0,b=0; b<= EA->len_aln; b++) { if (is_gap(A->seq_al[a][b])) { for ( d=0; dnseq; d++) SUB->seq_al[d][b]=NO_COLOR_RESIDUE; } else { for ( d=0; dnseq; d++) { if ( is_gap (Profile->seq_al[d][e])) { SUB->seq_al[d][b]=NO_COLOR_RESIDUE; } else SUB->seq_al[d][b]=EA->seq_al[a][b]; } e++; } } for (d=0; d< SUB->nseq; d++)SUB->score_seq[d]=EA->score_seq[a]; MAIN=stack_aln(MAIN, SUB); } } MAIN=stack_aln(MAIN, C); MAIN->nseq--; MAIN->score=MAIN->score_aln=EA->score_aln; free_aln (SUB); free_aln (EA); free_aln (C); return MAIN; } Alignment * probabilistic_rm_aa ( Alignment *A, int pos, int len) { int random_len=0; int a, b; int left, right; if ( len<0) { random_len=1; len=-len; } vsrand(0); if (pos==0)pos= (rand()%(A->len_aln-(2*len+len))) +len; for ( a=0; a< A->nseq; a++) { if (random_len)left =rand()%len; else left=len; if (random_len)right=rand()%len; else right=len; if ( (pos-right)<0 || (pos+left)>A->len_aln) { add_warning ( stderr, "\nWarning: probabilistic_rm_aa, pos out of range [%s]\n", PROGRAM); } else for ( b=pos-right; bseq_al[a][b]=(b==pos)?'~':'*'; } ungap_aln (A); free_sequence ( A->S, A->nseq); A->S=aln2seq (A); return A; } Alignment * remove_gap_column ( Alignment *A, char *mode) { int a, b; char *p; int *seq_list; int nseq=0; int keep_col, cl; seq_list =vcalloc ( A->nseq, sizeof (int)); while ( (p=strtok(mode, ":"))) { mode=NULL; if (p[0]=='#') { seq_list[nseq++]=atoi(p+1)-1; } else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1) { seq_list[nseq++]=a; } } if ( nseq==0) { for ( a=0; a< A->nseq; a++)seq_list[a]=a; nseq=A->nseq; } for ( cl=0,a=0; a<=A->len_aln; a++) { for (keep_col=1, b=0; b< nseq && keep_col; b++) { keep_col=(is_gap(A->seq_al[seq_list[b]][a]))?0:keep_col; } if ( keep_col) { for ( b=0; b< A->nseq; b++) { A->seq_al[b][cl]=A->seq_al[b][a]; } cl++; } else { for ( b=0; b< A->nseq; b++) { A->seq_al[b][cl]='-'; } cl++; } } A->len_aln=cl; vfree (seq_list); return A; } Alignment * ungap_sub_aln (Alignment *A, int ns, int *ls) { int a, b, c,t; int len; len=strlen ( A->seq_al[ls[0]]); for ( c=0,a=0; aseq_al[ls[b]][a]); if (t==ns); else { for ( b=0; bseq_al[ls[b]][c]=A->seq_al[ls[b]][a]; c++; } } for ( b=0; bseq_al[ls[b]][c]='\0'; return A; } Sequence * ungap_seq ( Sequence *S) { int a; if ( !S)return NULL; ungap(S->seq[0]); S->max_len=S->min_len=strlen (S->seq[0]); for ( a=0; a< S->nseq; a++) { ungap(S->seq[a]); S->len[a]=strlen (S->seq[a]); S->max_len=MAX(S->max_len,S->len[a]); S->min_len=MAX(S->min_len,S->len[a]); } return S; } Alignment * unalign_aln (Alignment *A, Alignment *C, int t) { int a, b, pos, len; Sequence *S; for (a=0; anseq; a++) for (b=0; blen_aln; b++) { int res=C->seq_al[a][b]; A->seq_al[a][b]=toupper(A->seq_al[a][b]); if ((isdigit (res) && (res-'0')<=t)) A->seq_al[a][b]=tolower(A->seq_al[a][b]); } for (pos=-1, a=0; anseq; a++) { b=0; while ( C->seq_al[a][b]) { int res=C->seq_al[a][b]; if ((isdigit (res) && (res-'0')<=t)) { if (pos==-1){pos=b;len=1;} else len++; } else if (pos!=-1) { C=unalign_aln_pos(C,a,pos, len); pos=-1; } b++; } if ( pos!=-1){C=unalign_aln_pos(C,a,pos, len);pos=-1;} } S=aln2seq (A); thread_seq_struc2aln (C, S); A=realloc_aln2 (A, A->nseq, C->len_aln+1); A->len_aln=C->len_aln; for (a=0; anseq; a++)sprintf ( A->seq_al[a], "%s", C->seq_al[a]); ungap_aln (A); free_sequence (S, -1); return A; } Alignment * unalign_aln_pos (Alignment *A, int s, int p, int l) { int a; char *buf; int unalign=0; buf=vcalloc (l+1, sizeof (char)); for (a=0; aseq_al[s][p+a]; A->seq_al[s][p+a]='-'; } A=insert_gap_col (A,p, l); for (a=0; aseq_al[s][p+a]=buf[a]; } vfree (buf); return A; } Alignment * insert_gap_col (Alignment *A, int p, int l) { int a, c; char *buf; char *gap; gap=generate_null(l); if ( !A || p>=A->len_aln || p<0 || p<=0)return A; buf=vcalloc (A->len_aln+l+1, sizeof (char)); A=realloc_aln2(A,A->nseq, A->len_aln+l+1); for (a=0; anseq; a++) { c=A->seq_al[a][p]; A->seq_al[a][p]='\0'; sprintf ( buf, "%s%s%c%s", A->seq_al[a],gap,c,A->seq_al[a]+p+1); sprintf (A->seq_al[a], "%s", buf); } vfree (buf); A->len_aln+=l; return A; } Alignment * unalign_residues (Alignment *A, int si1, int si2) { char *s1, *s2, *ns1, *ns2; int l, a, b,r1, r2; s1=A->seq_al[si1];s2=A->seq_al[si2]; l=strlen (s1); ns1=vcalloc (2*l+1, sizeof (char)); ns2=vcalloc (2*l+1, sizeof (char)); for (b=a=0; a< l; a++) { r1=s1[a]; r2=s2[a]; if (is_gap(r1) || is_gap(r2) || isupper (r1) || isupper(r2)) { ns1[b]=(r1=='.')?'-':r1; ns2[b]=(r2=='.')?'-':r2; b++; } else { ns1[b]=r1; ns2[b]='-'; b++; ns2[b]=r2; ns1[b]='-'; b++; } } ns1[b]='\0'; ns2[b]='\0'; A->seq_al[si1]=ns1; A->seq_al[si2]=ns2; A->len_aln=strlen (ns1); return A; } Alignment *degap_aln (Alignment *A) { //Reomove all the gaps int a; for ( a=0; a< A->nseq; a++)ungap (A->seq_al[a]); return A; } Alignment *ungap_aln_n ( Alignment *A, int p) { /*remove all the columns of gap-only within an alignment*/ int a, b, c; int t; int gp; if ( A->nseq==0)return A; for ( c=0,a=0; a< A->len_aln; a++) { for ( t=0,b=0; bnseq; b++) t+=is_gap(A->seq_al[b][a]); gp=(t*100)/A->nseq; if (p>0 && (gp>=p || (t==A->nseq && p==100) || (t && p==1)));//Remove columns containing more than p% gaps else if (p<0 && (gp<=p || (t==0 && p==-100) ||(t && p==-1)));//remove columns containing less than p% gaps else { for ( b=0; bnseq; b++) A->seq_al[b][c]=A->seq_al[b][a]; c++; } } for ( b=0; bnseq; b++)A->seq_al[b][c]='\0'; A->len_aln=c; return A; } Alignment *ungap_aln ( Alignment *A) { return ungap_aln_n (A, 100); } /* Alignment *ungap_aln ( Alignment *A) { int a, b, c,t; for ( c=0,a=0; a< A->len_aln; a++) { for ( t=0,b=0; bnseq; b++) t+=is_gap(A->seq_al[b][a]); if (t==A->nseq); else { for ( b=0; bnseq; b++) A->seq_al[b][c]=A->seq_al[b][a]; c++; } } for ( b=0; bnseq; b++)A->seq_al[b][c]='\0'; A->len_aln=c; return A; } */ Alignment *remove_end (Alignment *A) { int a, b, d; int left, right; for (a=0; a< A->len_aln; a++) { for ( b=0, d=0; b< A->nseq; b++) if ( !is_gap(A->seq_al[b][a]))d++; if ( d>1)break; } left=a; for (a=A->len_aln-1; a>0; a--) { for ( b=0, d=0; b< A->nseq; b++) if ( !is_gap(A->seq_al[b][a]))d++; if ( d>1)break; } right=a; return extract_aln(A, left, right+1); } Alignment* condense_aln (Alignment *A) { /* condense complementarz columns: X- X -X ....>X X- X */ int a, b, plen, n,m, r1, r2; plen=0; while ( A->len_aln !=plen) { plen=A->len_aln; for ( a=0; a< A->len_aln-1; a++) { for ( n=m=b=0; b< A->nseq; b++) { r1=is_gap(A->seq_al[b][a]); r2=is_gap(A->seq_al[b][a+1]); n+=(r1 || r2); m+=r1; } if ( n==A->nseq && m!=A->nseq) { for (b=0; b< A->nseq; b++) { if (!is_gap(A->seq_al[b][a+1])) { A->seq_al[b][a]=A->seq_al[b][a+1]; A->seq_al[b][a+1]='-'; } } a++; } } } A=ungap_aln(A); return A; } void compress_aln ( Alignment *A) { /*remove all the columns of gap-only within an alignment*/ int a, b, c, d; for (c=0, a=0; a< A->len_aln; a++) { for ( b=0, d=0; b< A->nseq; b++) if ( A->seq_al[b][a]!='-'){d=1; break;} if ( d==0); else { for (b=0; b< A->nseq; b++) A->seq_al[b][c]=A->seq_al[b][a]; c++; } } A->len_aln=c; for ( a=0; a< A->nseq; a++) A->seq_al[a][c]='\0'; } Alignment *seq_coor2aln ( Sequence *S, Alignment *A, int **coor, int nseq) { int a; char *buf; A=realloc_alignment2(A, nseq, return_maxlen ( S->seq, S->nseq)+1); for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]); for ( a=0; a< nseq; a++) { sprintf (A->name[a], "Repeat_%d_%d", a, coor[a][0]); buf=extract_char ( S->seq[coor[a][0]], coor[a][1]-1, coor[a][2]); sprintf ( A->seq_al[a],"%s", buf); vfree(buf); A->order[a][0]=0; A->order[a][1]=coor[a][1]-1; } A->nseq=nseq; return A; } Alignment *strings2aln (int nseq,...) { /*strings2aln(nseq, , , , ....)*/ va_list ap; char **list, **list2; char **name, **name2; Sequence *S; Alignment *A; int a, max; va_start(ap, nseq); list=vcalloc (nseq, sizeof (char*)); name=vcalloc (nseq, sizeof (char*)); for ( a=0; a< nseq; a++) { name[a]=va_arg(ap,char*); list[a]=va_arg(ap,char*); } va_end(ap); for ( max=0,a=0; a< nseq; a++) { max=(strlen (list[a])>max)?strlen(list[a]):max; } list2=declare_char (nseq, max+1); name2=declare_char (nseq, MAXNAMES+1); for ( a=0; a< nseq; a++) { sprintf ( list2[a], "%s", list[a]); sprintf ( name2[a], "%s", name[a]); } S=fill_sequence_struc(nseq,list2,name2); free_char (list2, -1); free_char (name2, -1); vfree (list); vfree(name); A=seq2aln(S,NULL, 1); return A; } Alignment *seq2aln ( Sequence *S, Alignment *A,int rm_gap) { int a; A=realloc_alignment2(A, S->nseq, S->max_len+1); for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]); A->nseq=S->nseq; A->max_len=S->max_len; A->min_len=S->min_len; for ( a=0; a< S->nseq; a++) { A->order[a][0]=a; A->order[a][1]=0; sprintf ( A->seq_comment[a], "%s", S->seq_comment[a]); sprintf ( A->aln_comment[a], "%s", S->aln_comment[a]); sprintf ( A->name[a], "%s", S->name[a]); sprintf ( A->seq_al[a], "%s", S->seq[a]); ungap ( A->seq_al[a]); A->len[a]=strlen ( A->seq_al[a]); if ( rm_gap==0 || rm_gap==NO_PAD)sprintf ( A->seq_al[a], "%s", S->seq[a]); } if (rm_gap!=NO_PAD)padd_aln (A); A->S=S; return A; } Alignment *padd_aln ( Alignment *A) { A->seq_al=padd_string (A->seq_al, A->nseq, '-'); A->len_aln=strlen (A->seq_al[0]); return A; } char **padd_string ( char **string, int n,char pad) { /*Pads a the strings so that they all have the same length*/ int max_len, a; char *buf; max_len=get_longest_string (string,n, NULL, NULL); for (a=0; anseq; a++) sprintf (A->name[a], "tmpname_%d", seqindex++); R=copy_aln (A, NULL); for (c=0, a=0; a< A->len_aln; a++) { if ( is_gap (A->seq_al[0][a])); else { for ( b=0; bnseq; b++) R->seq_al[b][c]=A->seq_al[b][a]; c++; } } for ( a=0; a< A->nseq; a++)R->seq_al[a][c]='\0'; R->len_aln=c; R->S=aln2seq (R); free_aln (S); free_aln (P); free_aln (A); return R; } Alignment * add_align_seq2aln ( Alignment *A, char *seq, char *seq_name) { if ( !A) { A=declare_aln (NULL); A=realloc_aln2 ( A, 1, strlen (seq)+1); A->nseq=0; sprintf ( A->name[A->nseq], "%s", seq_name); sprintf ( A->seq_al[A->nseq], "%s", seq); A->nseq++; } else if ( strlen (seq)!=A->len_aln) { fprintf ( stderr, "\nError: Attempt to stack incompatible aln and aligned sequence[FATAL]\n"); myexit (EXIT_FAILURE); A=NULL; } else { A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1); sprintf ( A->name[A->nseq], "%s", seq_name); sprintf ( A->seq_al[A->nseq], "%s", seq); A->nseq++; } return A; } Alignment *aln2number (Alignment *A) { A->seq_al=char_array2number(A->seq_al, A->nseq); return A; } Sequence *seq2number (Sequence *A) { A->seq=char_array2number(A->seq, A->nseq); return A; } Sequence * aln2seq (Alignment *A) { return aln2seq_main(A, RM_GAP); } Sequence * aln2seq_main (Alignment *A, int mode) { Sequence *LS; int a; int maxlen; if ( !A) return NULL; else if ( A->nseq==0)return NULL; for (maxlen=0,a=0; anseq; a++)maxlen=MAX(maxlen, strlen (A->seq_al[a])); LS=declare_sequence ( maxlen+1, maxlen+1, A->nseq); LS->nseq=A->nseq; for ( a=0; a< LS->nseq; a++) { sprintf (LS->file[a],"%s", A->file[a]); sprintf ( LS->seq[a], "%s", A->seq_al[a]); if (mode==RM_GAP)ungap ( LS->seq[a]); LS->len[a]=strlen ( LS->seq[a]); sprintf ( LS->seq_comment[a], A->seq_comment[a]); sprintf ( LS->aln_comment[a], A->aln_comment[a]); sprintf ( LS->name[a], "%s", A->name[a]); } return LS; } Sequence *keep_residues_in_seq ( Sequence *S, char *list, char replacement) { Alignment *A=NULL; int a; A=seq2aln (S, A,1); A=keep_residues_in_aln ( A, list, replacement); for ( a=0; a< A->nseq; a++) { ungap (A->seq_al[a]); sprintf ( S->seq[a], "%s", A->seq_al[a]); } free_aln (A); return S; } Alignment *aln2short_aln ( Alignment *A, char *list, char *new, int spacer) { int a, b, r, cl, l; char *buf; for ( a=0; a< A->nseq; a++) { buf=vcalloc ( strlen (A->seq_al[a])+1, sizeof (char)); for (l=0,cl=0, b=0; b< A->len_aln; b++) { r=A->seq_al[a][b]; if ( is_gap(r)); else if ( is_in_set (r, list)) { if (cl){cl=0; buf[l++]=new[0];} buf[l++]=r; } else { if ( cl==spacer){buf[l++]=new[0];cl=0;} cl++; } } buf[l]='\0'; sprintf (A->seq_al[a], "%s", buf); vfree (buf); } return A; } Alignment *keep_residues_in_aln ( Alignment *A, char *list, char replacement) { return filter_keep_residues_in_aln (A,NULL, 0, -1, list, replacement); } Alignment *filter_keep_residues_in_aln ( Alignment *A,Alignment *ST, int use_cons, int value, char *list, char replacement) { char **sl; int n, a; n=strlen (list); sl=declare_char (n+1, 256); for (a=0; a< n; a++) sprintf ( sl[a], "%c%c", list[a], list[a]); sprintf ( sl[a],"#%c", replacement); A=filter_aln_convert (A, ST,use_cons,value, n+1, sl); free_char (sl, -1); return A; } Alignment *filter_convert_aln ( Alignment *A,Alignment *ST, int use_cons, int value, int n, ...) { va_list ap; char **sl; int a; va_start (ap, n); sl=vcalloc ( n,sizeof(char*)); for ( a=0; a< n; a++) { sl[a]=va_arg(ap, char * ); } va_end(ap); A=filter_aln_convert (A,ST,use_cons,value, n,sl); vfree(sl); return A; } Alignment * filter_aln ( Alignment *A, Alignment *ST, int value) { return filter_aln_convert (A, ST,0,value,DELETE, NULL); } Alignment * filter_aln_switchcase ( Alignment *A, Alignment *ST,int use_cons, int value) { return filter_aln_convert (A, ST,0,value,SWITCHCASE, NULL); } Alignment * filter_aln_upper_lower ( Alignment *A, Alignment *ST,int use_cons, int value) { return filter_aln_convert (A, ST,use_cons,value, LOWER, NULL); } Alignment * filter_aln_lower_upper ( Alignment *A, Alignment *ST,int use_cons, int value) { return filter_aln_convert (A, ST,use_cons,value, UPPER, NULL); } Alignment * STseq2STaln ( Alignment *A, Alignment *ST) { int a, i=0; if (ST && ST->len_aln !=A->len_aln) { Sequence *S_T, *S_A; S_T=aln2seq (ST); S_A=aln2seq (A); for (a=0; a< A->nseq; a++) { i=name_is_in_list (A->name[a], S_T->name,S_T->nseq, 100); if (i!=-1) { char *s1, *s2; s1=(S_T)->seq[i];ungap(s1); s2=(S_A)->seq[a];ungap(s2); if ( strlen (s1)!=strlen(s2)) { fprintf ( stderr, "%s\n%s\n", s1, s2); printf_exit (EXIT_FAILURE, stderr, "ERROR: Sequence %s has different length in the alignment and in the structure Alignment [FATAL:%s]\n", A->name[a], PROGRAM); } } } ST=copy_aln (A, ST); thread_seq_struc2aln (ST,S_T); } return ST; } Alignment * merge_annotation ( Alignment *A, Alignment *ST, char *seq) { int s, a, b; ST=STseq2STaln (A, ST); if ( seq==NULL)s=0; else s=name_is_in_list ( seq, A->name, A->nseq, 100); if (s==-1) { add_warning ( stderr, "\nERROR: %s is not in your MSA [FATAL: %s]", PROGRAM); myexit (EXIT_FAILURE); } for (a=0; alen_aln; a++) { int t, r; t=A->seq_al[s][a]; if (is_gap (t))continue; for (b=0; bnseq; b++) { t=A->seq_al[s][a]; r=ST->seq_al[b][a]; if ( isdigit (r)) { if (!isdigit(t) || (isdigit (t) && tseq_al[s][a]=r; } } } return A; } Alignment * filter_aln_convert ( Alignment *A, Alignment *ST,int use_cons, int value, int n_symbol,char **symbol_list) { int a, b, c; int st; int cons=0; ST=STseq2STaln (A, ST); if ( ST && use_cons) { cons=name_is_in_list ("con", ST->name,ST->nseq+1, 100); if ( cons==-1)cons=name_is_in_list ("cons", ST->name,ST->nseq+1, 100); if ( cons==-1)cons=name_is_in_list ("Cons", ST->name,ST->nseq+1, 100); if ( cons==-1) { use_cons=0; fprintf (stderr, "WARNING: Could Not Use the Consensus Sequence [WARNING:%s]\n", PROGRAM); } } A->residue_case=KEEP_CASE; for ( a=0; a< A->nseq; a++) { if(value!=10 && ST && !use_cons) { c=name_is_in_list (A->name[a], ST->name, ST->nseq,100); if (c==-1)st=11; } for ( b=0; b< A->len_aln; b++) { if ( value==10 || !ST)st=11; else if ( ST && use_cons) { st=(isdigit(ST->seq_al[cons][b]))?ST->seq_al[cons][b]-'0':ST->seq_al[cons][b]; } else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b]; if ( st==value || value==-1 || st==NO_COLOR_RESIDUE) { if ( n_symbol==UPPER && !symbol_list)A->seq_al[a][b]=toupper (A->seq_al[a][b]); else if ( n_symbol==LOWER && !symbol_list)A->seq_al[a][b]=tolower (A->seq_al[a][b]); else if ( n_symbol==SWITCHCASE && !symbol_list) { if ( !isalpha(A->seq_al[a][b])); else if (isupper (A->seq_al[a][b]))A->seq_al[a][b]=tolower (A->seq_al[a][b]); else if (islower (A->seq_al[a][b]))A->seq_al[a][b]=toupper (A->seq_al[a][b]); } else if ( n_symbol==DELETE && !symbol_list)A->seq_al[a][b]='-'; else { A->seq_al[a][b]=convert(A->seq_al[a][b],n_symbol,symbol_list); } } } } return A; } char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c); char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c) { static Alignment *I; static Alignment *O; int a, b, o, i; float tp,tn,fp,fn,best, sp, sn, sen2; float best_pred=-1; int best_motif=0; int n1; static char ***alp; static int *alp_size; char ***motif_list; int n; if (!I) { I=copy_aln(A, NULL); O=copy_aln(A, NULL); } I->nseq=O->nseq=I->len_aln=O->len_aln=0; for (a=0; alen_aln; a++) { if (pos[a]) { for (i=o=0,b=0; bnseq; b++) { if ( is_gap(A->seq_al[b][a]))return 0; if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a]; else O->seq_al[o++][O->len_aln]=A->seq_al[b][a]; } I->len_aln++; O->len_aln++; } } if (O->len_aln==0 || I->len_aln==0) return 0; O->nseq=o; I->nseq=i; for (a=0; aseq_al[a][O->len_aln]='\0'; for (a=0; aseq_al[a][I->len_aln]='\0'; alp=vcalloc ( sizeof (char**), I->len_aln); alp_size= vcalloc ( I->len_aln, sizeof (int)); for (a=0; alen_aln; a++) { char *col; alp[a]=string2alphabet ( (col=aln_column2string (I,a)),2, &alp_size[a]); vfree (col); } motif_list=generate_array_string_list (I->len_aln, alp, alp_size, &n, NULL, OVERLAP); best_pred=best_motif=0; for (a=0; anseq; b++) { if (match_motif (I->seq_al[b], motif_list[a]))tp++; else fn++; } for (b=0; bnseq; b++) { if (match_motif (O->seq_al[b], motif_list[a]))fp++; else tn++; } rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best); if (best> best_pred) { best_pred=best; best_motif=a; } } output_Alignment_without_header ( I, stdout); fprintf ( stdout, "\n"); output_Alignment_without_header ( O, stdout); fprintf ( stdout, "\nMotifCompound %d pred: %.2f motif: ", c, best_pred); for (n1=0, a=0; alen_aln; a++) { char *m; int l; m=motif_list[best_motif][a]; fprintf ( stdout, "[%s]-", m); l=strlen (m); n1+=(l==1 && !strm ("*",m) )?1:0; } fprintf (stdout, "SCORE: %d", n1); for (a=0; alen_aln) { fprintf ( stdout, "\n W:"); for (a=0; alen_aln; a++)fprintf ( stdout, "%d", array[a]); fprintf ( stdout, " %.4f",(float)sar_aln2r(A,B,array,0)); return; } else { for ( a=0; alen_aln=w; for ( a=0; alen_aln-w;a++) { for (b=0; bnseq; b++) { for (c=0; cseq_al[b][c]=B->seq_al[b][a+c]; } S->seq_al[b][c]='\0'; } s=search_best_combo (A, S); fprintf ( stdout,"\nP: XXXX \nP: XXXXX A=%d / %d", a, B->len_aln); } } float search_best_combo(Alignment *A, Alignment *B) { int a, b, c, d, best_pos,nl, max; float best_score, score; int *list, *pos; int w; int combo_mode=1; //1: greedy 2: consider all thw w combinations; FILE *fp2; static int **M; max=2; int delta=0; w=1; pos=vcalloc ( A->len_aln, sizeof (int)); list=vcalloc (A->len_aln, sizeof (int)); nl=0; if ( combo_mode==1) { for (a=0; a< max; a++) { for (best_score=-9999,best_pos=0,b=0; b< A->len_aln-w; b++) { for (c=0; cbest_score) { best_score=score; best_pos=b; } for (c=0; clen_aln-1, 1,NULL, tmpf); printf_system ( "cp %s testfile", tmpf); buf=vcalloc ( 1000, sizeof (char)); fp=vfopen (tmpf, "r"); best_score=-99999; n_preset=0; preset=vcalloc (A->len_aln, sizeof (int)); preset[n_preset++]=353; preset[n_preset++]=361; //preset[n_preset++]=365; //preset[n_preset++]=187; //preset[n_preset++]=397; //preset[n_preset++]=492; while ( (buf=vfgets ( buf, fp))!=NULL) { array=string2num_list (buf); for (a=1; a<=max; a++) { pos[array[a]]=1; } for ( a=0; abest_score) { best_score=score; fprintf ( stdout, "\n"); for (a=0; alen_aln; c++) { sar_aln2motif (A,B,pos, c); } exit (0); HERE ("***************"); fp2=vfopen ("aln.aln", "w"); for (a=0; anseq; a++) { fprintf (fp2, ">%s\n", A->name[a]); for ( b=0; bseq_al[a][list[b]]); fprintf ( fp2, "\n"); } vfclose (fp2); HERE ("Output aln.aln"); if (1) { float tp=0, tn=0, fp=0, fn=0, pp2=0,pp=0, sn,sn2, sp; int **result,**result2,**compound_score, *ref_score,n2,n, s, p, c; Alignment *AI, *AO; int simI, simO; compound_score=declare_int (B->len_aln, 2); ref_score=vcalloc (nl, sizeof (int)); result=declare_int (B->len_aln*A->nseq*A->nseq, 2); result2=declare_int (B->len_aln*A->nseq*A->nseq, 2); for (n2=c=0; c< B->len_aln; c++) { int sar1, sar2; pp=tp=tn=fp=fn=0; if (!M)M=read_matrice ("blosum62mt"); for (n=0,a=0; anseq-1; a++) { for (b=a+1; bnseq;b++) { for (s=0,p=0; pseq_al[a][list[p]]; r2=A->seq_al[b][list[p]]; if ( !is_gap (r1) && !is_gap(r2))s+=M[r1-'A'][r2-'A']; } result2[n2][0]=result[n][0]=s; sar1=B->seq_al[a][c];sar2=B->seq_al[b][c]; if (sar1=='I' && sar1==sar2) { result2[n2][1]=result[n][1]=1; pp++;pp2++; n++;n2++; } else if ( sar1==sar2 && sar1=='O') { ; } else { result2[n2][1]=result[n][1]=0; n++;n2++; } //else if ( s1==s2=='O')result[n][1]=-1; } } if (pp==0)continue; sort_int_inv (result, 2, 0, 0, n-1); for (tp=0,a=0; alen_aln-1); fp2=vfopen ("compound.fasta", "w"); for (d=0; dnseq; a++) for (b=0; bnseq; b++) { r1= A->seq_al[b][list[d]]; r2= A->seq_al[b][list[d]]; if (is_gap(r1) || is_gap(r2))continue; else { ref_score[d]+=M[r1-'A'][r2-'A']; n++; } } ref_score[d]/=n; } AO=copy_aln (A, NULL); AI=copy_aln (A,NULL); AO->len_aln=AI->len_aln=nl; for (a=0; anseq; a++)AO->seq_al[a][nl]=AI->seq_al[a][nl]='\0'; for (a=0; alen_aln; a++) { fprintf (stdout, "\n>%4d %4d ", compound_score[a][0], compound_score[a][1]); for (b=0; bnseq; b++) fprintf (stdout, "%c", B->seq_al[b][compound_score[a][0]]); fprintf ( stdout, "\n"); for (AI->nseq=0,b=0; bnseq; b++) { if (B->seq_al[b][compound_score[a][0]]=='O')continue; fprintf ( stdout, "\n\t"); for (c=0; cseq_al[b][list[c]]); AI->seq_al[AI->nseq][c]=A->seq_al[b][list[c]]; } AI->nseq++; } fprintf ( stdout, "\n\t"); for (d=0; dnseq; b++) { if (B->seq_al[b][compound_score[a][0]]=='O')continue; for (c=0; cnseq; c++) { if (B->seq_al[c][compound_score[a][0]]=='O')continue; { int r1, r2; r1= A->seq_al[b][list[d]]; r2= A->seq_al[b][list[d]]; if (is_gap(r1) || is_gap(r2))continue; else score+=M[r1-'A'][r2-'A']; n++; } } } score/=n; if ((float)score/(float)ref_score[d]>1.2)fprintf ( stdout, "*"); else fprintf ( stdout, " "); } for (AO->nseq=0,b=0; bnseq; b++) { if (B->seq_al[b][compound_score[a][0]]=='I')continue; fprintf ( stdout, "\n\t"); for (c=0; cseq_al[AO->nseq][c]=A->seq_al[b][list[c]]; fprintf ( stdout, "%c", A->seq_al[b][list[c]]); } AO->nseq++; } simI=aln2sim (AI, "blosum62mt"); simO=aln2sim (AO, "blosum62mt"); fprintf ( stdout, "\nDELTA: I: %d O: %d %d",simI,simO, simI-simO); delta+=simI-simO; } for ( a=0; anseq; a++) { fprintf ( fp2, ">%s\n", B->name[a]); for (b=0; blen_aln/2; b++) fprintf ( fp2, "%c", B->seq_al[a][compound_score[b][0]]); fprintf (fp2, "\n"); } vfclose (fp2); HERE ("OUTPUT compound.fasta"); result=result2; n=n2; pp=pp2; sort_int_inv (result, 2, 0, 0, n-1); for (tp=0,a=0; anseq; b++) C->seq_al[b][a]=A->seq_al[b][list[a]]; C->len_aln=nl; array=vcalloc (C->len_aln, sizeof (int)); explore_weight_matrix (C, B, 6,0, array); */ return best_score; } void count_misc (Alignment *A, Alignment *B) { int **done, a, b, c, d, e,f, g, *list, n, score; double **slist, *r; int *pos; int w=1; search_best_combo (A,B); exit (0); pos=vcalloc (A->len_aln+1, sizeof (int)); /* pos[354]=1; pos[362]=1; pos[366]=1; pos[398]=1; pos[476]=1; fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));exit (0); */ for (a=0; a< A->len_aln-w; a++) { for (c=0; clen_aln-1; a++) { pos[a-w]=0; pos[a]=1; fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a, (float)sar_aln2r(A,B,pos,0)); } exit (0); pos[2]=1; pos[3]=1; explore_weight_matrix (A, B,3, 0,pos); exit (0); for (a=0; alen_aln; a++) for ( b=0; blen_aln; b++) for (c=0; clen_aln; c++) for (d=0; dlen_aln; d++) for (f=0; flen_aln; f++) for (g=0; glen_aln; g++) { e=0; pos[e++]=a; pos[e++]=b; pos[e++]=c; pos[e++]=d; pos[e++]=f; pos[e++]=g; pos[e++]=-1; fprintf ( stdout, "\n%d %d %d %d %d %d %.3f", a, b,c,d,f, g, sar_aln2r(A,B, pos,0)); } exit (0); slist=declare_double (A->nseq*A->nseq*10, 2); done=declare_int (256, 256); list=vcalloc ( A->nseq, sizeof (int)); for (a=0; alen_aln-1; a++) { for (b =0; b<256; b++)for (c=0; c<256; c++)done[b][c]=0; for (b=0; bnseq-1; b++) { int r1, r2; r1=A->seq_al[b][a]; r2=A->seq_al[b][a+1]; if (done[r1][r2])continue; n=0; done[r1][r2]=1; list[n++]=b; fprintf ( stdout, "\n%3d %c%c: %s ",a+1, r1, r2, A->name[b]); for ( c=b+1; cnseq; c++) { if (r1==A->seq_al[c][a] && r2==A->seq_al[c][a+1]) { fprintf ( stdout, "%s ", A->name[c]); list[n++]=c; } } if (B && n>1) { for (e=0,score=0,c=0; cseq_al[list[c]], B->seq_al[list[d]]); fprintf ( stdout, " Score=%d", score/e); } } } for (score=0,e=0,a=0; anseq-1; a++) for (b=a+1; bnseq; b++,e++) { score+=get_sar_sim2(B->seq_al[a], B->seq_al[b]); } fprintf (stdout,"AVG=%d", score/e); for (n=0,a=0; a< A->nseq-1; a++) { static int **M; int sim; if (!M)M=read_matrice ("blosum62mt"); for (b=a+1; bnseq; b++) { int n11, n01, n10, n00, n1; for (sim=d=0;dlen_aln; d++) { int r1, r2; r1=A->seq_al[a][d]; r2=A->seq_al[b][d]; sim+=(r1==r2)?1:0; //sim +=(M[r1-'A'][r2-'A']>0)?1:0; } sim=(100*sim)/(A->len_aln);//+rand()%10; for (n1=n00=n11=n10=n01=score=0, d=0; dlen_aln; d++) { int r1, r2; r1=B->seq_al[a][d]; r2=B->seq_al[b][d]; n11+=(r1=='I' && r2=='I'); n00+=(r1=='O' && r2=='O'); n10+=(r1=='I' && r2=='0'); n01+=(r1=='O' && r2=='I'); n1+=(r1=='I' || r2=='I'); } score =((n11+n00)*100)/B->len_aln; //score=get_sar_sim2(B->seq_al[a], B->seq_al[b]); fprintf ( stdout, "\nSIM: %d SC: %d", sim, score); slist[n][0]=(double)sim; slist[n][1]=(double)score; n++; } } r=return_r(slist, n); fprintf ( stdout, "\nR= %.4f", (float)r[0]); exit (0); } int aln2ngap ( Alignment *A) { int ngap=0, a, b; for (a=0; a< A->len_aln; a++) for (b=0; bnseq; b++) ngap+=is_gap (A->seq_al[b][a]); return ngap; } int * count_in_aln ( Alignment *A, Alignment *ST, int value, int n_symbol,char **symbol_list, int *table) { int a, b, c=0, d; int st; if (!table)table=vcalloc (n_symbol, sizeof (int)); A->residue_case=KEEP_CASE; for ( a=0; a< A->nseq; a++) { if(value!=10 && ST)for ( c=0; c< ST->nseq; c++)if ( strm(ST->name[c], A->name[a]))break; for ( b=0; b< A->len_aln; b++) { if ( value==10 || !ST)st=11; else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b]; if ( st==value || value==-1) { for ( d=0; dseq_al[a][b], symbol_list[d]); } } } return table; } char *dna_aln2cons_seq ( Alignment *A) { int a, b, best; static int **column_count; static int **old_tot_count; static int **new_tot_count; static char *string1, *string2; int **count_buf; char r1, r2,*seq; int NA=0, NG=1, NC=2, NT=3, IGAP=4; static int MAX_EST_SIZE=10000; static int size_increment=1000; static int first; int overlap=0, best_overlap=0; seq=vcalloc ( A->len_aln+1, sizeof (char)); if (!column_count ) { column_count=vcalloc(MAX_EST_SIZE, sizeof (int*)); for ( a=0; a< MAX_EST_SIZE; a++) column_count[a]=vcalloc (5, sizeof (int)); old_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*)); new_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*)); A->P=declare_profile( "agct-",MAX_EST_SIZE); string1=vcalloc (MAX_EST_SIZE, sizeof (char)); string2=vcalloc (MAX_EST_SIZE, sizeof (char)); } else if (A->len_aln>MAX_EST_SIZE) { if ( column_count) { for ( a=0; a< MAX_EST_SIZE; a++) vfree(column_count[a]); vfree(column_count); vfree(old_tot_count); vfree(new_tot_count); vfree(string1); vfree(string2); } column_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*)); for ( a=0; a< MAX_EST_SIZE+ size_increment; a++) column_count[a]=vcalloc (5, sizeof (int)); old_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*)); new_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*)); for (a=0; a< MAX_EST_SIZE; a++) { old_tot_count[a]=*(column_count++); for ( b=0; b<5; b++)old_tot_count[a][b]=(A->P)->count[b][a]; } free_int ( (A->P)->count, -1); (A->P)->count=declare_int (5, MAX_EST_SIZE+ size_increment); (A->P)->max_len=MAX_EST_SIZE+ size_increment; MAX_EST_SIZE+= size_increment; string1=vcalloc (MAX_EST_SIZE, sizeof (char)); string2=vcalloc (MAX_EST_SIZE, sizeof (char)); } sprintf ( string1, "%s",A->seq_al[0]); sprintf ( string2, "%s",A->seq_al[1]); string1=mark_internal_gaps(string1,'.'); string2=mark_internal_gaps(string2,'.'); for (b=0,a=0; a< A->len_aln; a++) { r1=string1[a]; r2=string2[a]; if ( r1==r2) { overlap++; } else { best_overlap=MAX(overlap, best_overlap); overlap=0; } if (!is_gap(r1) && first==1)new_tot_count[a]=old_tot_count[b++]; else if (is_gap(r1) || first==0){new_tot_count[a]=*column_count;column_count++;}; if ( first==0) { if(r1=='a') new_tot_count[a][NA]++; else if ( r1=='g')new_tot_count[a][NG]++; else if ( r1=='c')new_tot_count[a][NC]++; else if ( r1=='t')new_tot_count[a][NT]++; else if (is_gap(r1)); else { new_tot_count[a][NA]++; new_tot_count[a][NG]++; new_tot_count[a][NC]++; new_tot_count[a][NT]++; } } if ( a> 0 && alen_aln-1 && r1=='.') { new_tot_count[a][IGAP]+=((new_tot_count[a-1][NA]+new_tot_count[a-1][NG]+new_tot_count[a-1][NC]+new_tot_count[a-1][NT])); } if(r2=='a') new_tot_count[a][NA]++; else if ( r2=='g')new_tot_count[a][NG]++; else if ( r2=='c')new_tot_count[a][NC]++; else if ( r2=='t')new_tot_count[a][NT]++; else if ( r2=='.')new_tot_count[a][IGAP]++; else if ( r2=='-'); else { new_tot_count[a][NA]++; new_tot_count[a][NG]++; new_tot_count[a][NC]++; new_tot_count[a][NT]++; } (A->P)->count[0][a]=new_tot_count[a][NA]; (A->P)->count[1][a]=new_tot_count[a][NG]; (A->P)->count[2][a]=new_tot_count[a][NC]; (A->P)->count[3][a]=new_tot_count[a][NT]; (A->P)->count[4][a]=new_tot_count[a][IGAP]; best_int(4,1, &best,new_tot_count[a][NA], new_tot_count[a][NG],new_tot_count[a][NC],new_tot_count[a][NT]); if( best==0) seq[a]='a'; else if ( best==1)seq[a]='g'; else if ( best==2)seq[a]='c'; else if ( best==3)seq[a]='t'; } first=1; seq[a]='\0'; fprintf ( stderr, "[Best Overlap: %d Residues]", best_overlap); count_buf=old_tot_count; old_tot_count=new_tot_count; new_tot_count=count_buf; return seq; } char *aln2cons_maj ( Alignment *A, int ns, int *ls, int n_groups, char **group_list) { char *seq; int a, b; int len; int clean_ls=0; static int *aa; if ( !aa) aa=vcalloc (1000, sizeof (int)); len=strlen (A->seq_al[ls[0]]); seq=vcalloc (len+1, sizeof (char)); if ( ns==0) { ns=A->nseq; ls=vcalloc ( A->nseq, sizeof (int)); for ( a=0; a< A->nseq; a++)ls[a]=a; clean_ls=1; } for ( a=0; aseq_al[ls[b]][a]); aa[r]++; if (!is_gap(r) && aa[r]>best_s) { best_s=aa[r]; best_aa=r; } seq[a]=best_aa; } for (best_s=0, best_aa=0,b=0; b< ns; b++) { aa[tolower(A->seq_al[ls[b]][a])]=0; } } if ( clean_ls)vfree(ls); seq[a]='\0'; return seq; } char *aln2cons_seq ( Alignment *A, int ns, int *ls, int n_groups, char **group_list) { char *seq; int a, b, c; int best_group=0; int aa_group=0; int *group; int len; int clean_ls=0; len=strlen (A->seq_al[ls[0]]); seq=vcalloc (len+1, sizeof (char)); if ( ns==0) { ns=A->nseq; ls=vcalloc ( A->nseq, sizeof (int)); for ( a=0; a< A->nseq; a++)ls[a]=a; clean_ls=1; } if ( !group_list) { group_list=declare_char ( 26, 2); for ( a=0; a<26; a++)group_list[a][0]=a+'a'; n_groups=26; aa_group=1; } for ( a=0; aseq_al[ls[b]][a])) { for (c=0; c< n_groups; c++) if ( is_in_set (tolower(A->seq_al[ls[b]][a]), group_list[c])) {group[c]++; best_group=(group[c]>group[best_group])?c:best_group; } } seq[a]=group_list[best_group][0]; } vfree (group); } seq[a]='\0'; if ( aa_group) free_char (group_list, -1); if ( clean_ls)vfree(ls); return seq; } Alignment *aln2conservation ( Alignment *A, int threshold,char *seq) { int a, b, c, d, i, c1, c2; int *pos; float *eval; float tot=0; float tn=0; int **sim; int w=0; pos =vcalloc (A->len_aln, sizeof (int)); eval=vcalloc (A->len_aln, sizeof (int)); sim=aln2sim_mat (A, "idmat"); if (seq)i=name_is_in_list (seq, A->name, A->nseq, 100); else i=0; if ( i==-1) {HERE ("%s is an unknown:sequence [FATAL]"); exit (EXIT_FAILURE);} for (a=0; alen_aln; a++) { double s; int e; for (c=0,e=a-w; e<=a+w; e++) { if (e<0 || e==A->len_aln)continue; c1=toupper (A->seq_al[i][e]); for (b=0; bnseq; b++) { c2=toupper (A->seq_al[b][a]); if (c1==c2) { c++; s=(double)((double)sim[i][b]/(double)(100)); } else { s=(double)(((double)100-(double)sim[i][b])/(double)(100)); } eval[a]+=(s==0)?0:log(s); } } pos[a]=(c*100)/A->nseq; if (!is_gap(c1)){tot+=pos[a]; tn++;} if (pos[a]>=threshold)A->seq_al[i][a]=toupper (A->seq_al[i][a]); else A->seq_al[i][a]=tolower (A->seq_al[i][a]); } fprintf (stdout, ">%s %s [i=%d]\n%s\n", A->name[i],A->aln_comment[i],i, A->seq_al[i]); tot=(tn>0)?(float)tot/(float)tn:0; for (d=0,a=0; alen_aln; a++) { fprintf (stdout, "# %c %4d", A->seq_al[i][a],pos[a]); if ( !is_gap (A->seq_al[i][a])) { fprintf (stdout, " LogOdd: %6.2f ", (tot==0 || pos[a]==0)?0:(float)log((float)pos[a]/tot)); fprintf ( stdout, " Pos: %5d E-Val: %9.2f", ++d, eval[a]/(A->nseq)); } fprintf ( stdout, "\n"); } fprintf ( stdout, "#average conservation: %.2f", tot); exit (EXIT_SUCCESS); } char *aln2cons_seq_mat ( Alignment *A, char *mat_name) { return sub_aln2cons_seq_mat (A, A->nseq, NULL, mat_name); } char *sub_aln2cons_seq_mat2 ( Alignment *A,int ns, char **ls, char *mat_name) { char *cons; int *list; list=name_array2index_array(ls, ns, A->name, A->nseq); cons=sub_aln2cons_seq_mat ( A,ns, list, mat_name); vfree (list); return cons; } char *sub_aln2cons_seq_mat ( Alignment *A,int ns, int *ls, char *mat_name) { int a, b, c, s; char *seq, r1, r2; int **mat; int score=0, best_score=0, best_r=0; int len; int naa; mat=read_matrice (mat_name); len=strlen ( A->seq_al[(ls==NULL)?0:ls[0]]); seq=vcalloc (len+1, sizeof (char)); for ( a=0; aseq_al[s][a]))continue; else { naa++; r2=A->seq_al[s][a]; score+=mat[r1-'A'][r2-'A']; } } if (naa==0)best_r='-'; if ( b==0 || score>best_score){best_score=score; best_r=r1;} } seq[a]=best_r; } free_int (mat, -1); return seq; } int seq_list2in_file ( TC_method *M, Sequence *S, char *list, char *file) { X_template *T=NULL; if ( !S)return 0; else { int t; t=tolower(M->seq_type[0]); if ( t=='s') { return seq_list2fasta_file ( S, list, file); } else { FILE *fp, *fp2; int a, n, s, c; int *slist; fp=vfopen ( file, "w"); slist=string2num_list (list); n=slist[0]; if (strlen (M->seq_type) >1) { add_warning( stderr, "\nERROR: Mixed seq_type not supported for external methods\n[FATAL:%s]", PROGRAM); } for ( a=2; aT[s])->P; else if (t=='r')T=(S->T[s])->R; else if (t=='g')T=(S->T[s])->G; if (!T && t=='r') { fprintf ( fp, ">%s\n%s%s", S->name[s], S->seq[s], LINE_SEPARATOR); } else if ( T && T->template_file && T->template_file[0]) { fp2=vfopen (T->template_file, "r"); while ( (c=fgetc (fp2))!=EOF) { fprintf ( fp, "%c", c); } fprintf (fp, "%s", LINE_SEPARATOR); vfclose (fp2); } } fprintf (fp, "TARGET_SEQ_NAME: "); for (a=2; aname[slist[a]])); fprintf ( fp, "%s", LINE_SEPARATOR); vfclose (fp); vfree (slist); } return 1; } } int seq_list2fasta_file( Sequence *S, char *list, char *file) { FILE *fp; int n, a, s; static char *buf; static int blen; int l; /*Buf is used because cmalloced functions cannot go through strtok*/ if ( !S)return 0; else { fp=vfopen ( file, "w"); if ( !list) { for ( a=0; anseq; a++) { fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[a], CODE),S->name[a], S->seq[a]); } } else { l=strlen (list); if ( l>blen) { if (buf)vfree(buf); buf=vcalloc ( strlen (list)+1, sizeof (char)); sprintf ( buf, "%s", list); blen=l; } n=atoi(strtok (list,SEPARATORS)); for ( a=0; a< n; a++) { s=atoi(strtok (NULL, SEPARATORS)); fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[s], CODE), S->name[a],S->seq[s]); } } vfclose (fp); } return 1; } Structure * seq2struc ( Sequence *S, Structure *ST) { int a, b; for ( a=0; a< S->nseq; a++) for ( b=0; b< S->len[a]; b++) ST->struc[a][b+1][ST->n_fields-1]=S->seq[a][b]; return ST; } void aln2struc (Alignment *A, Structure *ST) { int a, b, c; for ( a=0; a< A->nseq; a++) for (c=0, b=0; b< A->len_aln; b++) { if ( !is_gap (A->seq_al[a][b])) { ST->struc[a][c][ST->n_fields-1]=A->seq_al[a][b]; c++; } } } Alignment *stack_aln (Alignment *A, Alignment *B) { int a,b; int max_len=0, max_nseq=0; if ( B==NULL)return A; if ( A==NULL)return B; max_nseq=A->nseq+B->nseq; for (a=0; a< A->nseq; a++)max_len=MAX(strlen(A->seq_al[a]),max_len); for (a=0; a< B->nseq; a++)max_len=MAX(strlen(B->seq_al[a]),max_len); A=realloc_aln2 ( A,max_nseq,max_len+1); for (a=A->nseq,b=0; b< B->nseq; b++, a++) { sprintf ( A->seq_comment[a] , "%s", B->seq_comment[b]); sprintf ( A->aln_comment[a] , "%s", B->aln_comment[b]); sprintf ( A->seq_al [a] , "%s", B->seq_al [b]); sprintf ( A->name [a] , "%s", B->name[b]); sprintf ( A->file [a], "%s" , B->file[b]); A->order[a][0]=B->order[b][0]; A->order[a][1]=B->order[b][1]; A->score_seq[a]=B->score_seq[b]; A->len[a]=B->len[b]; } A->len_aln=MAX(A->len_aln, B->len_aln); A->nseq=A->nseq+B->nseq; A->score_aln=A->score_aln+B->score_aln; A->finished=A->finished+B->finished; return A; } Alignment *chseqIaln(char *name, int seq_n, int start,int len,Sequence *S, int seqIaln, Alignment *A) { char *seq; seq=extract_char ( S->seq[seq_n], start, len); A=realloc_aln2 (A, (A==NULL)?(seqIaln+1):MAX(A->nseq,seqIaln+1), ((A==NULL)?(strlen (seq)):MAX(strlen (seq),A->len_aln))+1); sprintf ( A->seq_al[seqIaln], "%s",seq); A->order[seqIaln][0]=seq_n; A->order[seqIaln][1]=start; sprintf ( A->name[seqIaln], "%s", name); A->nseq=MAX(A->nseq, seqIaln+1); A->len_aln=return_maxlen(A->seq_al, A->nseq); A->S=S; vfree (seq); return A; } Alignment * aln_gap2random_aa(Alignment *A) { int a, b,l; char alp[200]; if (strm ( (A->S)->type, "PROTEIN")) sprintf ( alp, "acefghiklmnpqrstuvwy"); else if ( strm ( (A->S)->type, "DNA") ||strm ( (A->S)->type, "RNA") ) sprintf ( alp, "agct"); l=strlen (alp); for (a=0; anseq; a++) for ( b=0; blen_aln; b++) if ( is_gap (A->seq_al[a][b]))A->seq_al[a][b]=alp[(int)rand()%(l)]; return A; } Alignment * make_random_aln(Alignment *A,int nseq, int len, char *alphabet) { int a; A=realloc_aln2(A, nseq, len+1); A->nseq=0; A->len_aln=len; for ( a=0; a< A->nseq; a++)sprintf ( A->file[a], "random alignment"); for ( a=0; a< nseq; a++) A=add_random_sequence2aln(A,alphabet); return A; } Alignment * add_random_sequence2aln( Alignment *A, char *alphabet) { int a, n; vsrand(0); n=strlen(alphabet); A=realloc_alignment2 (A, A->nseq+1, A->len_aln+1); for ( a=0; a< A->len_aln; a++)A->seq_al[A->nseq][a]=alphabet[rand()%n]; if (! A->name[A->nseq][0]) { for ( a=0; a<10; a++)A->name[A->nseq][a]=alphabet[rand()%n]; A->name[A->nseq][a]='\0'; } A->nseq++; return A; } Sequence *get_defined_residues( Alignment *A) { char *buf; Sequence *S; int a, b, s, l, r; if ( !A || !A->S) return NULL; S=duplicate_sequence (A->S); for ( a=0; a< S->nseq; a++) for ( b=0; b< S->len[a]; b++)S->seq[a][b]=UNDEFINED_RESIDUE; buf=vcalloc(A->len_aln+1,sizeof (char)); for ( a=0; a< A->nseq; a++) { sprintf ( buf, "%s",A->seq_al[a]); ungap(buf); l=strlen (buf); s=A->order[a][0]; for ( b=1; b<= l; b++) { r=A->seq_cache[s][b]; if ( r>=0)S->seq[s][r-1]=(A->S)->seq[s][r-1]; } } vfree(buf); return S; } Alignment *thread_defined_residues_on_aln ( Alignment *A, Sequence *S1) { int a, b; int gap, r,s, r2; for ( a=0; a< A->nseq; a++) { s=A->order[a][0]; r=A->order[a][1]; for (b=0;b< A->len_aln; b++) { gap=is_gap(A->seq_al[a][b]); if (!gap) { r+=!gap; r2=A->seq_cache[s][r]-1; if (r2>=0 && S1->seq[s][r2]==UNDEFINED_RESIDUE) A->seq_al[a][b]=UNDEFINED_RESIDUE; } } } return A; } int ** trim_aln_borders (char **seq1, char **seq2, int nseq) { int a, b, c,l1,l2; char *buf1; char *buf2; int max; max=MAX(get_longest_string (seq1,-1, NULL, NULL),get_longest_string (seq2,-1, NULL, NULL))+1; buf1=vcalloc ( max, sizeof(char)); buf2=vcalloc ( max, sizeof(char)); for ( a=0; a< nseq; a++) { sprintf ( buf1, "%s", seq1[a]); sprintf ( buf2, "%s", seq2[a]); ungap (buf1); ungap (buf2); if (str_overlap ( buf1, buf2,'*')!=0) { l1=strlen ( seq1[a]); l2=strlen ( seq2[a]); for ( b=0,c=0; c< l1; c++) if ( !is_gap(seq1[a][c]))seq1[a][c]=buf1[b++]; seq1[a][c]='\0'; for ( b=0,c=0; c< l2; c++) if ( !is_gap(seq2[a][c]))seq2[a][c]=buf2[b++]; seq2[a][c]='\0'; } } vfree (buf1); vfree (buf2); return NULL; } Sequence * merge_seq ( Sequence *IN, Sequence *OUT) { int a; if ( OUT==NULL)return duplicate_sequence (IN); else { if ( IN && check_list_for_dup( IN->name, IN->nseq)) { fprintf ( stderr, "\nERROR: %s is duplicated in file %s[FATAL]\n", check_list_for_dup( IN->name, IN->nseq), IN->file[0]); myexit (EXIT_FAILURE); } for ( a=0; a< IN->nseq; a++) if ((OUT=add_sequence ( IN, OUT, a))==NULL)return NULL; return OUT; } } Alignment *seq_name2removed_seq_name(Sequence *S, Alignment *NA, float **diff) { int a, b, rb, s; float min_diff; for (a=0; a< S->nseq; a++) { if (name_is_in_list( S->name[a], NA->name, NA->nseq, 100)!=-1) continue; for ( min_diff=100, s=0, b=0; b< NA->nseq; b++) { rb=name_is_in_list ( NA->name[b], S->name, S->nseq, 100); if ( diff[a][rb]seq_comment[s], " "); strcat ( NA->seq_comment[s], S->name[a]); } return NA; } int seq_name2index (char *name, Sequence *S) { if ( !S) return -1; else return name_is_in_list ( name, S->name, S->nseq, MAXNAMES+1); } char * seq_name2coor ( char *s, int *start, int *end, char sep) { /*name|start|end */ char n1[100], n2[100]; int a=0, b=0, c=0; n1[0]=n2[0]='\0'; start[0]=end[0]=0; while ( s[a]!=sep && s[a]!='\0')a++; if ( s[a]=='\0')return s; else s[a++]='\0'; while ( s[a]!=sep && s[a]!='\0')n1[b++]=s[a++]; if ( s[a]=='\0'){n1[b]='\0';if ( n1[0])start[0]=atoi(n1);return s;} else s[a++]=n1[b]='\0'; while ( s[a]!=sep && s[a]!='\0')n2[c++]=s[a++]; n2[c]='\0'; if ( n1[0])start[0]=atoi(n1); if ( n2[0])end[0]=atoi(n2); return s; } Sequence *extract_one_seq(char *n,int start, int end, Alignment *S, int keep_name) { int seq, a; FILE*fp; char *name; Sequence *OUT_S; if ( n[0]=='#')seq=S->nseq; else if ( (seq=name_is_in_list (n, S->name, S->nseq, 100)+1)!=0); else if (is_number (n) && (seq=atoi(n))!=0) seq=atoi(n); else { fprintf ( stderr, "\nCould not find Sequence %s [FATAL]", n); myexit (EXIT_FAILURE); } seq--; name=vtmpnam ( NULL); fp=vfopen ( name, "w"); if ( start && end &&!keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start, end); else if ( start && end==0 && !keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start,(int)strlen ( S->seq_al[seq])); else fprintf (fp, ">%s\n", S->name[seq]); if ( start==0 && end==0){fprintf (fp, "%s\n", S->seq_al[seq]);} else if (end==0){fprintf (fp, "%s\n", S->seq_al[seq]+start-1);} else { for ( a=start-1; aseq_al[seq][a]);} fprintf ( fp, "\n"); } vfclose (fp); OUT_S=get_fasta_sequence_num (name, NULL); return OUT_S; } Sequence * extract_sub_seq( Sequence *COOR, Sequence *S) { int a, b, c,s; int start, end; for ( a=0; a< S->nseq; a++) { if ( (s=name_is_in_list ( S->name[a], COOR->name, COOR->nseq, 100))!=-1) { sscanf ( COOR->seq_comment[s], "%d %d", &start, &end); for (c=0,b=start-1; b< end; b++, c++)S->seq[a][c]=S->seq[a][b]; S->seq[a][c]='\0'; sprintf ( S->seq_comment[a], "%s",COOR->seq_comment[s]); } } S=reorder_seq ( S, COOR->name, COOR->nseq); return S; } char * aln_column2string (Alignment *A, int p) { char *s; int a; if (p>=A->len_aln) { HERE ("ERROR: index (p=%d) loger than aln (l=%d) [FATAL]", p, A->len_aln); exit (EXIT_FAILURE); } else { s=vcalloc (A->nseq+1, sizeof (char)); for (a=0; a< A->nseq; a++)s[a]=A->seq_al[a][p]; } return s; } Alignment * fix_aln_seq ( Alignment *A, Sequence *S) { int a, b, c; char *buf1, *buf2; int g0, g1, nr0, nr1; int id, tot; Alignment *B; /*This function establishes the correspondance between every (1..N+1) residue of each aligned sequence and its correspondance in S: A->seq_cache[a][b]=x means that residue b of aligned sequence a corresponds to residue x of the sequence with tye same index in S A->seq_cache[a][b]=0 means there is no correspondance. a is the index of the sequence Applying this function is needed for turning an alignment into a constraint list */ if ( S==NULL)return A; A->seq_cache=declare_int ( S->nseq, MAX((A->len_aln+1), S->max_len+1)); for (a=0; a< S->nseq; a++) for ( b=0; b< A->len_aln; b++)A->seq_cache[a][b]=-1; buf1=buf2=NULL; for ( a=0; a< S->nseq; a++) { for (b=0; b< A->nseq; b++) { if (strm ( S->name[a], A->name[b])) { A->order[b][0]=a; vfree (buf1); buf1=vcalloc ( A->len_aln+1, sizeof (char)); sprintf (buf1, "%s", A->seq_al[b]); ungap (buf1); upper_string (buf1); vfree(buf2); buf2=vcalloc (strlen(S->seq[a])+1, sizeof (char)); sprintf (buf2, "%s",S->seq[a]); ungap (buf2); upper_string (buf2); if ( strm (buf1,buf2)) { for ( c=0; clen[a]; c++)A->seq_cache[a][c+1]=c+1; } else { B=align_two_sequences (buf2,buf1,"blosum62mt",-4,-1, "myers_miller_pair_wise"); if ( getenv ("DEBUG_RECONCILIATION")) { fprintf (stderr, "\n[DEBUG_RECONCILIATION:fix_aln_seq]\nReconciliation of %s\nA=Ref_sequence\nB=New_seq", S->name[a]); print_aln (B); } for (id=0, tot=0,nr0=0,nr1=0,c=0; clen_aln; c++) { g0=is_gap(B->seq_al[0][c]); g1=is_gap(B->seq_al[1][c]); nr0+=1-g0; nr1+=1-g1; if ( !g0 && !g1) { tot++; id+=(B->seq_al[0][c]==B->seq_al[1][c])?1:0; A->seq_cache[a][nr1]=nr0; } else if (g0 && !g1) { A->seq_cache[a][nr1]=0; } } if ( ((id*100)/tot)<20) { print_aln (B); fprintf ( stderr, "\nTwo different sequences have the same name: %s", S->name[a]); fprintf ( stderr, "\nIf %s is a PDBID, Make sure it identifies the right chain (A, B, 1, 2...)", S->name[a]); fprintf ( stderr, "\nChain number or index must be added to the PDB id (i.e. 1gowA)"); fprintf ( stderr, "\nIf You want to use %s anyway, rename it with a non-PDB identifier such as seq_%s\n",S->name[a],S->name[a]); myexit (EXIT_FAILURE); } free_sequence ( B->S, -1); free_aln (B); } } } } vfree(buf1);vfree(buf2); return A; } Sequence * add_prf2seq ( char *file, Sequence *S) { char **new_seq; Sequence *NS; if ( !is_aln (file)&& !is_seq (file))return S; else { X_template *R; Alignment *A; R=fill_R_template(file,file, S); A=(R->VR)->A; ((R->VR)->A)->expand=1; new_seq=declare_char (1,A->len_aln+1); sprintf ( new_seq[0], "%s",aln2cons_seq_mat(A, "blosum62mt")); NS=fill_sequence_struc(1, new_seq,A->file); S=add_sequence (NS, S, 0); (S->T[S->nseq-1])->R=R; free_sequence (NS, NS->nseq); free_char( new_seq, -1); return S; } } int prf_in_seq ( Sequence *S) { int a; if ( !S) return 0; else { for ( a=0; a< S->nseq; a++) if (seq2R_template_profile(S, a)) return 1; } return 0; } Sequence * add_sequence ( Sequence *IN, Sequence *OUT, int i) { int s, a; char *buf; if (OUT==NULL) { OUT=duplicate_sequence (IN); return OUT; } for (a=0; anseq; a++) { Alignment *P; P=seq2R_template_profile (OUT, a); if (!P) continue; else if (name_is_in_list (IN->name[i], P->name, P->nseq, 100)!=-1) return OUT; } /*Adds sequence i of IN at the end of OUT*/ if ((s=name_is_in_list ( IN->name[i], OUT->name, OUT->nseq,STRING))==-1 ) { OUT=realloc_sequence (OUT, OUT->nseq+1, IN->len[i]); sprintf ( OUT->name[OUT->nseq],"%s",IN->name[i]); sprintf ( OUT->file[OUT->nseq],"%s",IN->file[i]); sprintf ( OUT->seq_comment[OUT->nseq],"%s",IN->seq_comment[i]); sprintf ( OUT->aln_comment[OUT->nseq],"%s",IN->aln_comment[i]); sprintf ( OUT->seq[OUT->nseq],"%s",IN->seq[i]); OUT->len[OUT->nseq]=IN->len[i]; OUT->T[OUT->nseq][0]=IN->T[i][0]; OUT->nseq++; return OUT; } else if ( s!=-1 && !case_insensitive_strcmp ( IN->seq[i], OUT->seq[s])) { if ( getenv4debug("DEBUG_RECONCILIATION"))fprintf ( stderr,"[DEBUG_RECONCILIATION:add_sequence]\n%s\n%s\n", IN->seq[i], OUT->seq[s]); add_warning (stderr, "WARNING: DISCREPANCY:%s in [%s] and [%s]\n", IN->name[i], IN->file[i], OUT->file[s]); if (((buf=build_consensus(IN->seq[i], OUT->seq[s],"cfasta_pair_wise" ))!=NULL)||((buf=build_consensus(IN->seq[i], OUT->seq[s],"myers_miller_pair_wise" ))!=NULL)) { OUT->max_len=MAX(OUT->max_len, strlen(buf)); OUT->min_len=MIN(OUT->min_len, strlen(buf)); OUT->seq =realloc_char ( OUT->seq, -1, -1,OUT->nseq,OUT->max_len+1); sprintf ( OUT->seq[s],"%s",buf); OUT->len[s]=strlen (buf); vfree (buf); return OUT; } else { fprintf ( stderr, "IMPOSSIBLE TO RECONCILIATE SOME SEQUENCES[FATAL:%s]\n", PROGRAM); print_aln ( align_two_sequences (IN->seq[i], OUT->seq[s], "idmat", 0, 0, "fasta_pair_wise")); myexit (EXIT_FAILURE); return NULL; } } else { return OUT; } } Sequence * trim_seq ( Sequence *A, Sequence *B) { int a; Sequence *R; if (A->nseq>B->nseq) { Sequence *I; I=A;A=B;B=I; } R=declare_sequence (MIN(A->min_len,B->min_len), MAX(A->max_len, B->max_len), MIN(A->nseq, B->nseq)); R->nseq=0; for (a=0; a< A->nseq; a++) { if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING+1)!=-1) { sprintf ( R->name[R->nseq], "%s", A->name[a]); sprintf ( R->seq[R->nseq], "%s", A->seq[a]); sprintf ( R->file[R->nseq], "%s", A->file[a]); sprintf ( R->aln_comment[R->nseq], "%s", A->aln_comment[a]); sprintf ( R->seq_comment[R->nseq], "%s", A->seq_comment[a]); R->len[R->nseq]=A->len[a]; R->nseq++; } } return R; } Sequence * trim_aln_seq ( Alignment *A, Alignment *B) { int a; static char **name_list; int n=0; Sequence *SA, *SB; int **cache_A=NULL; int **cache_B=NULL; int * p; /*This function inputs two alignments A and B It removes sequences that are not common to both of them It rearange the sequences so that they are in the same order A decides on the order The Sequences (A->S) and (B->S) are treated the same way Sequences are also merged in order to detects discrepencies. A pointer to S is returned */ if (name_list)free_char (name_list, -1); name_list=declare_char (MAX(A->nseq, B->nseq), STRING+1); for ( a=0; a< A->nseq; a++) { if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1) { sprintf ( name_list[n++], "%s", A->name[a]); } } reorder_aln ( A, name_list, n); if (A->seq_cache)cache_A=duplicate_int (A->seq_cache, -1, -1); if (B->seq_cache)cache_B=duplicate_int (B->seq_cache, -1, -1); reorder_aln ( B, name_list, n); for ( a=0; a< n; a++) { if ( cache_A) { p=A->seq_cache[A->order[a][0]]; A->seq_cache[A->order[a][0]]=cache_A[a]; cache_A[a]=p; } if ( cache_B) { p=B->seq_cache[B->order[a][0]]; B->seq_cache[B->order[a][0]]=cache_B[a]; cache_B[a]=p; } A->order[a][0]=B->order[a][0]=a; } free_int(A->seq_cache, -1); free_int(B->seq_cache, -1); A->seq_cache=cache_A; B->seq_cache=cache_B; SA=aln2seq(A); SB=aln2seq(B); A->S=B->S=merge_seq (SA, SB); return A->S; } Sequence * trim_aln_seq_name ( Alignment *A, Alignment *B) { int a; Sequence *S; /*This function inputs two alignments A and B It removes sequences that are not common to both of them It rearange the sequences so that they are in the same order A decides on the order */ S=declare_sequence ( 1, 1, A->nseq+B->nseq); S->nseq=0; for ( a=0; a< A->nseq; a++) { if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1) { sprintf ( S->name[S->nseq++], "%s", A->name[a]); } } return S; } char ** rm_name_tag (char **name, int nseq, char *tag) { int a , b, ntag; char **tag_list; char *s; char **template_list; if ( !name )return NULL; tag_list=declare_char (10, 4); if ( tag) { ntag=1; sprintf ( tag_list[0], "%s", tag); } else { ntag=0; sprintf ( tag_list[ntag++], "_S_"); sprintf ( tag_list[ntag++], "_G_"); } template_list=declare_char (nseq, 100); for ( a=0; a%s _%s_ %s", name[a], s+1, s+3); break; } } } free_char (tag_list, -1); return template_list; } Sequence * swap_header ( Sequence *S, Sequence *H) { int a, b, n; for ( a=0; a< S->nseq; a++) { if ( (n=name_is_in_list (S->name[a],H->name, H->nseq, 1000))!=-1) { char **list; list=string2list (H->seq_comment[n]); if ( list==NULL || atoi(list[0])==1)continue; S->seq_comment[a]='\0'; sprintf (S->name[a], "%s%s%s",H->name[n], list[1], list[2]); vfree ( S->seq_comment[a]);S->seq_comment[a]=vcalloc ( strlen (H->seq_comment[n])+1, sizeof (char)); for (b=3; b< atoi(list[0]); b++)S->seq_comment[a]=strcat (S->seq_comment[a], list[b]); free_char (list, -1); } } return S; } Sequence * profile_seq2template_seq ( Sequence *S, char *template_file, Fname *F) { /*This function fetches potential templates associated with sequences within a profile*/ int i; Alignment *A; char *tmp; if ( !check_file_exists (template_file)) return S; tmp=vtmpnam (NULL); for ( i=0; i< S->nseq; i++) { if ( (A=seq2R_template_profile (S, i))) { printf_system ("cp %s %s", template_file, tmp);//seq2template over-writes the temnplate file with a list of the templates effectively encounter A->S=aln2seq (A); A->S=seq2template_seq (A->S, tmp, F); if (!A->S)return NULL; } } return S; } Sequence * seq2template_type(Sequence *Seq) { //add template int a, e; int s; struct X_template *S=NULL; struct X_template *P=NULL; struct X_template *R=NULL; struct X_template *G=NULL; struct X_template *F=NULL; struct X_template *T=NULL; struct X_template *E=NULL; struct X_template *U=NULL; Alignment *A; e=' '; for (a=0; a< Seq->nseq; a++) { if (!Seq->T[a])continue; //HERE ADD a Template P=seq_has_template (Seq, a, "_P_"); S=seq_has_template (Seq, a, "_S_"); R=seq_has_template (Seq, a, "_R_"); G=seq_has_template (Seq, a, "_G_"); F=seq_has_template (Seq, a, "_F_"); T=seq_has_template (Seq, a, "_T_"); E=seq_has_template (Seq, a, "_E_"); U=seq_has_template (Seq, a, "_U_"); s=(!P)?1:0; sprintf ( (Seq->T[a])->seq_type, "%c%c%c%c%c%c%c%c", (P)?'P':e, (S)?'S':e, (S &&!P)?'s':e,(R)?'R':e, (G)?'G':e,(T)?'T':e,(E)?'E':e,(U)?'U':e); if (R && (A=seq2R_template_profile (Seq,a))) { A->S=seq2template_type ( A->S); } } return Seq; } char * string_contains_template_tag (char *string_in) { char string[100]; if ( strstr (string, "_P_"))return "_P_"; if ( strstr (string, "_S_"))return "_S_"; if ( strstr (string, "_R_"))return "_R_"; if ( strstr (string, "_G_"))return "_G_"; if ( strstr (string, "_F_"))return "_F_"; if ( strstr (string, "_T_"))return "_T_"; if ( strstr (string, "_E_"))return "_E_"; if ( strstr (string, "_U_"))return "_U_"; return NULL; } static int check_blast_is_installed (char *server); static int check_blast_is_installed (char *server) { if (strm (server, "EBI")); else if ( strm (server, "NCBI")) return check_program_is_installed (NCBIWEBBLAST_4_TCOFFEE,NULL, NULL,NCBIWEBBLAST_ADDRESS, INSTALL_OR_DIE); else if ( strm (server, "LOCAL")) return check_program_is_installed (NCBIBLAST_4_TCOFFEE,NULL, NULL,NCBIBLAST_ADDRESS, INSTALL_OR_DIE); return 1; } Sequence * vremove_seq_template_files(Sequence *S) { return handle_seq_template_file (S, "remove"); } Sequence * display_seq_template_files(Sequence *S) { return handle_seq_template_file (S, "display"); } Sequence * handle_seq_template_file (Sequence *S, char *mode) { int a; Template *T; for (a=0; a< S->nseq; a++) { T=S->T[a]; if (T) { handle_X_template_files (T->P, mode); handle_X_template_files (T->F, mode); handle_X_template_files (T->R, mode); handle_X_template_files (T->T, mode); handle_X_template_files (T->E, mode); } } return S; } int handle_X_template_files ( X_template *T, char *mode) { if (!T)return 0; if ( strm (mode, "remove")) { vremove (T->template_file); vremove (T->template_name); } else if (strm (mode, "display")) { char buf[100]; sprintf ( buf, "Template %s", template_type2type_name (T->template_type)); if (check_file_exists (T->template_name))display_output_filename ( stdout,buf,T->template_format,T->template_name, STORE); } else { printf_exit (EXIT_FAILURE, stderr, "\nUnkonwn mode %s for template handling [FATAL:%s]", mode, PROGRAM); } return 1; } Sequence * seq2template_seq ( Sequence *S, char *template_list, Fname *F) { /*Expected format for the template file: >seq_name _X_ Target_template X: S for Structures G for genomes (Exoset) When alternative templates are given for a sequence, the first one superseeds all the others */ /*Fill the sequences*/ /*1: No template*/ char buf[1000]; int PmC,PmI,PMI; int BmC,BmI,BMI; char *server; char *pdb_db,*prot_db; int remove_template_file=0; remove_template_file=get_int_variable ("remove_template_file"); server=get_string_variable ("blast_server"); pdb_db=get_string_variable ("pdb_db"); prot_db=get_string_variable ("prot_db"); PmI=get_int_variable ("pdb_min_sim"); PMI=get_int_variable ("pdb_max_sim"); PmC=get_int_variable ("pdb_min_cov"); BmI=get_int_variable ("prot_min_sim"); BMI=get_int_variable ("prot_max_sim"); BmC=get_int_variable ("prot_min_cov"); if ( (template_list && template_list[0]=='\0') || strm ( template_list, "no_template")) { return S; } else if ( strstr (template_list, "MODE_"))//pre_set mode { return seq2template_seq ( S,template_list+strlen ("MODE_"),F); } else if ( strm ( template_list, "SSP")|| strm ( template_list, "GOR")) { /*use GOR to Predict the secondary structure*/ check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#ssp_template@seq#%s/%s@obs#%s/%s@cache#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir()); S=seq2template_seq (S,buf, F); return S; } else if ( strm ( template_list, "PSISSP") || strm (template_list, "PSIGOR")) { /*Computes a GOR consensus on a psi-blast output*/ check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE); check_blast_is_installed(server); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psissp_template@seq#%s/%s@obs#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir(), BmI,BMI,BmC,server); S=seq2template_seq (S,buf, F); return S; } else if ( strm ( template_list, "TM")) { /*predict transmembrane structure*/ check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#tm_template@arch#%s/%s@psv#%s/%s@type#_T_",get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv"); S=seq2template_seq (S,buf, F); return S; } else if ( strm ( template_list, "PSITM")) { /*predict transmembrane structure*/ check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE); check_blast_is_installed(server); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psitm_template@arch#%s/%s@psv#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_T_",get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv",get_cache_dir(), BmI,BMI,BmC,server); S=seq2template_seq (S,buf, F); return S; } else if (strm ( template_list, "PSIBLAST")) { check_blast_is_installed(server); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psiprofile_template@database#%s@method#psiblast@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server); S=seq2template_seq (S,buf, F); return S; } else if (strm ( template_list, "BLAST") ) { check_blast_is_installed(server); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#profile_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server); S=seq2template_seq (S,buf, F); return S; } else if ( strm ( template_list, "EXPRESSO") || strm (template_list, "PDB")) { check_blast_is_installed(server); int isRNA = 0; int i; for (i= 0; i < S->len[0]; ++i) { isRNA = (isRNA || is_rna(S->seq[0][i])); } if (isRNA) { sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastn@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_",pdb_db, get_cache_dir(),PmI,PMI,PmC, server); } else { sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_",pdb_db, get_cache_dir(),PmI,PMI,PmC, server); } return seq2template_seq (S,buf, F); } else if ( strm (template_list, "RCOFFEE") || strm (template_list, "RNA")) { char *file_struc_clac = vtmpnam (NULL); FILE* struc_calc_f =vfopen(file_struc_clac,"w"); int i; int j = 0; for (i = 0; i< S->nseq; ++i) { if (S->T[i]->P) { ++j; fprintf(struc_calc_f,"%s %s\n",S->name[i],S->T[i]->P->template_file); } } vfclose(struc_calc_f); check_program_is_installed (RNAPLFOLD_4_TCOFFEE,NULL, NULL,RNAPLFOLD_ADDRESS, IS_FATAL); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#RNA_template@type#_F_"); if (j > 0) { S = seq2template_seq (S,buf,F); sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#calc_rna_template@pdbfile#%s@cache#%s@type#_F_", file_struc_clac,get_cache_dir()); } // printf("IN T_\n"); return seq2template_seq (S,buf,F); } /*2: Templates from seqnames (SELF) or named like the sequences (SEQFILE)*/ else if ( strstr (template_list, "SELF_") ||strstr (template_list, "SEQFILE_") ) { int a; char *p; //add template for (a=0; a< S->nseq; a++) { if ( (p=strstr (template_list,"SELF_")))p=S->name[a]; else if ( strstr (template_list, "SEQFILE_"))p=template_list; else { fprintf ( stderr, "\nUnkown mode for Template [FATAL:%s]\n", PROGRAM); myexit (EXIT_FAILURE); } if ( strstr (template_list, "_P_") && !(S->T[a])->P) { (S->T[a])->P =fill_P_template ( S->name[a], p,S);//PDB } else if ( strstr (template_list, "_S_") && !(S->T[a])->S)(S->T[a])->S =fill_S_template ( S->name[a], p,S);//Sequence else if ( strstr (template_list, "_R_" )&& !(S->T[a])->R)(S->T[a])->R =fill_R_template ( S->name[a], p,S);//pRofile else if ( strstr (template_list, "_G_" )&& !(S->T[a])->G)(S->T[a])->G =fill_G_template ( S->name[a], p,S);//Genomic else if ( strstr (template_list, "_F_" )&& !(S->T[a])->F)(S->T[a])->F =fill_F_template ( S->name[a], p,S);//Fold else if ( strstr (template_list, "_T_" )&& !(S->T[a])->T)(S->T[a])->T =fill_T_template ( S->name[a], p,S);//Trans Membrane else if ( strstr (template_list, "_E_" )&& !(S->T[a])->E)(S->T[a])->E =fill_E_template ( S->name[a], p,S);//Secondary Structure else if ( strstr (template_list, "_U_" )&& !(S->T[a])->U)(S->T[a])->U =fill_U_template ( S->name[a], p,S);//unicode, list template } return S; } /*2: Templates comes in a template_file*/ else if ( template_list==NULL || format_is_fasta (template_list)) { Sequence *T; int a, i; int ntemp=0; T=(template_list!=NULL)?get_fasta_sequence (template_list, NULL):S; for (a=0; a< T->nseq; a++) { char *p; if ((i=name_is_in_list(T->name[a], S->name, S->nseq, MAXNAMES))!=-1) { if ( (p=strstr (T->seq_comment[a], " _P_ ")) && !(S->T[i])->P &&( (S->T[i])->P=fill_P_template (S->name[i],p,S))) { ntemp++; } else if ( (p=strstr (T->seq_comment[a], " _F_ ")) && !(S->T[i])->F &&( (S->T[i])->F=fill_F_template (S->name[i],p,S)))ntemp++; else if ( (p=strstr (T->seq_comment[a], " _S_ ")) && !(S->T[i])->S &&( (S->T[i])->S=fill_S_template (S->name[i],p,S)))ntemp++; else if ( (p=strstr (T->seq_comment[a], " _R_ ")) && !(S->T[i])->R &&( (S->T[i])->R=fill_R_template (S->name[i],p,S)))ntemp++; else if ( (p=strstr (T->seq_comment[a], " _G_ ")) && !(S->T[i])->G &&( (S->T[i])->G=fill_G_template (S->name[i],p,S)))ntemp++; else if ( (p=strstr (T->seq_comment[a], " _T_ ")) && !(S->T[i])->T &&( (S->T[i])->T=fill_T_template (S->name[i],p,S)))ntemp++; else if ( (p=strstr (T->seq_comment[a], " _E_ ")) && !(S->T[i])->E &&( (S->T[i])->E=fill_E_template (S->name[i],p,S)))ntemp++; else if ( (p=strstr (T->seq_comment[a], " _U_ ")) && !(S->T[i])->U &&( (S->T[i])->E=fill_U_template (S->name[i],p,S)))ntemp++; if (T!=S)strcat (S->seq_comment[i], T->seq_comment[a]); } } if (T!=S)free_sequence (T, -1); if ( remove_template_file==2 || ntemp==0) { vremove (template_list); } else if (template_list)display_output_filename ( stdout, "Template_List","fasta_seq", template_list, STORE); return S; } /*3 Templates are generated with a script*/ else if (strstr (template_list, "SCRIPT_") && get_string_variable ("multi_core") && strstr (get_string_variable ("multi_core"), "templates") && get_nproc()>1) { char *tmp1,*command; Alignment *A; char **temp_file,**seq_file; int * pid_list, pid, npid, submited; int nproc, max_nproc; char outfile[1000]; static char *script; static int ntemp; char *p; int z, i; if (!script)script=vcalloc ( 1000, sizeof(char)); ntemp++; command=vcalloc ( 1000, sizeof (char)); tmp1=vtmpnam (NULL); A=seq2aln (S,NULL, 0); string_array_upper(A->seq_al, A->nseq); output_fasta_seq (tmp1, A); sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_")); if ((p=strstr (template_list, "@type#"))) p+=strlen ("@type#"); if (F) { sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp); } else { F=parse_fname (S->file[0]); sprintf (outfile, "%s%s_%s%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp); free_fname (F); } nproc=get_nproc(); max_nproc=2*nproc; script=substitute(script, "@", " -"); script=substitute(script, "#", "="); temp_file=vcalloc ( A->nseq, sizeof (char*)); seq_file =vcalloc (A->nseq, sizeof (char*)); pid_list =vcalloc (MAX_N_PID, sizeof (int *)); fprintf ( stderr, "\n\t------ Fetch %Templates [Multi Core Mode %d CPUs]\n",get_nproc()); for (npid=0, submited=0,i=0; inseq; i++) { FILE *fp2; seq_file[i]=vtmpnam (NULL); temp_file[i]=vtmpnam (NULL); fp2=vfopen (seq_file[i], "w"); fprintf ( fp2, ">%s\n%s\n", S->name[i], S->seq[i]); vfclose (fp2); pid=vfork(); if (pid==0) { initiate_vtmpnam (NULL); if ( strstr (script, "tc_generic_method")) { //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script,seq_file[i],temp_file[i],get_tmp_4_tcoffee()); sprintf ( command, "%s -infile=%s -outfile=%s -tmpdir=%s",script,seq_file[i],temp_file[i],get_tmp_4_tcoffee()); if (strstr (command, "EBI"))get_email (); } else //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script,seq_file[i],temp_file[i]); sprintf ( command, "%s -infile=%s -outfile=%s",script,seq_file[i],temp_file[i]); command=substitute(command, "@", " "); my_system ( command); exit (EXIT_SUCCESS); } else { pid_list[pid]=npid; set_pid(pid); npid++; submited++; submited=vwait_npid(submited,max_nproc,nproc); } } submited=vwait_npid(submited,0,0); //Concatenate all the files vremove (outfile); for (i=0; iseq_al, A->nseq); output_fasta_seq (tmp1, A); sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_")); fprintf ( stderr, "\n"); if ((p=strstr (template_list, "@type#"))) p+=strlen ("@type#"); if (F) { sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp); } else { F=parse_fname (S->file[0]); sprintf (outfile, "%s%s_%s%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp); free_fname (F); } script=substitute(script, "@", " -"); script=substitute(script, "#", "="); if ( strstr (script, "tc_generic_method")) { sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script, tmp1,outfile,get_tmp_4_tcoffee()); if (strstr (command, "EBI"))get_email (); } else sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script, tmp1, outfile); vremove (outfile); command=substitute(command, "@", " "); my_system ( command); free_aln (A); if ( check_file_exists (outfile) && format_is_fasta(outfile)) { S=seq2template_seq (S, outfile, F); } else if (strstr (command, "webblast.pl"))return S; else { add_warning (stderr, "\nWARNING: Could not Run %s to find templates[%s]\n",command, PROGRAM); return NULL; } vfree (command); return S; } return S; } char* seq2template_file (Sequence *S, char *file) { Alignment *A; int i; if (file==NULL)file=vtmpnam (NULL); seq2template_file2 (S, file, "w"); for (i=0; inseq; i++) if ( (A=seq2R_template_profile (S, i))) { seq2template_file2 (A->S, file, "a"); } return file; } int seq2template_file2 (Sequence *S, char *file, char *mode) { FILE *fp; int i; char buf1[10000]; char buf2[10000]; struct X_template *X; fp=vfopen ( file, mode); for ( i=0; i< S-> nseq; i++) { buf1[0]=0; if ( (X=(S->T[i])->P)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);} /*if ( (X=(S->T[i])->S)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}*/ if ( (X=(S->T[i])->R)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);} if ( (X=(S->T[i])->G)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);} if (buf1[0])fprintf ( fp, ">%s %s\n", S->name[i], buf1); } vfclose (fp); return EXIT_SUCCESS; } int seq2n_X_template ( Sequence *S, char *type) { int a, n; for (n=0,a=0; a< S->nseq; a++) { if ( strm2 (type, "_P_","_*_") && (S->T[a])->P)n++; if ( strm2 (type, "_F_","_*_") && (S->T[a])->F)n++; if ( strm2 (type, "_S_","_*_") && (S->T[a])->S)n++; if ( strm2 (type, "_R_","_*_") && (S->T[a])->R)n++; if ( strm2 (type, "_G_","_*_") && (S->T[a])->G)n++; } return n; } struct X_template *fill_X_template ( char *name, char *p, char *token) { struct X_template *X; char *k; X=vcalloc (1, sizeof (X_template)); sprintf ( X->seq_name, "%s", name); if ( (k=strstr (p, token)))sscanf (k+strlen(token), "%s",X->template_name); else sprintf (X->template_name, "%s", p); /*Add a Structure HERE*/ sprintf ( X->template_type, "%s", token); if ( strm (token, "_P_"))X->VP=vcalloc (1, sizeof (P_template)); if ( strm (token, "_F_"))X->VF=vcalloc (1, sizeof (F_template)); if ( strm (token, "_S_"))X->VS=vcalloc (1, sizeof (S_template)); if ( strm (token, "_R_"))X->VR=vcalloc (1, sizeof (R_template)); if ( strm (token, "_G_"))X->VG=vcalloc (1, sizeof (G_template)); if ( strm (token, "_T_"))X->VT=vcalloc (1, sizeof (T_template)); if ( strm (token, "_E_"))X->VE=vcalloc (1, sizeof (E_template)); if ( strm (token, "_U_"))X->VU=vcalloc (1, sizeof (U_template)); return X; } struct X_template* free_X_template ( struct X_template *X) { if (X->VP) { vfree (X->VP); } if (X->VF) { vfree (X->VF); } if ( X->VS) { free_sequence ((X->VS)->S, -1); vfree (X->VS); } if ( X->VR) { free_aln ((X->VR)->A); vfree (X->VR); } if ( X->VG) { free_sequence ((X->VG)->S, -1); vfree (X->VG); } vfree (X); return NULL; } FILE * display_sequence_templates (Sequence *S,int i, FILE *io) { io=display_X_template ( (S->T[i])->P, io); io=display_X_template ( (S->T[i])->F, io); io=display_X_template ( (S->T[i])->S, io); io=display_X_template ( (S->T[i])->R, io); io=display_X_template ( (S->T[i])->G, io); io=display_X_template ( (S->T[i])->T, io); io=display_X_template ( (S->T[i])->E, io); return io; } FILE * display_X_template (struct X_template *X, FILE *io) { if ( !X) return io; if ( !strm (X->template_type, "_S_"))fprintf (io, "\n\t%s: Template=%s, File=%s",template_type2type_name (X->template_type), X->template_name,X->template_file); return io; } char *template_type2short_type_name (char *type) { //add_template if (!type)return ""; else if ( strstr (type, "_P_")) return "pdb"; else if ( strstr (type, "_F_")) return "rfold"; else if ( strstr (type, "_S_")) return "seq"; else if ( strstr (type, "_R_")) return "prf"; else if ( strstr (type, "_G_")) return "genome"; else if ( strstr (type, "_E_")) return "ssp"; else if ( strstr (type, "_T_")) return "tmp"; else if ( strstr (type, "_U_")) return "unicode"; else return type; } char *template_type2type_name (char *type) { //add_template if ( strstr (type, "_P_")) return "PDB struc"; else if ( strstr (type, "_F_")) return "RNA Fold"; else if ( strstr (type, "_S_")) return "Sequeence"; else if ( strstr (type, "_R_")) return "Profile"; else if ( strstr (type, "_G_")) return "Genomic"; else if ( strstr (type, "_E_")) return "Protein Secondary Structure"; else if ( strstr (type, "_T_")) return "Protein Trans Membrane Structure "; else if ( strstr (type, "_U_")) return "Unicode and strings"; else return type; } struct X_template *fill_F_template ( char *name,char *p, Sequence *S) { /*Profile template*/ struct X_template *F; F=fill_X_template ( name, p, "_F_"); sprintf (F->template_format , "TCOFFEE_LIBRARY"); if (!F || !check_file_exists (F->template_name)) { fprintf ( stderr, "\nWARNING: Could Not Fill _F_ (Fold) template for sequence |%s|", name); free_X_template (F); return NULL; } else if ( check_file_exists (F->template_name)) { sprintf ( F->template_file, "%s", F->template_name); } return F; } struct X_template *fill_P_template ( char *name,char *p, Sequence *S) { struct X_template *P; Sequence *PS; Alignment *A; int sim, cov, i; char *buf; P=fill_X_template ( name, p, "_P_"); sprintf (P->template_format , "pdb"); if (!P) { //fprintf ( stderr, "\nWARNING: Could Not Fill _P_ template for sequence |%s|", name); free_X_template (P); return NULL; } else if ( check_file_exists (P->template_name)) { sprintf ( P->template_file, "%s", P->template_name); buf=path2filename (P->template_name); if (P->template_name!=buf) { sprintf ( P->template_name, "%s",buf ); vfree (buf); } } else { char *st; st=is_pdb_struc (P->template_name); if (st) { if (st!=P->template_file)sprintf ( P->template_file, "%s", st); } } /*Make a first run to fix relaxed PDB files*/ buf=fix_pdb_file (P->template_file); if ( buf!=P->template_file) { sprintf ( P->template_file, "%s",buf); vfree (buf); } /*Check the PDB FILE EXISTS*/ if (!is_pdb_file (P->template_file)) { add_warning(stderr, "\nWARNING: _P_ Template |%s| Could Not Be Found\n",p); free_X_template (P); return NULL; } else { buf= get_pdb_id (P->template_file); if (buf!=(P->VP)->pdb_id) { sprintf ((P->VP)->pdb_id, "%s", buf); vfree (buf); } } /*Check the target sequence is similar enough*/ PS=get_pdb_sequence (P->template_file); if ( PS==NULL) { add_warning( stderr, "\nWARNING: _P_ Template |%s| Could Not be Used for Sequence |%s|: Structure Not Found", P->template_name, name); free_X_template (P);P=NULL; } else { int minsim=get_int_variable ("pdb_min_sim"); int mincov=get_int_variable ("pdb_min_cov"); i=name_is_in_list (name, S->name, S->nseq, 100); A=align_two_sequences (S->seq[i], PS->seq[0],"idmat",-3,0, "fasta_pair_wise"); cov=aln2coverage (A, 0); sim=aln2sim (A, "idmat"); if (simtemplate_name,name, sim, minsim); free_X_template (P); P=NULL; } else if ( covtemplate_name,name, cov, mincov); free_X_template (P);P=NULL; } free_aln(A); free_sequence (PS, -1); } return P; } struct X_template *fill_S_template ( char *name,char *p, Sequence *Seq) { struct X_template *S; S=fill_X_template ( name, p, "_S_"); if ( strm (name, p))sprintf ( S->template_file, "%s",output_fasta_seqX (NULL,"w",Seq,NULL, seq_name2index (name, Seq))); (S->VS)->S=get_fasta_sequence (S->template_file, NULL); return S; } struct X_template *fill_R_template ( char *name,char *p, Sequence *S) { /*Profile template*/ struct X_template *R; R=fill_X_template ( name, p, "_R_"); sprintf (R->template_format , "fasta_aln"); if (!is_aln(R->template_name) && !is_seq (R->template_name)) { add_warning ( stderr, "\nWARNING: _R_ Template %s Could Not Be Found\n",R->template_name); free_X_template (R); return NULL; } else { int s; Sequence *S1; Alignment *A1; (R->VR)->A=main_read_aln (R->template_name, NULL); if ( !S) sprintf ( R->template_file, "%s", R->template_name); else { s=name_is_in_list(name, S->name, S->nseq, 100); if ( s!=-1) { S1=fill_sequence_struc (1, &S->seq[s], &S->name[s]); A1=seq2aln (S1,NULL, RM_GAP); (R->VR)->A=trim_aln_with_seq (A1, (R->VR)->A); sprintf ( R->template_file, "%s", vtmpnam (NULL)); output_clustal_aln (R->template_file, (R->VR)->A); } else sprintf ( R->template_file, "%s", R->template_name); } (R->VR)->A=aln2profile ((R->VR)->A); } return R; } struct X_template *fill_T_template ( char *name,char *p, Sequence *S) { /*Profile template*/ struct X_template *T; T=fill_X_template ( name, p, "_T_"); sprintf (T->template_format , "fasta_seq"); if (!is_aln(T->template_name) && !is_seq (T->template_name)) { add_warning ( stderr, "\nWARNING: _T_ Template %s Could Not Be Found\n",T->template_name); free_X_template (T); return NULL; } else { (T->VT)->S=main_read_seq(T->template_name); sprintf ( T->template_file, "%s", T->template_name); } return T; } //add template struct X_template *fill_U_template ( char *name,char *p, Sequence *S) { /*Profile template*/ struct X_template *U; U=fill_X_template ( name, p, "_U_"); sprintf (U->template_format , "string list"); if (!check_file_exists(U->template_name)) { add_warning ( stderr, "\nWARNING: _U_ Template %s Could Not Be Found\n",U->template_name); free_X_template (U); return NULL; } else { //(U->VU)->list=file2string(U->template_name); sprintf ( U->template_file, "%s", U->template_name); } return U; } struct X_template *fill_E_template ( char *name,char *p, Sequence *S) { /*Profile template*/ struct X_template *E; E=fill_X_template ( name, p, "_E_"); sprintf (E->template_format , "fasta_seq"); if (!is_aln(E->template_name) && !is_seq (E->template_name)) { add_warning ( stderr, "\nWARNING: _E_ Template %s Could Not Be Found\n",E->template_name); free_X_template (E); return NULL; } else { (E->VE)->S=main_read_seq (E->template_name); sprintf ( E->template_file, "%s", E->template_name); } return E; } struct X_template *fill_G_template ( char *name,char *p, Sequence *S) { struct X_template *G; G=fill_X_template ( name, p, "_G_"); sprintf (G->template_format , "fasta_seq"); /*1: Get the sequence from another file if needed*/ if ( strm (name, p))sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",S,NULL, seq_name2index (name, S))); else if ( strstr (p, "SEQFILE_")) { Sequence *ST; int i2; ST=main_read_seq (after_strstr ( p,"SEQFILE_G_")); i2=seq_name2index (name, ST); if ( i2!=-1) { sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",ST,NULL, i2)); sprintf ( G->template_name, "%s", name); } free_sequence (ST, -1); } else sprintf (G->template_file, "%s", G->template_name); /*2: Put the template in VG->S*/ if (!is_seq (G->template_file)) { add_warning ( stderr, "\nWARNING: _G_ Template %s Could Not Be Found \n",p); free_X_template (G); return NULL; } else { (G->VG)->S=get_fasta_sequence (G->template_file, NULL); } return G; } char *seq2T_value ( Sequence *S, int n, char *value, char *type) { static char *rv_buf; X_template *X; if ( !rv_buf)rv_buf=vcalloc (100, sizeof(char)); if (!(X=seq_has_template (S, n, type)))return NULL; else { if (strm (value, "template_file"))return X->template_file; else if ( strm (value, "template_name"))return X->template_name; else if ( strm (value, "seq_name"))return X->seq_name; else if (strm (type, "_P_")) { if ( strm (value, "pdb_id"))return (X->VP)->pdb_id; } else if ( strm (type, "_R_")) { if ( strm (value, "A")) { if ((X->VR)->A){sprintf ( rv_buf, "%d", (int)(X->VR)->A);return rv_buf;} else return NULL; } } } return NULL; } char *seq2P_pdb_id (Sequence *S, int n) { if (!S->T || !S->T[n] || !(S->T[n])->P ) return NULL; else return ((S->T[n])->P)->template_name; } char *seq2P_template_file(Sequence *S, int n) { return seq2T_value (S, n, "template_file", "_P_"); } char *profile2P_template_file (Sequence *S, int n) { Alignment *A; int a; char *p; if ( !(A=seq2R_template_profile (S, n)))return NULL; for (a=0; anseq; a++) { if ((p=seq2P_template_file (A->S, a))!=NULL)return p; } return NULL; } Alignment * seq2R_template_profile (Sequence *S, int n) { return (Alignment *)atop(seq2T_value (S, n, "A", "_R_")); } char * seq2E_template_string (Sequence *S, int n) { struct X_template *T; if ( (T=seq_has_template (S, n, "_E_"))!=NULL) return ((T->VE)->S)->seq[0]; else return NULL; } //add template int* seq2U_template (Sequence *S, int n) { struct X_template *T; if ( (T=seq_has_template (S, n, "_U_"))!=NULL) return (T->VU)->list; else return NULL; } char * seq2T_template_string (Sequence *S, int n) { struct X_template *T; if ( (T=seq_has_template (S, n, "_T_"))!=NULL) return ((T->VT)->S)->seq[0]; else return NULL; } struct X_template* seq_has_template ( Sequence *S, int n, char *mode) { Template *T; if ( !S || !mode) return NULL; else if ( n<0 || n>=S->nseq)return NULL; else if ( !(S->T)) return NULL; else if ( !(S->T[n]))return NULL; T=S->T[n]; //ADD STRUCTURE //add template if ( strm (mode, "_P_"))return T->P; else if ( strm (mode, "_F_"))return T->F; else if ( strm (mode, "_S_"))return T->S; else if ( strm (mode, "_R_"))return T->R; else if ( strm (mode, "_T_"))return T->T; else if ( strm (mode, "_E_"))return T->E; else if ( strm (mode, "_U_"))return T->U; else if ( strm (mode, "_G_"))return T->G; else return NULL; } char ** name2random_subset (char **in_name, int n_in, int n_out) { char **out_name; int **list; int a,max; vsrand (0); max=n_in*10000; out_name=declare_char (n_out,MAXNAMES+1 ); list=declare_int (n_in, 2); for (a=0; aname, A->nseq, A->nseq); A=reorder_aln (A, name_list, A->nseq); free_char (name_list, -1); return A; } Alignment *aln2jacknife (Alignment *A, int nseq, int len) { int a, b; if (nseq!=0 && nseqnseq) { char **name; name=name2random_subset (A->name, A->nseq, nseq); A=reorder_aln (A, name, nseq); free_char (name, -1); } if (len!=0 && lenlen_aln) { int **l; Alignment *B; l=declare_int (A->len_aln, 2); for (a=0; a< A->len_aln; a++) { l[a][0]=a; l[a][1]=rand()%(A->len_aln*1000); } sort_int ( l,2, 1, 0, A->len_aln-1); B=copy_aln (A, NULL); for ( a=0; a< len; a++) { for ( b=0; bnseq; b++) { A->seq_al[b][a]=B->seq_al[b][l[a][0]]; } } for (b=0; bnseq; b++)A->seq_al[b][len]='\0'; free_aln (B); free_int (l, -1); } return A; } Alignment * aln2scramble_seq (Alignment *A) { int **list; char **name_list; int a,max; max=100*A->nseq; vsrand (0); list=declare_int (A->nseq, 2); name_list=vcalloc (A->nseq, sizeof (char*)); for (a=0; anseq; a++) { list[a][0]=a; list[a][1]=rand ()%max; } sort_int ( list,2, 1, 0, A->nseq-1); for ( a=0; a< A->nseq; a++) name_list[a]=A->seq_al[a]; for (a=0; anseq; a++) { A->seq_al[a]=name_list[list[a][0]]; } vfree (name_list); free_int (list, -1); return aln2random_order (A); } Alignment * reorder_aln ( Alignment *A, char **name, int nseq) { int a,sn; Alignment *BUF; int n=0; int *tpp_int; if ( name==NULL)return aln2random_order(A); BUF=copy_aln ( A,NULL); for ( a=0; aname, A->nseq,STRING); if ( sn==-1) { ; } else { SWAPP(A->order[n], BUF->order[sn], tpp_int); sprintf ( A->name[n], "%s", BUF->name[sn]); sprintf ( A->seq_al[n], "%s",BUF->seq_al[sn]); sprintf ( A->seq_comment[n], "%s", BUF->seq_comment[sn]); n++; } } for ( a=n; a< A->nseq; a++)A->name[a][0]=A->seq_al[a][0]='\0'; A->nseq=n; if ( A->A)A->A=reorder_aln(A->A, name, nseq); free_aln (BUF); return A; } Sequence * reorder_seq_2 ( Sequence *A, int **order,int field, int nseq) { char **name; int a; if (!A || !order) return A; name=declare_char (A->nseq, 100); for (a=0; aname[order[a][field]]); A=reorder_seq (A, name,nseq); free_char (name, -1); return A; } Sequence * reorder_seq ( Sequence *A, char **name, int nseq) { int a,sn; Sequence *nA; nA=duplicate_sequence (A); for ( a=0; a< nseq; a++) { sn=name_is_in_list (name[a] ,nA->name, nA->nseq, 100); if (sn==-1)continue; if ( nA->file) sprintf ( A->file[a], "%s", nA->file[sn]); if ( nA->seq_comment)sprintf ( A->seq_comment[a], "%s", nA->seq_comment[sn]); if ( nA->aln_comment)sprintf ( A->aln_comment[a], "%s", nA->aln_comment[sn]); sprintf ( A->seq[a], "%s", nA->seq[sn]); A->len[a]=nA->len[sn]; sprintf ( A->name[a], "%s", nA->name[sn]); A->T[a][0]=nA->T[sn][0]; } A->nseq=nseq; free_sequence (nA, nA->nseq); return A; } char * concatenate_seq ( Sequence *S, char *conc, int *order) { int a; vfree (conc); conc=vcalloc ( S->nseq*S->max_len, sizeof (char)); for ( a=0; a< S->nseq; a++) { conc=strcat ( conc, S->seq[order[a]]); } return conc; } Alignment * rotate_aln ( Alignment *A, char *name) { Alignment *B; int a, b; B=declare_aln2 (A->len_aln, A->nseq+1); for ( a=0; a< A->nseq; a++) for ( b=0; b< A->len_aln; b++) { B->seq_al[b][a]=A->seq_al[a][b]; } for (a=0; a< A->len_aln; a++) if (name && name[0])sprintf ( B->name[a], "%s_%s%d", name, (a<9)?"0":"",a+1); else sprintf ( B->name[a], "%d", a+1); for (a=0; a< A->len_aln; a++)B->seq_al[a][A->nseq]='\0'; B->len_aln=A->nseq; B->nseq=A->len_aln; /*free_aln (A);*/ return B; } Alignment * invert_aln ( Alignment *A) { char *buf; int l, a, b, c; for ( a=0; a< A->nseq; a++) { l=strlen ( A->seq_al[a]); buf=vcalloc ( l+1,sizeof (char) ); for ( c=l-1,b=0; b< l; b++, c--) { buf[c]=A->seq_al[a][b]; } buf[l]='\0'; sprintf ( A->seq_al[a], "%s", buf); } vfree(buf); return A; } char * complement_string (char *s) { char *buf; int l, a, b, c; l=strlen (s); for ( b=0; b< l; b++) { char r; r=s[b]; if ( r=='a')r='t'; else if (r=='A')r='T'; else if (r=='t')r='a'; else if (r=='T')r='A'; else if (r=='g')r='c'; else if (r=='G')r='C'; else if (r=='c')r='g'; else if (r=='C')r='G'; s[b]=r; } return invert_string (s); } Alignment * complement_aln ( Alignment *A) { char *buf; int l, a, b, c; for ( a=0; a< A->nseq; a++) { A->seq_al[a]=complement_string (A->seq_al[a]); } return A; } Alignment * extract_nol_local_aln(Alignment *A, int start, int max_end) { A=extract_aln ( A, start, max_end); A=trunkate_local_aln (A); return A; } Alignment * alnpos_list2block (Alignment *A, int n, char **in_list) { int *pos; int a; char **list; int list_declared=0; Alignment *B; if (check_file_exists (in_list[0])) { int mn; char ***tmp_list; mn=count_n_line_in_file (in_list[0]); list=declare_char (mn, 100); list_declared=1; tmp_list=file2list (in_list[0], " "); a=0; n=0; while (tmp_list[a]) { if (tmp_list[a][1][0]!='!') { sprintf (list[n++], "%s", tmp_list[a][1]); } a++; } free_arrayN ((void **)tmp_list, 3); } else { list=in_list; } pos=vcalloc (A->len_aln, sizeof (int)); for (a=0; a=end || end>A->len_aln+1) { add_warning ( stderr, "\nWARNING: Illegal coordinates in extract_pos_list [%s]", list[a]); return A; } start--; end--; for (a=start; aA->len_aln) { add_warning ( stderr, "\nWARNING: Illegal coordinates in extract_pos_list [%s]", list[a]); } p--; pos[p]=1; } } B=alnpos2block(A, pos, NULL); vfree (pos); if ( list_declared)free_char (list, -1); return B; } Alignment * aln2block (Alignment *A, int start, int end, Alignment *B) { if ( !A || start<=0 || start>=end || end>A->len_aln+1) { add_warning ( stderr, "\nWARNING: Illegal coordinates in extract_block start=%d end=%d len=%d [Note : [start-end[, with [1...n]", start, end, A->len_aln); return A; } else { int *pos, p; start--; end--; pos=vcalloc (A->len_aln, sizeof (int)); for (p=start;plen_aln=0; for (a=0; a<=A->len_aln; a++) { if ( pos[a]!=0 || a==A->len_aln) { for ( b=0; bnseq; b++) B->seq_al[b][B->len_aln]=A->seq_al[b][a]; if ( a!=A->len_aln)B->len_aln++; } } return B; } Alignment * extract_aln ( Alignment *A, int start, int end) { return extract_aln2 ( A, start, end, "cons"); } Alignment * extract_aln2 ( Alignment *A, int in_start, int in_end, char *seq) { char *tmp; FILE *fp; tmp=vtmpnam (NULL); fp=vfopen (tmp, "w"); fprintf ( fp, "%s %d %d\n", seq, in_start, in_end); vfclose (fp); return extract_aln3 (A,tmp); } Alignment * extract_aln3 ( Alignment *B, char *file) { int a, b, c; int start, end; int n, i, s, nline=0; FILE *fp; Alignment *A=NULL; int *col; char name[MAXNAMES]; char line[VERY_LONG_STRING]; int *offset; /*Reads in a file #comment ! seq_name offset seqname pos OR seqname start end[ modifies the incoming alignment */ offset=vcalloc ( B->nseq+1, sizeof (int)); fp=vfopen (file,"r"); while ( (c=fgetc(fp))!=EOF) { s=-1; fgets ( line, VERY_LONG_STRING,fp); if ( c=='!') { sscanf (line, "%s %d", name, &start); s=name_is_in_list (name,B->name,B->nseq,MAXNAMES); } if (s!=-1) offset[s]=start; } vfclose (fp); A=copy_aln (B, A); col=vcalloc ( A->len_aln, sizeof (int)); fp=vfopen ( file, "r"); while ( (c=fgetc(fp))!=EOF) { nline++; if ( c=='#' || c=='!')fgets ( line, VERY_LONG_STRING,fp); else { ungetc(c, fp); fgets ( line, VERY_LONG_STRING,fp); if (sscanf (line, "%s %d %d", name, &start, &end)==3); else if (sscanf (line, "%s %d", name, &start)==2) { end=start+1; } else { add_warning ( stderr, "\nWARNING: wrong format in coordinate file (line=%d)\n", nline); continue; } if ( end==0)end=A->len_aln+1; s=name_is_in_list (name,A->name,A->nseq,MAXNAMES); if ( s==-1 && !strm (name, "cons")) { add_warning ( stderr, "\nWARNING: Seq %s does not belong to the alignment (line %d)\n", name,nline); continue; } else if ( start>end) { add_warning ( stderr, "\nWARNING: Illegal coordinates [%s %d %d] (line %d)\n", name,start, end,nline); continue; } else { int done=0; if ( s!=-1) { start-=offset[s]-1; end-=offset[s]-1; } for (n=0, a=0; done!=1 && a< A->len_aln; a++) { i=(strm (name, "cons"))?1:!is_gap(A->seq_al[s][a]); n+=i; if (n>=start && n=end)done=1; //if (n>=start && n=end)a=A->len_aln; } if ( done==0) { HERE ("Warning Missing positions in File %s",file ); } } } } vfclose ( fp); /*Extract [start-end[*/ for ( b=0,a=0; a< A->len_aln; a++) { if ( col[a]) { for (c=0; c< A->nseq; c++)A->seq_al[c][b]=A->seq_al[c][a]; b++; } } A->len_aln=b; for (c=0; c< A->nseq; c++)A->seq_al[c][A->len_aln]='\0'; vfree (col); return A; } Alignment * trunkate_local_aln ( Alignment *A) { int a, b; int **pos; int **cache; int seq; cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),1)+A->len_aln+1); pos=aln2pos_simple(A,A->nseq); for ( b=0; blen_aln; b++) for ( a=0; a< A->nseq; a++) { seq=A->order[a][0]; if ( pos[a][b]<=0); else if ( pos[a][b]>0) { if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++; else if ( cache[seq][pos[a][b]]>=1) { cache[seq][pos[a][b]]++; A->seq_al[a][b]='\0'; } } } A->len_aln=get_shortest_string ( A->seq_al, A->nseq, NULL, NULL); pad_string_array ( A->seq_al, A->nseq, A->len_aln, '-'); free_int (pos, -1); free_int ( cache,-1); return A; } int get_nol_aln_border ( Alignment *A, int start, int direction) { int a, b; int **pos; int **cache; int seq,end; /*This Function Returns the limit position for a non overlaping alignment*/ cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int)),1)+A->len_aln+1); pos=aln2pos_simple(A,A->nseq); end=(direction==GO_RIGHT)?A->len_aln:-1; for ( b=start; b!=end;b+=direction) for ( a=0; a< A->nseq; a++) { seq=A->order[a][0]; if ( pos[a][b]<=0); else if ( pos[a][b]>0) { if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++; else if ( cache[seq][pos[a][b]]>=1) { cache[seq][pos[a][b]]++; free_int(cache, -1); return b-direction; } } } free_int ( cache,-1); free_int (pos, -1); return end-direction; } char * extract_defined_seq ( char *in, int in_of, int in_start, int *aa_def, int dir, int *out_start, char *out) { int start=0, end,l; int b, c, d; if ( dir==GO_LEFT){start=in_start-1;} else if ( dir==GO_RIGHT){start=in_start+1;} end=start; while (aa_def[end]!=UNDEFINED) { end+=dir; } end-=dir; if (end=start && c<=end) { if ( out_start[0]==-1)out_start[0]=c-!is_gap(in[b]); out[d++]=in[b]; } } out[d]='\0'; return out; } Alignment * concatenate_aln ( Alignment *A1, Alignment *A2, char *spacer) { Alignment *A; int a, i; A=declare_aln2( A1->nseq+A2->nseq , A1->len_aln+A2->len_aln+1); for ( a=0; a< A1->nseq; a++) { if ((i=name_is_in_list ( A1->name[a], A2->name, A2->nseq, 100))!=-1) { sprintf ( A->name[A->nseq], "%s", A1->name[a]); sprintf (A->seq_al[A->nseq], "%s%s%s", A1->seq_al[a],(spacer)?spacer:"", A2->seq_al[i]); A->nseq++; } else { char *buf; buf=generate_string (A2->len_aln, '-'); sprintf ( A->name[A->nseq], "%s", A1->name[a]); sprintf (A->seq_al[A->nseq], "%s%s", A1->seq_al[a], buf); A->nseq++; vfree (buf); } } for ( a=0; a< A2->nseq; a++) { if ((i=name_is_in_list ( A2->name[a], A1->name, A1->nseq, 100))==-1) { char *buf; buf=generate_string (A1->len_aln, '-'); sprintf ( A->name[A->nseq], "%s", A2->name[a]); sprintf (A->seq_al[A->nseq], "%s%s", buf, A2->seq_al[a]); A->nseq++; vfree (buf); } } A->len_aln=A1->len_aln+A2->len_aln; return A; } Alignment * aln_cat ( Alignment *A, Alignment *B) { int a; if ( A->nseq!=B->nseq) { fprintf ( stderr, "\nERROR IN ALN CAT: DIFFERENT NSEQ\n"); myexit(EXIT_FAILURE); } A=realloc_alignment2(A, A->nseq,A->len_aln+B->len_aln+1); for ( a=0;a< A->nseq; a++) { strcat ( A->seq_al[a], B->seq_al[a]); } A->len_aln+=B->len_aln; return A; } int verify_aln ( Alignment *A, Sequence *S, char *message) { int a, b, c,s,r; for ( a=0;a< A->nseq; a++) { s=A->order[a][0]; r=A->order[a][1]; for ( b=0, c=0; b< A->len_aln; b++) { if ( !is_gap(A->seq_al[a][b])) { if (tolower(A->seq_al[a][b])!=tolower(S->seq[s][c+r])) { fprintf ( stderr, "\n%s\nResidue [%c %d, %c %d] line %d seq %d",message,A->seq_al[a][b], b,S->seq[s][c+r], c+r,a,s); output_Alignment_with_res_number(A, stderr); myexit(EXIT_FAILURE); return 0; } c++; } } } return 1; } Alignment *adjust_est_aln ( Alignment *PW, Alignment *M, int s) { /*This function reajusts M, threading M onto PW two seqences in PW s+1 seq in M seq 0 PW ----> 0->s-1 in M seq 1 PW ----> 1->s in M; */ int a, b; static char **array; int top_M=0; int bottom_M=0; if ( array==NULL) { array=declare_char (500, 100000); } for ( a=0; a< PW->len_aln; a++) { if ( is_gap(PW->seq_al[0][a])) { for ( b=0; b< s; b++) array[b][a]='-'; } else { for ( b=0; b< s; b++) array[b][a]=M->seq_al[b][top_M]; top_M++; } if ( is_gap(PW->seq_al[1][a])) { array[s][a]='-'; } else { array[s][a]=M->seq_al[s][bottom_M]; bottom_M++; } } M->len_aln=PW->len_aln; for (a=0; alen_aln; b++) M->seq_al[a][b]=array[a][b]; M->seq_al[a][b]='\0'; } M->nseq=s+1; return M; } Alignment * rename_seq_in_aln (Alignment *A, char ***list) { int n, i; if ( !A)return A; n=0; while ( list[n][0][0]) { if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1) { sprintf ( A->name[i], "%s", list[n][1]); } n++; } A->S=rename_seq_in_seq (A->S, list); return A; } Sequence * rename_seq_in_seq (Sequence *A, char ***list) { int n, i; if ( !A || !list)return A; n=0; while ( list[n][0][0]) { if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1) { sprintf ( A->name[i], "%s", list[n][1]); } n++; } return A; } /********************************************************************/ /* */ /* FLOAT SIMILARITIES */ /* */ /* */ /* */ /********************************************************************/ float get_seq_fsim ( char *string1, char *string2, char *ignore, char *similarity_set,int **matrix, int MODE ) { int len, a, r1, r2, nr1=0, nr2=0; float pos=0, sim=0; len=MIN((strlen (string1)),(strlen (string2))); if ( len==0)return 0; for ( a=0; a< len; a++) { r1=string1[a]; r2=string2[a]; nr1+=!is_gap(r1); nr2+=!is_gap(r2); if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore)) { pos++; if ( matrix)sim+=matrix[r1-'A'][r2-'A']; else if (is_in_same_group_aa(r1,r2,0, NULL,similarity_set)) { sim++; } } } if ( MODE==UNGAPED_POSITIONS)return ( sim*100)/pos; else if ( MODE==ALIGNED_POSITIONS)return (sim*100)/len; else if ( MODE==AVERAGE_POSITIONS)return (sim*200)/(nr1+nr2); else { return 0; } } float get_seq_fsim2 ( char *string1, char *string2, char *ignore, char *in_mode) { int len1; int a; int p1, p2; int r1=0,r2=0; char *p; char mode[1000]; float r=0, pos1, pos2, pos0, gap, sim; sprintf ( mode, "%s", in_mode); /*mode: __ mat: idscore to get the alignment done any legal cw matrix sim_mode: sim1->identities/matches sim2->identities/min len */ if ( (p=strstr (mode, "_"))!=NULL) { p[0]='\0'; p++; } if (strstr (mode, "idscore")) { static int **mat; if (!mat) mat=read_matrice ("blosum62mt"); return idscore_pairseq (string1, string2, -12, -1, mat,mode); } len1=strlen (string1); for ( sim=pos1=pos2=pos0=gap=0,a=0; a< len1; a++) { r1=string1[a]; r2=string2[a]; p1=1-is_in_set (r1, ignore); p2=1-is_in_set (r2, ignore); pos1+=p1; pos2+=p2; if (p1 && p2) { pos0++; if (is_in_same_group_aa(r1,r2,0, NULL, mode)) { sim++; } } else if (p1+p2==1) { gap++; } } if ( p==NULL || strm (p, "sim1") || strm (p, "sim")) { r=(pos0==0)?0:(sim*MAXID)/pos0; } else if ( strm (p, "sim2")) { r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2); } else if ( strm (p, "sim3")) { r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2); } else if ( strm (p, "gap1")) { r=(len1==0)?MAXID:(gap*MAXID)/len1; r=MAXID-r; } else if ( strm (p, "logid")) { r=logid_score (pos0, sim); } return r; } /********************************************************************/ /* */ /* ALIGNMENT ANALYSES */ /* */ /* */ /* */ /********************************************************************/ int **dist_array2sim_array ( int **p, int max) { int s1, s2, a, b; s1=read_array_size ((void *)p, sizeof (void *)); s2=read_array_size ((void*)p[0],sizeof (int)); /* s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 BITS*/ for ( a=0; a< s1; a++) for ( b=0; b< s2; b++) { p[a][b]=max-p[a][b]; } return p; } int **sim_array2dist_array ( int **p, int max) { int s1, s2, a, b; s1=read_array_size ((void *)p, sizeof (void *)); s2=read_array_size ((void*)p[0],sizeof (int)); /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/ for ( a=0; a< s1; a++) for ( b=0; b< s2; b++) { p[a][b]=max-(int)p[a][b]; } return p; } int **normalize_array (int **p, int max, int norm) { int s1, s2, a, b; s1=read_array_size ((void *)p, sizeof (void *)); s2=read_array_size ((void*)p[0],sizeof (int)); /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/ for ( a=0; a< s1; a++) for ( b=0; b< s2; b++) { p[a][b]=(p[a][b]*norm)/max; } return p; } int aln2most_similar_sequence ( Alignment *A, char *mode) { int **w; int a, b; int avg, best_avg=0, best_seq=0; char *buf; int coverage; if ( !A) return -1; else if ( A->nseq==1)return 0; else { buf=vcalloc ( A->len_aln+1, sizeof (char)); w=get_sim_aln_array ( A, mode); for ( a=0; a< A->nseq; a++) { sprintf ( buf, "%s", A->seq_al[a]); ungap(buf); coverage=(strlen(buf)*MAXID)/A->len_aln; for ( avg=0,b=0; b< A->nseq; b++)avg+=w[a][b]*coverage; if ( avg>best_avg){best_avg=avg; best_seq=a;} } free_int (w, -1); vfree (buf); return best_seq; } } int aln2coverage ( Alignment *A, int ref_seq) { int a,b; int cov_pos=0, npos=0; for ( a=0; a< A->len_aln; a++) { if ( !is_gap ( A->seq_al[ref_seq][a])) { npos++; for ( b=0; b< A->nseq; b++) { if ( b!=ref_seq && !is_gap ( A->seq_al[b][a])){cov_pos++;break;} } } } return (int) (npos==0)?0:(( MAXID*cov_pos)/A->len_aln); } int sub_aln2sim ( Alignment *A, int *ns, int **ls, char *mode) { int a, b, n; float avg; n=0; avg=0; if (!A || (ns==NULL && A->nseq<2))return -1; else if (ns==NULL) { for (a=0; a< A->nseq-1; a++) for ( b=a+1; b< A->nseq;b++, n++) avg+=generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode); } else { for (a=0; aseq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode); } } return (int)(n==0)?0:((float)avg/(float)n); } int sub_aln2max_sim ( Alignment *A, int *ns, int **ls, char *mode) { int a, b, n; float avg; n=0; avg=0; if (!A || (ns==NULL && A->nseq<2))return -1; else if (ns==NULL) { for (a=0; a< A->nseq-1; a++) for ( b=a+1; b< A->nseq;b++, n++) avg=MAX(avg,generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode)); } else { for (a=0; aseq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode)); } } return avg; } double aln2entropy (Alignment *A, int *in_ls, int in_ns, float gap_threshold) { int ns, a, s, col, r,ncol; int *ls; double *count; double entropy=0; float ng; ls=vcalloc ( A->nseq, sizeof (int)); count=vcalloc ( 26, sizeof (double)); if ( in_ls) { ns=in_ns; for ( a=0; a< ns; a++)ls[a]=in_ls[a]; } else { ns=A->nseq; for ( a=0; a< ns; a++)ls[a]=a; } if ( ns==0) { vfree(ls);vfree(count);return 0; } for (ncol=0,col=0; collen_aln; col++) { for (ng=0,a=0; a< ns; a++) { s=ls[a]; ng+=is_gap(A->seq_al[s][col]); } ng/=ns; if ( ng>gap_threshold)continue; ncol++; for ( a=0; aseq_al[s][col]); if (!is_gap(r))count[r-'a']++; } for (a=0; a<26; a++) { if ( count[a]==0); else { count[a]/=(double)ns; entropy+=count[a]*log(count[a]); count[a]=0; } } } entropy/=-ncol; vfree (ls); vfree(count); return entropy; } int aln2sim ( Alignment *A, char *mode) { return sub_aln2sim ( A, NULL, NULL, mode); /* if ( !A || A->nseq<2) return -1; w=get_sim_aln_array ( A, mode); for (c=0, a=0; a< A->nseq-1; a++) for ( b=a+1; b< A->nseq; b++, c++) { avg+=(float)w[a][b]; } free_int (w, -1); return (int)((float)avg/(float)c); */ } int aln_is_aligned ( Alignment *A) { int a, b; if ( !A)return 0; for (a=0; a< A->nseq; a++) for ( b=A->len_aln-1; b>0; b--) { if (!is_gap(A->seq_al[a][b]) && is_gap(A->seq_al[a][b-1]))return 1; } return 0; } int seq2aln2sim_old ( char *seq1, char *seq2, char *mode_aln, char *mode_id) { Alignment *A; int sim; A=align_two_sequences (seq1, seq2, "pam250mt", -10, -1, mode_aln); sim=aln2sim (A, mode_id); free_aln (A); return sim; } int seq2aln2sim ( char *seq1, char *seq2, char *mode_aln, char *mode_id) { Alignment *A; int sim; static int gop; if (!gop) { int **m; m=read_matrice ("blosum62mt"); gop=get_avg_matrix_mm(m, AA_ALPHABET)*10; free_int (m, -1); } A=align_two_sequences (seq1, seq2, "blosum62mt",gop,-1, mode_aln); sim=aln2sim (A, mode_id); free_aln (A); return sim; } int* get_cdna_seq_winsim ( int *cache, char *string1, char *string2, char *ignore, char *mode,int *w ) { int len1, len2; int a, x; len1=strlen (string1); len2=strlen (string2); if ( len1!=len2) { fatal_exit( stderr,EXIT_FAILURE, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM); } x=get_cdna_seq_sim(cache, string1, string2, ignore, ""); for ( a=0; a< len1; a++) w[a]=x; add_warning (stderr, "\nWARNING: winsim not implemented for cDNA"); return w; } int get_cdna_seq_sim ( int *cache, char *string1, char *string2, char *ignore, char *mode) { int len1; int len2; int a; int pos=0; int sim=0; char r1=0, r2=0; len1=strlen (string1); len2=strlen (string2); if ( len1!=len2) { fprintf ( stderr, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM); crash(""); } for ( a=0; a< len1;) { if ( cache[a]==0){a++;continue;} else if ( cache[a]==1) { r1=translate_dna_codon (string1+a, 'x'); r2=translate_dna_codon (string2+a, 'x'); a+=3; } if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore)) { pos++; if (is_in_same_group_aa(r1,r2,0, NULL,mode+4)) { sim++; } } } if (pos==0) return 0; else return (int) (sim*MAXID)/pos; } int* get_seq_winsim ( char *string1, char *string2, char *ignore, char *mode, int*w) { int len1, len2, len; int left, right; int a,b; int sim=0; int window; int r1, r2; len1=strlen (string1); len2=strlen (string2); window=atoi(mode); len=2*window+1; if ( len1!=len2)return 0; if (window==0 || (window*2+1)>=len1) { sim=get_seq_sim (string1, string2, ignore, ""); for (a=0; a__ mat: idscore to get the alignment done any legal cw matrix sim_mode: sim1->identities/matches sim2->identities/min len */ if ( (p=strstr (mode, "_"))!=NULL) { p[0]='\0'; p++; } if (strstr (mode, "idscore")) { static int **mat; if (!mat) mat=read_matrice ("blosum62mt"); return idscore_pairseq (string1, string2, -12, -1, mat,mode); } len1=strlen (string1); for ( sim=pos1=pos2=pos0=0,a=0; a< len1; a++) { r1=string1[a]; r2=string2[a]; p1=1-is_in_set (r1, ignore); p2=1-is_in_set (r2, ignore); pos1+=p1; pos2+=p2; if (p1 && p2) { pos0++; if (is_in_same_group_aa(r1,r2,0, NULL, mode)) { sim++; } } else if (p1+p2==1) { gap++; } } if ( strstr (mode, "cov")) { r=(pos0+gap==0)?0:(pos0*MAXID)/(pos0+gap); } else if ( p==NULL || strm (p, "sim1") || strm (p, "sim")) { r=(pos0==0)?0:(sim*MAXID)/pos0; } else if ( strm (p, "sim2")) { r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2); } else if ( strm (p, "sim3")) { r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2); } else if ( strm (p, "gap1")) { r=(len1==0)?MAXID:(gap*MAXID)/len1; r=MAXID-r; } else if ( strm (p, "logid")) { r=logid_score (pos0, sim); } else if ( strstr (mode, "sim")) { r=(pos0==0)?0:(sim*MAXID)/pos0; } return r; } int get_seq_sim_2 ( char *string1, char *string2, char *ignore, char **gr, int ng) { int len1; int len2; int a; int pos=0; int sim=0; char r1, r2; len1=strlen (string1); len2=strlen (string2); if ( len1!=len2)return 0; for ( a=0; a< len1; a++) { r1=string1[a]; r2=string2[a]; if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore)) { pos++; if (is_in_same_group_aa(r1,r2,ng, gr, NULL)) { sim++; } } } if (pos==0) return 0; else return (int) (sim*MAXID)/pos; } int get_seq_sim_3 ( char *string1, char *string2, char *ignore, int **mat) { int len1; int len2; int a; int sim=0; char r1, r2; len1=strlen (string1); len2=strlen (string2); if ( len1!=len2)return 0; for ( a=0; a< len1; a++) { r1=string1[a]; r2=string2[a]; if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore)) { sim+=mat[r1-'A'][r2-'A']; } } return sim; } int * get_aln_col_weight ( Alignment *A, char *mode) { int a, b; char *col; int *weight; col=vcalloc ( A->nseq, sizeof (int)); weight=vcalloc (A->len_aln, sizeof (int)); for (a=0; a< A->len_aln; a++) { for ( b=0; b< A->nseq; b++) col[b]=A->seq_al[b][a]; weight[a]=(find_group_aa_distribution (col, A->nseq,0,NULL,NULL, mode )*MAXID)/A->nseq; } vfree (col); return weight; } int analyse_aln_column ( Alignment *B, int col) { char r=' '; int a, b, c=0; static char *mat; static int ng_cw_star; static char **cw_star; int *cw_star_count; static int ng_cw_col; static char **cw_col; int *cw_col_count; static int ng_cw_dot; static char **cw_dot; int *cw_dot_count; if ( !B->S || !(B->S)->type)B= get_aln_type (B); if ( !mat)mat=vcalloc ( STRING, sizeof (char)); if ( !ng_cw_star) { cw_star=make_group_aa ( &ng_cw_star, strcpy ( mat,"idmat")); cw_col=make_group_aa ( &ng_cw_col, strcpy (mat,"clustalw_col")); cw_dot=make_group_aa ( &ng_cw_dot, strcpy (mat, "clustalw_dot")); } cw_star_count=vcalloc (ng_cw_star, sizeof (int)); cw_col_count=vcalloc ( ng_cw_col, sizeof (int)); cw_dot_count=vcalloc (ng_cw_dot, sizeof (int)); for ( a=0; a< B->nseq; a++) { c=tolower (B->seq_al[a][col]); if (is_gap(c)){r=' ';break;} for ( b=0; b< ng_cw_star; b++) cw_star_count[b]+=is_in_set (c, cw_star[b]); for ( b=0; b< ng_cw_col; b++) cw_col_count[b]+=is_in_set (c, cw_col[b]); for ( b=0; b< ng_cw_dot; b++) cw_dot_count[b]+=is_in_set (c, cw_dot[b]); } if ( !is_gap(c) && r==' ') for ( b=0; b< ng_cw_star; b++)if ( cw_star_count[b]==B->nseq){r='*'; break;} if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA"))) for ( b=0; b< ng_cw_col ; b++)if ( cw_col_count [b]==B->nseq){r=':'; break;} if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA"))) for ( b=0; b< ng_cw_dot ; b++)if ( cw_dot_count [b]==B->nseq){r='.'; break;} vfree(cw_star_count); vfree(cw_col_count); vfree(cw_dot_count); return r; } int ** get_cov_aln_array ( Alignment *A, char *mode) { int **w; int a, b, c, t; w=declare_int ( A->nseq, A->nseq); for ( a=0; a< A->nseq-1; a++) { w[a][a]=100; for ( t=0,b=a+1; b< A->nseq; b++) { for ( c=0; c< A->len_aln; c++) { t+=(!is_gap(A->seq_al[a][c]) &&!is_gap(A->seq_al[b][c])); } w[a][b]=w[b][a]=(t*100)/A->len_aln; } } return w; } int ** get_cov_master_aln_array ( Alignment *A,int n, char *mode) { int **w; int b, c, t; w=declare_int ( A->nseq, A->nseq); for (b=0; b< A->nseq; b++) { for (t=0, c=0; c< A->len_aln; c++) { t+=(!is_gap(A->seq_al[n][c]) &&!is_gap(A->seq_al[n][c])); } w[n][b]=w[b][n]=(t*100)/A->len_aln; } return w; } int ** get_sim_master_aln_array ( Alignment *A,int n, char *mode) { int **w; int a; w=declare_int ( A->nseq, A->nseq); for ( a=0; a< A->nseq; a++) { if ( strm (mode, "cdna")) w[n][a]=w[a][n]=get_cdna_seq_sim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[n],GAP_LIST, mode); else w[n][a]=w[a][n]=get_seq_sim ( A->seq_al[n], A->seq_al[a],GAP_LIST, mode); } return w; } int ** get_dist_aln_array ( Alignment *A, char *mode) { int **w; w=get_sim_aln_array ( A, mode); return sim_array2dist_array(w,MAXID); } Sequence * seq2filter (Sequence *Sin, int min, int max) { int *keep; char *tmpfile; Sequence *S, *Sout; int a, b, sim; int **M; FILE *fp; int n; S=duplicate_sequence (Sin); for (a=0; anseq; a++)ungap(S->seq[a]); keep=vcalloc (S->nseq, sizeof (int)); M=read_matrice ("blossum62mt"); for (a=0; anseq; a++) { output_completion ( stderr, a, S->nseq, 100, "Distance Matrix Computation: "); for ( b=a+1; bnseq; b++) { sim=idscore_pairseq(S->seq[a], S->seq[b],-10, -2,M, "sim"); if ( sim>min && simnseq; a++) if ( keep[a]) { fprintf ( fp, ">%s %s\n%s", S->name[a], S->seq_comment[a], S->seq[a]); n++; } vfclose (fp); if (n==0) return NULL; Sout=main_read_seq(tmpfile); free_int (M, -1); vfree (keep); free_sequence (S, -1); return Sout; } Alignment * grep_seq (Alignment *S,char *field, char *mode, char *string) { int a; FILE *fp; char *tmp; int n=0; tmp=vtmpnam (NULL); fp=vfopen (tmp, "w"); if ( !strm(mode, "KEEP") && ! strm (mode, "REMOVE")) { add_warning ( stderr, "\nERROR: +grep [FATAL: %s]", PROGRAM); myexit (EXIT_FAILURE); } else if ( !strm(field, "SEQ") && ! strm (field, "COMMENT") && ! strm(field, "NAME")) { add_warning ( stderr, "\nERROR: +grep [FATAL: %s]", PROGRAM); myexit (EXIT_FAILURE); } for (n=0, a=0; a< S->nseq; a++) { int found=0; if (strm(field, "NAME") && perl_strstr (S->name[a], string))found=1; else if (strm(field, "COMMENT") && S->seq_comment[a][0] && perl_strstr (S->seq_comment[a], string) )found=1; else if (strm(field, "SEQ") && perl_strstr (S->seq_al[a], string))found=1; if ( (strm (mode, "KEEP") && found) || (strm (mode, "REMOVE") && !found)) { n++; fprintf (fp, ">%s", S->name[a]); if (S->seq_comment[a][0])fprintf (fp, " %s", S->seq_comment[a]); fprintf (fp, "\n%s\n", S->seq_al[a]); } } vfclose (fp); free_aln (S); if ( n==0) return NULL; else return main_read_aln (tmp, NULL); } Alignment * modify_seq (Alignment *S, char *field, char *string1, char *string2) { int a; FILE *fp; char *tmp; tmp=vtmpnam (NULL); fp=vfopen (tmp, "w"); for ( a=0; a< S->nseq; a++) { if (strm(field, "NAME"))S->name[a]=substitute ( S->name[a], string1, string2); else if (strm(field, "COMMENT"))S->seq_comment[a]=substitute ( S->seq_comment[a], string1, string2); else if (strm(field, "SEQ"))S->seq_al[a]=substitute ( S->seq_al[a], string1, string2); fprintf (fp, ">%s", S->name[a]); if (S->aln_comment[a][0])fprintf (fp, " %s", S->aln_comment[a]); fprintf (fp, "\n%s\n", S->seq_al[a]); } vfclose (fp); free_aln (S); S=main_read_aln (tmp, NULL); return S; } int ** seq2sim_mat (Sequence *S, char *mode) { return seq2comp_mat ( S,mode, "sim"); } int ** seq2cov_mat (Sequence *S, char *mode) { return seq2comp_mat ( S,mode, "cov"); } int ** seq2comp_mat (Sequence *S, char *mode, char *comp_mode) { int a, b; int **sim; char file[1000]; Alignment *A; char *name; /*Use pre_computed value if available in the current dir*/ name=path2filename(S->file[0]); sprintf ( file, "%s%s.%s.%s_file", get_cache_dir(),name, mode, comp_mode); A=seq2aln(S,NULL, RM_GAP); if ( check_file_exists (file) && is_distance_matrix_file (file) && (sim=input_similarities(file, A, NULL))!=NULL) { display_input_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK); fprintf ( stderr, "\n"); } else { char mode2[1000]; int **M; M=read_matrice (mode); sim=declare_int ( S->nseq, S->nseq); for ( a=0; a< S->nseq; a++) { ungap (S->seq[a]); sim[a][a]=100; } for ( a=0; anseq-1; a++) { output_completion4halfmat ( stderr, a, S->nseq, 100, "Similarity Matrix Computation: "); for ( b=a+1; b< S->nseq; b++) { sim[a][b]=sim[b][a]=idscore_pairseq(S->seq[a], S->seq[b],-12, -1,M, comp_mode); } } free_int (M,-1); sprintf ( mode2, "_memory_%ld", (long int)sim); output_similarities( file, A, mode2); display_output_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK); fprintf ( stderr, "\n"); } free_aln (A); return sim; } int ** fast_aln2sim_list (Alignment *A, char *mode, int *ns, int **ls) { int **simm; int p1, p2, p3, r1, r2; int gap,pos0,pos1,pos2,len,sim; int a, b, c, m, s=0,s1, s2, n; int free_ns=0; if (ns==NULL) { free_ns=1; ns=vcalloc (2, sizeof (int)); ns[0]=ns[1]=A->nseq; ls=declare_int (2, A->nseq); for ( a=0; a< 2; a++) for (b=0; bnseq; b++) ls[a][b]=b; } simm=declare_int (ns[0]*ns[1]+1, 3); if (strstr (mode, "sim1"))m=0; else if (strstr (mode, "sim2"))m=1; else if (strstr (mode, "sim3"))m=2; else if (strstr (mode, "gap1"))m=3; else if (strstr (mode, "cov1"))m=4; else if (strstr (mode, "logid"))m=5; else m=0; for (n=0,a=0; alen_aln; c++) { r1=tolower (A->seq_al[s1][c]); r2=tolower (A->seq_al[s2][c]); p1=(r1!='-')?1:0; p2=(r2!='-')?1:0; p3=p1+p2; if ( p3==0)continue; if ( p3==1)gap++; if ( r1==r2)sim++; pos1+=p1; pos2+=p2; pos0+=(p3==2)?1:0; len++; } if (m==0)s=(pos0==0)?0:(sim*MAXID)/pos0; //sim1 else if (m==1) s=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2 else if (m==2) s=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3 else if (m==3) s=(len==0) ?0:((len-gap)*MAXID)/len;//gap1 else if (m==4) s=(len==0) ?0:((pos0)*MAXID)/len; //cov else if (m==5) { s=logid_score ( sim, len); } simm[n][0]=s1; simm[n][1]=s2; simm[n][2]=s; } } if ( free_ns) {vfree(ns); free_int (ls, -1);} simm[n][0]=-1; return simm; } int ** fast_aln2sim_mat (Alignment *A, char *mode) { int **simm; int p1, p2, p3, r1, r2; int gap,pos0,pos1,pos2,len,sim; int a, b, c, m; simm=declare_int (A->nseq, A->nseq); if (strstr (mode, "sim1"))m=0; else if (strstr (mode, "sim2"))m=1; else if (strstr (mode, "sim3"))m=2; else if (strstr (mode, "gap1"))m=3; else if (strstr (mode, "cov1"))m=4; else if (strstr (mode, "logid"))m=5; else m=0; for ( a=0; a< A->nseq-1; a++) { simm[a][a]=MAXID; for ( b=a+1; b< A->nseq; b++) { gap=pos0=pos1=pos2=len=sim=0; for ( c=0; c< A->len_aln; c++) { r1=tolower (A->seq_al[a][c]); r2=tolower (A->seq_al[b][c]); p1=(r1!='-')?1:0; p2=(r2!='-')?1:0; p3=p1+p2; if ( p3==0)continue; if ( p3==1)gap++; if ( r1==r2)sim++; pos1+=p1; pos2+=p2; pos0+=(p3==2)?1:0; len++; } if (m==0)simm[a][b]=simm[b][a]=(pos0==0)?0:(sim*MAXID)/pos0; //sim1 else if (m==1) simm[a][b]=simm[b][a]=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2 else if (m==2) simm[a][b]=simm[b][a]=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3 else if (m==3) simm[a][b]=simm[b][a]=(len==0) ?0:((len-gap)*MAXID)/len;//gap1 else if (m==4) simm[a][b]=simm[b][a]=(len==0) ?0:((pos0)*MAXID)/len; //cov else if (m==5) { //Inspired from Muscle +mafft 5 simm[a][b]=simm[b][a]=logid_score ( sim, len); } } } return simm; } int logid_score ( int sim, int len) { float score; if ( len==0)return (int)(0.33*(float)MAXID); score=(float)sim/(float)len; if (score>0.9) score=1.0; else score=-log10 (1.0-score); score=(score*MAXID); return score; } int ** aln2sim_mat (Alignment *A, char*mode) { if ( strstr (mode, "idmat"))return fast_aln2sim_mat(A, mode); return get_sim_aln_array(A, mode); } int ** aln2cov (Alignment *A) { int a, b, c; int r1, r2, gr1, gr2, pos0, gap; int **cov; cov=declare_int (A->nseq, A->nseq); for (a=0; a< A->nseq-1; a++) { cov[a][a]=100; for ( b=a+1; bnseq; b++) { for (gap=0,pos0=0,c=0;clen_aln; c++) { r1=A->seq_al[a][c]; r2=A->seq_al[b][c]; gr1=is_gap(r1); gr2=is_gap(r2); if ( gr1+gr2==0)pos0++; else if ( gr1+gr2<2)gap++; } cov[a][b]=cov[b][a]=((gap+pos0)==0)?0:((pos0*100)/(gap+pos0)); } } return cov; } int ** get_raw_sim_aln_array (Alignment *A, char *mode) { int **w; int **M; int a, b, c, r1, r2, set, max, min; w=declare_int (A->nseq, A->nseq); if (strstr(mode, "sar"))M=NULL; else M=read_matrice (mode); HERE ("RAW STUFF"); for ( set=0,a=0; a< A->nseq; a++) for (b=a; bnseq; b++) { if (M) { for (c=0; clen_aln; c++) { r1=A->seq_al[a][c]; r2=A->seq_al[b][c]; if ( !is_gap(r1) && !is_gap(r2)) w[a][b]+=M[r1-'A'][r2-'A']; } } else if ( strm (mode, "sarmat2")) { w[a][b]=get_sar_sim2 (A->seq_al[a], A->seq_al[b]); } else { HERE ("ERROR: %s is an unknown mode of raw_sim\n", mode); exit (0); } w[b][a]=w[a][b]; if (!set){min=max=w[a][b];set=1;} min=MIN(min,w[a][b]); max=MAX(max,w[a][b]); } for (a=0; anseq; a++) for (b=a; bnseq; b++) { w[b][a]=((max-min)==0)?0:((w[b][a]-min)*100)/(max-min); w[a][b]=w[b][a]; } free_int (M, -1); return w; } int ** get_sim_aln_array ( Alignment *A, char *mode) { int **w; int a, b; w=declare_int ( A->nseq, A->nseq); for ( a=0; a< A->nseq-1; a++) { for ( b=a+1; b< A->nseq; b++) { w[a][b]=w[b][a]=generic_get_seq_sim ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode); } } return w; } int generic_get_seq_sim ( char *seq1, char *seq2, int*cache, char *mode) { if ( strm (mode, "cdna")) return get_cdna_seq_sim ( cache, seq1, seq2,GAP_LIST, mode); else if ( strnm (mode, "ktup",4)) return ktup_comparison (seq1, seq2,atoi(mode+4)); else if ( strstr (mode, "sarmat2")) { return get_sar_sim2 (seq1, seq2); } else if ( strstr (mode, "sarmat")) return (int) get_sar_sim (seq1,seq2); else { return get_seq_sim ( seq1,seq2,GAP_LIST, mode); } } int *** get_winsim_aln_array ( Alignment *A,char *mode, int ***w) { int a, b; for ( a=0; a< A->nseq; a++) for ( b=0; b< A->nseq; b++) { if ( strm (mode, "cdna")) w[a][b]=get_cdna_seq_winsim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]); else w[a][b]=get_seq_winsim ( A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]); } return w; } Alignment * seq2profile (Sequence *S, int i) { Alignment *A; if ((A=seq2R_template_profile (S, i))) { return A; } else { char *tmp; FILE *fp; tmp=vtmpnam (NULL); fp=vfopen ( tmp, "w"); fprintf (fp, ">%s\n%s\n", S->name[i], S->seq[i]); vfclose (fp); (S->T[i])->R=fill_R_template (S->name[i], tmp, S); return seq2R_template_profile (S, i); } } Alignment* aln2sub_aln_file (Alignment *A, int n, char **string) { char ***list; int a; list=vcalloc (A->nseq, sizeof (char***)); if ( n==0)return A; else if (n>1) { int l; char *buf; for (l=0,a=0; a< n; a++)l+=strlen (string[a]); buf=vcalloc ( 2*n+l+1, sizeof (char)); for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");} list[0]=string2list (buf); vfree (buf); } else if ( file_exists (NULL,string[0])) { list=read_group (string[0]); } else { fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM); myexit (EXIT_FAILURE); } a=0; while (list[a]) { int i, b; FILE *fp; n=atoi (list[a][0]); fp=vfopen (list[a][1], "w"); for (b=2; bname, A->nseq, MAXNAMES); if (n==3)ungap (A->seq_al[i]); fprintf (fp, ">%s\n%s\n", A->name[i], A->seq_al[i]); } vfclose (fp); free_char (list[a], -1); a++; } vfree(list); return A; } Sequence *remove_empty_sequence (Sequence *S) { int a, b; char *c; Sequence *NS; c=vcalloc ( S->max_len+1, sizeof (char)); for (a=0, b=0; a< S->nseq; a++) { sprintf ( c, "%s",S->seq[a]); ungap (c); if ( strlen (c)==0) { //vfree (S->seq[a]); S->seq[a]=NULL; add_warning ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]",S->name[a], PROGRAM); } } NS=duplicate_sequence (S); free_sequence (S, S->nseq); vfree (c); return NS; } Alignment* aln2sub_seq (Alignment *A, int n, char **string) { char ***list; int a; Sequence *S=NULL; list=vcalloc (A->nseq, sizeof (char***)); if ( n==0)return A; else if (n>1) { int l; char *buf; for (l=0,a=0; a< n; a++)l+=strlen (string[a]); buf=vcalloc ( 2*n+l+1, sizeof (char)); for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");} list[0]=string2list (buf); vfree (buf); } else if ( file_exists (NULL,string[0])) { list=read_group (string[0]); } else { fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM); myexit (EXIT_FAILURE); } a=0; while (list[a]) { int t; Alignment *B; Sequence *subS; B=main_read_aln (list[a][1], NULL); t=aln2most_similar_sequence(B, "idmat"); subS=extract_one_seq(B->name[t],0,0,B,KEEP_NAME); S=add_sequence (subS,S,0); free_aln (B);free_sequence (subS, -1); vremove (list[a][1]); a++; } vfree(list); return seq2aln (S, NULL, RM_GAP); } Alignment * aln2collapsed_aln (Alignment * A, int n, char **string) { Alignment *B; char ***list; char **list2; char *buf=NULL; FILE *fp; int a, b,c, ns, m, l; int *collapsed; list=vcalloc (A->nseq, sizeof (char***)); ns=0; if ( n==0)return A; else if (n>1) { for (l=0,a=0; a< n; a++)l+=strlen (string[a]); buf=vcalloc ( 2*n+l+1, sizeof (char)); for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");} list[0]=string2list (buf);ns=1; } else if ( file_exists (NULL,string[0])) { /*Format: Fasta like, the name fo the group followed with the name of the sequences > .... Groups must NOT be overlaping */ l=measure_longest_line_in_file (string[0])+1; buf=vcalloc (l, sizeof (char)); ns=0; fp=vfopen (string[0], "r"); while ((c=fgetc(fp))!=EOF) { buf=fgets (buf,l-1, fp); if ( c=='>')list[ns++]=string2list (buf); } vfclose (fp); } else { fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM); myexit (EXIT_FAILURE); } vfree (buf); buf=NULL; /*Identify lost sequences*/ collapsed=vcalloc (A->nseq, sizeof (int)); for ( a=0; a< ns; a++) { m=atoi (list[a][0]); for (b=2; bname, A->nseq, MAXNAMES); if ( c>=0)collapsed[c]=1; } } for ( a=0; a< A->nseq; a++) { if ( collapsed[a]==0) { list[ns]=declare_char (3, MAXNAMES); sprintf ( list[ns][0], "3"); sprintf ( list[ns][1], "%s", A->name[a]); sprintf ( list[ns][2], "%s", A->name[a]); ns++; } } vfree (collapsed); list2=declare_char (A->nseq, 100); /*1 Collapse the alignment*/ for ( a=0; a< ns; a++) { sprintf ( list2[a], "%s", list[a][2]); } B=extract_sub_aln2 ( A, ns, list2); /*2 Rename the sequences*/ for ( a=0; a< ns; a++) { sprintf ( B->name[a], "%s", list[a][1]); } /*replace sequence with consensus*/ for ( a=0; a< ns; a++) { m=atoi (list[a][0]); for (c=0, b=2; bseq_al[a], "%s", buf); } vfree (buf); free_aln (A); B->S=aln2seq(B); return B; } Alignment * aln2profile (Alignment * A) { Alignment *B=NULL; char *cons; if (!A->P) { A->P=declare_profile (AA_ALPHABET,A->len_aln+1); } B=copy_aln (A, B); free_int ((A->P)->count, -1); free_int ((A->P)->count2, -1); free_int ((A->P)->count3, -1); (A->P)->count=aln2count_mat (A); (A->P)->count2=aln2count_mat2 (A); cons=aln2cons_seq_mat (A, "blosum62mt"); sprintf (B->seq_al[0], "%s", cons); B->nseq=1; (A->P)->count3=aln2count_mat2 (B); vfree (cons); free_aln (B); return A; } int** aln2count_mat2 ( Alignment *A) { return sub_aln2count_mat2 (A, 0, NULL); } int sub_aln2nseq_prf ( Alignment *A, int ns, int *ls) { int a, c, s; Alignment *R; int n; int free_ls=0; if ( ns==0) { n=ns=A->nseq; ls=vcalloc (n, sizeof (int)); for ( a=0; anseq; a++)ls[a]=a; free_ls=1; } else { n=ns; } for (c=0,a=0; aS && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL) { n+=R->nseq; } else { ; } } if ( free_ls) vfree (ls); return n; } int** sub_aln2count_mat2 ( Alignment *A, int ns, int *ls) { char **p; int **count; int a, b, c, s; Alignment *R; int n; int free_ls=0; if ( ns==0) { n=ns=A->nseq; p=vcalloc ( n, sizeof (char*)); ls=vcalloc (n, sizeof (int)); for ( a=0; anseq; a++)ls[a]=a; free_ls=1; } else { n=ns; p=vcalloc (n, sizeof (char*)); } for (c=0,a=0; aS && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL) { n+=R->nseq; p=vrealloc (p, n*sizeof (char*)); for (b=0; bnseq; b++) { p[c++]=R->seq_al[b]; } } else { int w; w=A->order[s][4]+1; for (b=0; bseq_al[s]; } } count=sub_aln2count_mat3 (p,c); vfree (p); if ( free_ls) vfree (ls); return count; } int** sub_aln2count_mat3 (char **al, int ns) { int **count; int used[1000]; int a, b; int r; int len; int us; /*count[x][0]=n symbols in column count[x][1]=total_size of line count[x][2]=Gap frequency count[x][n]=symbol n count[x][n+1]=N occurence symbol n; count[x][n+2]=N frequence symbol n*100; special multi-channeling count[x][count[x][1]]=Nseq count[x][count[x][1]+s]=residue col x, sequence s */ for (a=0; a< 1000; a++)used[a]=0; len=strlen (al[0]); count=declare_int (len+2,100+ns+2); count[len][0]=END_ARRAY; count[len][1]=ns; count[len][2]=len; for (a=0; anseq-us)*100/A->nseq;*/ count[a][2]=ns-us; for (b=3; bnseq; for (b=1; b<=A->nseq; b++) count [a][count[a][1]+b]=(is_gap(A->seq_al[b-1][a]))?0:A->seq_al[b-1][a]; */ } #ifdef XXXXXX HERE ("Display "); for (a=0; a< 5; a++) { fprintf ( stderr, "\n"); for ( b=3; b< count[a][1]; b+=3) { fprintf ( stderr, "[%c %d]", count[a][b], count[a][b+1]); } fprintf ( stderr, "\n"); for ( b=0; blen_aln); for ( a=0; alen_aln; a++) { for ( b=0; b< A->nseq; b++) { if ( is_gap ( A->seq_al[b][a]))freq_mat[alp_size][a]++; else { x=tolower(A->seq_al[b][a]); freq_mat[x-'a'][a]++; freq_mat[alp_size+1][a]++; } } } return freq_mat; } char *aln2random_seq (Alignment *A, int pn1, int pn2, int pn3, int gn) { /* Given the frequencies in A ( read as total counts of each Residue in freq[A->nseq][A->len_aln], and pn1, pn2 and pn3: 1-Generate a new amino-acid at each position 2-Insert Gaps, using a HMM. pn3=Weight of the noise induced with sub mat. pn1=% noise type 1 ( Varies with entropi) n1=Ratio noise type 1 T =Nseq t1=Noise 1 expressed in Nseq al=alphabet size; ncat=number of non 0 cat for a given position ICi initial count for residue i Ci=freq[seq][AA] t1=T*n1*(1-1/ncat); t2=T*n2; Ci= ICi*(T-(t1+t2))/T +(t1)/al+(t2)/al */ int **freq; int **count; float T, tot_t1, tot_t2,tot_t3, n1, n2, n3; float ncat; double gf; double *init_freq; double *blur_freq; double *t1, *t2,*t3; int a, b, c, x; char *seq; int tot; /*Viterbi Parameters */ int p; int AL=0; /*Allowed Transition*/ int F=-100000; /*Forbiden Transition*/ int GAP_TRANSITION; int IGAP=0, IAA=1; int state,best_state=0, score, best_score=0; int p_state; int e=0; int **score_tab; int **state_tab; int nstate=2; int **transitions; int max; seq=vcalloc ( A->len_aln+1, sizeof (char)); count=aln2count_mat(A); freq=aln2count_mat(A); T=100; n1=(float)pn1/100; n2=(float)pn2/100; n3=(float)pn3/100; for ( a=0; a< A->len_aln; a++) { for ( b=0; b<26; b++) freq[b][a]=freq[b][a]*((T)/(A->nseq-freq[26][a])); freq[26][a]= (freq[26][a]*T)/A->nseq; } init_freq=vcalloc ( 26, sizeof (double)); blur_freq=vcalloc ( 26, sizeof (double)); tot_t1=tot_t2=tot_t3=0; t1=vcalloc ( 27, sizeof (double)); t2=vcalloc ( 27, sizeof (double)); t3=vcalloc ( 27, sizeof (double)); for (a=0; a< A->len_aln; a++) { /*Compute Frequencies*/ for (tot=0, b=0; b<26; b++) { if ( is_aa(b+'A')) { init_freq[b]=freq[b][a]; tot+=freq[b][a]; } } /*Count the number of different amino acids*/ for ( ncat=0, b=0; b<=26; b++) { ncat+=(freq[b][a]!=0)?1:0; } /*Blurr the distribution using */ blur_freq=compute_matrix_p (init_freq,tot); /*compute noise 1: biased with blurred content * enthropy--> keeps prosite motifs*/ tot_t1=T*n1*(1-1/ncat); for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t1[b]=blur_freq[b]*(1-1/ncat)*n1;} /*Compute noise 2: completely random*/ tot_t2=T*n2; for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t2[b]=tot_t2/21;} /*compute noise 3: biased with the sole content(pam250mt)*/ tot_t3=T*n3; for ( b=0; b<26; b++)if ( is_aa(b+'A')){t3[b]=blur_freq[b]*n3;} for ( b=0; b<26; b++) { if ( is_aa('A'+b)) freq[b][a]=freq[b][a]*(T-(tot_t1+tot_t2+(tot_t3)))/T+t1[b]+t2[b]+t3[b]; } /*end of the loop that mutates position a*/ } vfree (blur_freq); vfree (init_freq); vfree ( t3); /*1-Generate the amino acids of the new sequence new*/ vsrand (0); for ( a=0; a< A->len_aln; a++) { for (T=0,b=0; b<26; b++)T+=freq[b][a]; x=rand ()%((int)T); for (c=0,b=0; b<26; b++) { c+=freq[b][a]; if ( c>=x) { seq[a]='A'+b; c=-1; break; } } if ( c!=-1)seq[a]='-'; } seq[a]='\0'; /*2 Generate the gaps in the new sequence*/ if ( gn<0); else { transitions=declare_int ( nstate, nstate); score_tab=declare_int ( A->len_aln+2, nstate ); state_tab=declare_int ( A->len_aln+2, nstate ); for (a=0; alen_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} } for (p=1; p<= A->len_aln; p++) { for (max=0,a=0; a<26; a++)max=MAX(max, freq[a][p-1]); max=(max*(A->nseq-count[26][p-1]))/A->nseq; for (state=0; state< nstate; state++) { gf=freq[26][p-1]; if ( state==IGAP) e=gf-50; else if ( state==IAA ) e=max-50; for (p_state=0; p_statebest_score){ best_score=score;best_state=p_state;} } score_tab[p][state]=best_score; state_tab[p][state]=best_state; } } for (state=0; statebest_score){best_score=score_tab[p-1][state]; best_state=state;} } for (p=A->len_aln; p>0;) { if ( best_state==IGAP) { seq[p-1]='-'; } else if ( best_state==IAA) { seq[p-1]=seq[p-1]; } best_state=state_tab[p][best_state]; p--; } } free_int (freq, -1); return seq; } /********************************************************************/ /* */ /* Weighting functions */ /* */ /* */ /* */ /********************************************************************/ Alignment * master_trimseq( Alignment *A, Sequence *S,char *mode) { Alignment *NA; char *p; int a, b; int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0; float f_upper_sim, f_lower_sim; char weight_mode[1000]; char method[1000]; int statistics=0; int trim_direction=TOP; float **sim_weight; int *seq_list; int table=0; /* mode: (trim)__%_n_w */ seq_list=vcalloc ( S->nseq, sizeof (int)); for ( a=0; a< A->nseq; a++) { seq_list[a]=1; } use_aln=aln_is_aligned(A); if ( mode[0]=='\0') { upper_sim=50; lower_sim=0; min_nseq=0; sprintf (weight_mode, "pwsim"); sprintf ( method, "clustering2"); } else { upper_sim=lower_sim=min_nseq; sprintf (weight_mode, "pwsim"); sprintf ( method, "clustering2"); } /* U or % (deprecated) Upper bound for pairwise similarity L or m (depercated) Lower bound for pairwise similarity n max number of sequences N max number of sequences as a fraction of thet total S print Statistics T print Table of distances */ while ( (p=strtok(mode, "_"))) { mode=NULL; if (strm (p, "seq"))use_aln=0; else if ( strm(p,"aln"))use_aln=1; else if (p[0]=='s')statistics=1; else if (p[0]=='t')table=1; else if (p[0]=='U')upper_sim=atoi(p+1); else if (p[0]=='L')lower_sim=atoi(p+1); else if (p[0]=='n')min_nseq=atoi(p+1); else if (p[0]=='N')min_nseq=atoi(p+1)*-1; else if (p[0]=='B')trim_direction=BOTTOM; else if (p[0]=='T')trim_direction=TOP; else if (p[0]=='W')sprintf (weight_mode, "%s", p+1); else if (p[0]=='M')sprintf (method, "%s", p+1); else if (p[0]=='K') { while ((p=strtok(NULL, ":"))) { if ( p[0]=='#') { seq_list[atoi(p+1)-1]=2; } else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1) { seq_list[a]=2; } } } } if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50; if (!S) { fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM); crash(""); } else if ( min_nseq> S->nseq) { min_nseq=S->nseq; } else if ( min_nseq<0) { if ( min_nseq<-100) { add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n"); min_nseq=-100; } min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1); } NA=seq2subseq3 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list ); if ( table) { fprintf ( stderr, "\nSIMILARITY MATRIX\n"); for ( a=0; a< A->nseq-1; a++) for ( b=a+1; b< A->nseq; b++) { fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]); } } if ( statistics) { f_upper_sim=(upper_sim>100)?((float)upper_sim/(float)100):upper_sim; f_lower_sim=(upper_sim>100)?((float)lower_sim/(float)100):lower_sim; fprintf ( stderr, "\nTRIM Informations:\n"); fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln"); fprintf ( stderr, "\tcluster_mode..: %s\n" ,method); fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode); fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim); fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim); fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq); fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq); fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top"); } return NA; } Alignment *sim_filter (Alignment *A, char *in_mode, char *seq) { int **sim, **cov; int *list; int *keep; int maxnseq, nseq_ratio, nc; int new_nseq; int a, s, n, k; Alignment *R; char *mode; int outlayers; int direction=1;//remove the higher than int coverage=0; //remove based on coverage static char *field; int maxsim, minsim, maxcov, mincov; if ( !field) field=vcalloc (1000, sizeof (char)); mode=vcalloc ( strlen (in_mode)+10, sizeof (char)); sprintf ( mode, "_%s_", in_mode); strget_param ( mode, "_I", "100", "%d", &maxsim); strget_param ( mode, "_i", "0", "%d", &minsim); strget_param ( mode, "_C", "100", "%d", &maxcov); strget_param ( mode, "_c", "0", "%d", &mincov); keep=vcalloc ( A->nseq, sizeof (int)); list=vcalloc ( A->nseq, sizeof (int)); if (!seq)s=0; else s=name_is_in_list (seq, A->name, A->nseq, 100); if (s==-1) { if ( s==-1)printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is not a valid sequence", seq); } else keep[s]=1; //get the distances if ( strstr (mode, "_seq_")) { char **seq; int **M; M=read_matrice ("blosum62mt"); seq=declare_char (A->nseq, A->len_aln+1); for (a=0; anseq; a++) { sprintf ( seq[a], "%s", A->seq_al[a]); ungap (seq[a]); } sim=declare_int (A->nseq, A->nseq); cov=declare_int (A->nseq, A->nseq); for (a=0; anseq; a++) { if ( s!=a) { sim[s][a]=sim[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"sim"); cov[s][a]=cov[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"cov"); } } free_char (seq, -1); free_int (M,-1); } else { sim=aln2sim_mat (A, "idmat"); cov=aln2cov (A); } for (a=0; a< A->nseq; a++) { if (a==s)continue; else { if ( sim[s][a]>maxsim || sim[s][a]maxcov)keep[a]=-1; else keep[a]=1; } } for ( n=0, a=0; a< A->nseq; a++) { if ( keep[a]!=-1) { list[n++]=a; } } R=extract_sub_aln (A, n, list); free_int (sim, -1); free_int (cov, -1);vfree (list); return R; } static int find_worst_seq ( int **sim, int n, int *keep, int max, int direction); Alignment *simple_trimseq (Alignment *A, Alignment *K, char *in_mode, char *seq_list) { int **sim; int *list; int *keep; int maxnseq, maxsim, nseq_ratio, nc; int new_nseq; int a,b, s, n, k; Alignment *R; char *mode; int outlayers; int direction=1;//remove the higher than int coverage=0; //remove based on coverage static char *field; int *tot_avg; if ( !field) field=vcalloc (1000, sizeof (char)); mode=vcalloc ( strlen (in_mode)+10, sizeof (char)); sprintf ( mode, "_%s_", in_mode); strget_param ( mode, "_%%", "0", "%d", &maxsim); strget_param ( mode, "_n", "0", "%d", &maxnseq); strget_param ( mode, "_N", "0", "%d", &nseq_ratio); strget_param ( mode, "_F", "0", "%d", &nc); strget_param ( mode, "_O", "0", "%d", &outlayers); strget_param ( mode, "_f", "NAME", "%s", field); if ( strstr (mode, "_min"))direction=-1; else direction=1; if ( strstr (mode, "_cov"))coverage=1; else coverage=0; if ( nseq_ratio) { maxnseq=(A->nseq*nseq_ratio)/100; maxsim=0; } else if ( maxnseq) { maxsim=0; } else if ( !maxsim) { maxsim=100; } keep=vcalloc ( A->nseq, sizeof (int)); list=vcalloc ( A->nseq, sizeof (int)); /*Remove Sequences that do not have at least one residue in the first and last nc columns*/ if ( nc) { int left, right, full_n,x, y; int *full_list; Alignment *F; full_list=vcalloc ( A->nseq, sizeof (int)); full_n=0; for (x=0; x< A->nseq; x++) { for ( left=0,y=0; ylen_aln,nc); y++) if (!is_gap(A->seq_al[x][y]))left=1; for ( right=0,y=MAX(0,(A->len_aln-nc)); ylen_aln; y++) if (!is_gap(A->seq_al[x][y]))right=1; if ( left && right)full_list[full_n++]=x; } F=extract_sub_aln (A, full_n, full_list); free_aln (A); vfree (full_list); A=F; } /*Reorder the sequences according to the tree order: hopefully better phylogenetic coverage after trim*/ if (strstr (mode, "_T")) { NT_node **T; Sequence *O; sim=sim_array2dist_array ( sim, MAXID); T=int_dist2nj_tree (sim, A->name, A->nseq, NULL); O=tree2seq (T[3][0], NULL); A=reorder_aln (A, O->name, O->nseq); free_int (sim, -1); free_sequence (O, -1); } if ( coverage==0) { if ( strstr (mode, "seq_"))sim=seq2comp_mat (aln2seq(A), "blosum62mt", "sim"); else sim=aln2sim_mat (A, "idmat"); } else { int b; if ( strstr (mode, "seq_"))sim=seq2comp_mat (aln2seq(A), "blosum62mt", "cov"); else sim=aln2cov (A); } if ( K && K->nseq>0) { for ( a=0; a< K->nseq; a++) if ( (k=name_is_in_list (K->name[a], A->name, A->nseq, MAXNAMES+1))!=-1) { keep[k]=1; } } if ( seq_list) { for ( a=0; a< A->nseq; a++) { if (strstr (field, "NAME") && perl_strstr (A->name[a], seq_list)){keep[a]=1;} else if (strstr (field, "COMMENT") && A->seq_comment && perl_strstr(A->seq_comment[a], seq_list)){keep[a]=1;} else if (strstr (field, "SEQ") && perl_strstr((A->S)->seq[a], seq_list)){keep[a]=1;} } } for ( a=0; a< A->nseq; a++) if ( keep[a]) fprintf ( stderr, "\nFORCED KEEP %s", A->name[a]); new_nseq=A->nseq; while ( (s=find_worst_seq (sim, A->nseq, keep, maxsim, direction))!=-1 && new_nseq>maxnseq) { for ( a=0; a< A->nseq; a++)sim[a][s]=sim[s][a]=-1; keep[s]=-1; new_nseq--; } /*Trim Outlayers*/ if (outlayers!=0) { int nn, b; tot_avg=vcalloc ( A->nseq, sizeof (int)); for (a=0; anseq; a++) { if ( keep[a]==-1)tot_avg[a]=-1; else { for (nn=0, b=0; b< A->nseq; b++) { if (a==b || keep[b]==-1)continue; else { tot_avg[a]+=sim[a][b]; nn++; } } tot_avg[a]=(nn==0)?-1:(tot_avg[a])/nn; } } for ( a=0; anseq; a++) { if (tot_avg[a]!=-1 && tot_avg[a]name[a]); keep[a]=-1; } } vfree ( tot_avg); } for ( n=0, a=0; a< A->nseq; a++) { if ( keep[a]!=-1) { list[n++]=a; } } R=extract_sub_aln (A, n, list); free_int (sim, -1); vfree (list); return R; } int find_worst_seq ( int **sim, int n, int *keep,int max,int direction) { int **sc; int a, b, r=0; int si; sc=declare_int (n, 2); if (direction==-1)max=100-max; for ( a=0; a< n; a++) sc[a][0]=a; for ( a=0; a< n-1; a++) { for ( b=a+1; b=0)si=(direction==-1)?100-sim[a][b]:sim[a][b]; else si=sim[a][b]; if ( si>max) { if ( keep[a]!=1)sc[a][1]+=si; if ( keep[b]!=1)sc[b][1]+=si; } } } sort_int_inv ( sc, 2, 1, 0, n-1); if ( sc[0][1]>0)r=sc[0][0]; else r=-1; free_int (sc, -1); if (r!=-1 && keep && keep[r])return -1; else return r; } int find_worst_seq_old ( int **sim, int n, int *keep,int max,int direction) { int **sc; int a, b, r=0; sc=declare_int (n, 2); for ( a=0; a< n; a++) sc[a][0]=a; for ( a=0; a< n-1; a++) { for ( b=a+1; bmax) { if ( keep[a]!=1)sc[a][1]+=sim[a][b]; if ( keep[b]!=1)sc[b][1]+=sim[a][b]; } } else if ( direction == -1) { if ( sim[a][b]=0) { if ( keep[a]!=1)sc[a][1]+=sim[a][b]; if ( keep[b]!=1)sc[b][1]+=sim[a][b]; } } } } if ( direction ==1) //remove max { sort_int_inv ( sc, 2, 1, 0, n-1); if ( sc[0][1]>0)r=sc[0][0]; else r=-1; } else if ( direction ==-1)//remove min { sort_int_inv ( sc, 2, 1, 0, n-1); if ( sc[0][1]>=0)r=sc[0][0]; else r=-1; HERE ("** %d %d\n", r,sc[0][1]); } free_int (sc, -1); if (r!=-1 && keep && keep[r])return -1; else return r; } Alignment * trimseq( Alignment *A, Sequence *S,char *mode) { Alignment *NA; char *p; int a, b; int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0; char weight_mode[1000]; char method[1000]; int statistics=0; int trim_direction=TOP; float **sim_weight; int *seq_list; int table=0; int print_name=0; float f_lower_sim, f_upper_sim; /* mode: (trim)__%_n_w */ seq_list=vcalloc ( S->nseq, sizeof (int)); for ( a=0; a< A->nseq; a++) { seq_list[a]=1; } use_aln=aln_is_aligned(A); if ( mode[0]=='\0') { upper_sim=50; lower_sim=0; min_nseq=0; sprintf (weight_mode, "pwsim_fragment"); sprintf ( method, "clustering2"); } else { upper_sim=lower_sim=min_nseq; sprintf (weight_mode, "pwsim_fragment"); sprintf ( method, "clustering2"); } /* U or % (deprecated) Upper bound for pairwise similarity L or m (depercated) Lower bound for pairwise similarity n max number of sequences N max number of sequences as a fraction of thet total S print Statistics T print Table of distances */ while ( (p=strtok(mode, "_"))) { mode=NULL; if (strm (p, "seq"))use_aln=0; else if ( strm(p,"aln"))use_aln=1; else if (p[0]=='s')statistics=1; else if (p[0]=='t')table=1; else if (p[0]=='p')print_name=1; else if (p[0]=='U')upper_sim=atoi(p+1); else if (p[0]=='L')lower_sim=atoi(p+1); else if (p[0]=='n')min_nseq=atoi(p+1); else if (p[0]=='N')min_nseq=atoi(p+1)*-1; else if (p[0]=='B')trim_direction=BOTTOM; else if (p[0]=='T')trim_direction=TOP; else if (p[0]=='W')sprintf (weight_mode, "%s", p+1); else if (p[0]=='M')sprintf (method, "%s", p+1); else if (p[0]=='K') { while ((p=strtok(NULL, ":"))) { if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1) { seq_list[a]=2; } } } } if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50; if (!S) { fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM); crash(""); } else if ( min_nseq> S->nseq) { min_nseq=S->nseq; } else if ( min_nseq<0) { if ( min_nseq<-100) { add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n"); min_nseq=-100; } min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1); } NA=seq2subseq2 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list ); if ( table) { fprintf ( stderr, "\nSIMILARITY MATRIX\n"); for ( a=0; a< A->nseq-1; a++) for ( b=a+1; b< A->nseq; b++) { fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]); } } NA=seq_name2removed_seq_name(S, NA,sim_weight); if ( print_name) { fprintf ( stderr, "\nList of sequences with their closest removed neighbors\n"); for ( a=0; a< NA->nseq; a++)fprintf ( stderr, "\n%s: %s\n", NA->name[a], NA->seq_comment[a]); } if ( statistics) { f_lower_sim=(lower_sim>100)?(float)lower_sim/100:lower_sim; f_upper_sim=(upper_sim>100)?(float)upper_sim/100:upper_sim; fprintf ( stderr, "\nTRIM seq Informations:\n"); fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln"); fprintf ( stderr, "\tcluster_mode..: %s\n" ,method); fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode); fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim); fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim); fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq); fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq); fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top"); } return NA; } Alignment * tc_trimseq( Alignment *A, Sequence *S,char *mode) { Alignment *NA; Sequence *TS; char *trimfile, *alnfile; int *seq_list; int a, nseq=0, sim=0; char *p; char command[100000]; char keep_list[10000]; int top, bottom, middle, pmiddle; keep_list[0]='\0'; seq_list=vcalloc ( S->nseq, sizeof (int)); for ( a=0; a< A->nseq; a++) { seq_list[a]=1; } trimfile=vtmpnam (NULL); alnfile=vtmpnam (NULL); if ( !aln_is_aligned (A)) { fprintf ( stderr, "\ntrimTC: computation of an Approximate MSA ["); A=compute_tcoffee_aln_quick ( A, NULL); fprintf ( stderr, "DONE]\n"); } output_clustal_aln (alnfile, A); while ( (p=strtok(mode, "#"))) { mode=NULL; if (p[0]=='%' || p[0]=='S')sim=(p[1]=='%')?atoi(p+2):atoi(p+1); else if (p[0]=='n' || p[0]=='N')nseq=atoi(p+1); else if (p[0]=='K') { if ( (a=name_is_in_list (p+1, A->name, A->nseq, 100))!=-1) { seq_list[a]=2; } } } if ( nseq ==0 && sim ==0) { fprintf ( stderr, "\nERROR: trimTC\nIndicate the maximum number of sequences Nnseq\nOR the maximum average similarity of the chosen sequencesSx\nEX: +trimTC S20 OR +trimTC N5"); fprintf ( stderr, "\n[FATAL:%s]", PROGRAM); myexit (EXIT_FAILURE); } for ( a=0; anseq; a++)if (seq_list[a]==2){strcat ( keep_list, A->name[a]);strcat ( keep_list," ");} if ( sim) { sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,sim); if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);} my_system ( command); TS=read_sequences (trimfile); } else if ( nseq && A->nseq>nseq) { top=100;bottom=0; pmiddle=0;middle=50; sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0",get_string_variable("t_coffee"), alnfile, trimfile,middle); if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);} my_system ( command); TS=read_sequences (trimfile); fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t",middle, TS->nseq); if ( TS->nseq>nseq)top=middle; else if ( TS->nseqnseq!=nseq && pmiddle!=middle) { sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,middle); if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);} my_system ( command); free_sequence (TS, -1); TS=read_sequences (trimfile); fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t", middle, TS->nseq); if ( TS->nseq>nseq)top=middle; else if ( TS->nseq100)?(float)int_lower_sim/100:int_lower_sim; upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim; sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode); name=declare_char (S->nseq, (MAXNAMES+1)); seq= declare_char (S->nseq, S->max_len+1); /* Remove every sequence that is more than upper_sim and less than lower_sim similar to the master sequences the master sequence(s) are those for which seq_list[x]==2 */ new_nseq=A->nseq; for (a=0; a< A->nseq; a++) { if ( seq_list[a]==2) { for ( b=0; b< A->nseq;b++) { sim=100-sim_weight[0][a][b]; if (seq_list[b]==1 && (sim>upper_sim || simnseq; a++) { if ( seq_list[a]) { sprintf ( name[b], "%s", S->name[a]); sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] ); b++; } } NS=fill_sequence_struc (new_nseq,seq,name); NA=seq2aln(NS,NULL,1); if ( use_aln && A) { NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1); for (b=0, a=0; anseq; a++) { if ( seq_list[a]) { sprintf ( NA->seq_al[b] , "%s",A->seq_al[a]); b++; } } NA->len_aln=A->len_aln; ungap_aln(NA); } return NA; } Alignment* seq2subseq2( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list) { int a, b; int new_nseq; int seq_index=0; /*OUTPUT*/ char **seq, **name; Sequence *NS; Alignment *NA; float lower_sim, upper_sim; lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim; upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim; sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode); name=declare_char (S->nseq, (MAXNAMES+1)); seq= declare_char (S->nseq, S->max_len+1); /* 1 REMOVE OUTLAYERS 2 REMOVE CLOSELY RELATED SEQUENCES 3 IF STILL TOO MANY SEQUENCES: REMOVE THE MOST CLOSELY RELATED ONES */ /*1 Remove outlayers*/ new_nseq=A->nseq; /*1 Remove outlayers*/ while ( lower_sim && (extreme_seq(BOTTOM,A,sim_weight[0],seq_list, &seq_index) min_nseq) && seq_index!=-1) { if ( seq_list[seq_index]==1) { seq_list[seq_index]=0; new_nseq--; } } /*2 Remove close relative*/ while ( upper_sim && (extreme_seq(TOP, A,sim_weight[0],seq_list, &seq_index)>upper_sim) && ((new_nseq)>min_nseq)&& seq_index!=-1) { if ( seq_list[seq_index]==1) { seq_list[seq_index]=0; new_nseq--; } } /*Remove extra sequences*/ while ( min_nseq>0 && new_nseq>min_nseq && seq_index!=-1) { extreme_seq(trim_direction, A,sim_weight[0],seq_list, &seq_index); if ( seq_index==-1)break; if ( seq_list[seq_index]==1) { seq_list[seq_index]=0; new_nseq--; } } /*Prepare the new sequence List*/ for (b=0, a=0; anseq; a++) { if ( seq_list[a]) { sprintf ( name[b], "%s", S->name[a]); sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] ); b++; } } NS=fill_sequence_struc (new_nseq,seq,name); NA=seq2aln(NS,NULL,1); if ( use_aln && A) { NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1); for (b=0, a=0; anseq; a++) { if ( seq_list[a]) { sprintf ( NA->seq_al[b],"%s",A->seq_al[a]); b++; } } NA->len_aln=A->len_aln; ungap_aln(NA); } return NA; } float extreme_seq (int direction, Alignment *A,float **sim_weight,int *seq_list, int *seq_index) { /*find the closest relative of each sequence Return: Direction= BOTTOM: the sequence whose closest relative is the most distant Direction= TOP: the sequence whose closest relative is the closest weight: different sequences=100 similar sequences =0 */ int a, b; float top_sim,bottom_sim, best_sim, sim; int top_seq, bottom_seq; bottom_seq=top_seq=seq_index[0]=-1; top_sim=-1; bottom_sim=101; for (a=0; a< A->nseq; a++) { if (seq_list[a]!=1)continue; for ( best_sim=0, b=0; b< A->nseq; b++) { if ( a==b || !seq_list[b])continue; sim=100-sim_weight[a][b]; if (sim>best_sim) { best_sim=sim; } } if ( best_sim>top_sim) { top_seq=a; top_sim=best_sim; } if ( best_sim_%_n_w */ sim_weight=get_weight ((use_aln)?A:NULL, S, weight_mode); pw_weight=declare_float (S->nseq, S->nseq); seq_weight=declare_float ( S->nseq, 2); for (best_score=0,a=0; anseq; a++) { for ( b=0; bnseq; b++) { if ( a==b)continue; seq_weight[a][0]+=sim_weight[a][b]; } seq_weight[a][0]=seq_weight[a][0]/(S->nseq-1); score=seq_weight[a][0]=100-seq_weight[a][0]; if ( score>best_score) { best_seq=a; best_score=score; } } for (a=0; anseq; a++) { for ( b=0; bnseq; b++) { if ( a==b)continue; pw_weight[a][b]=sim_weight[a][b]*seq_weight[a][0]*seq_weight[b][0]/(100*100); } } seq_list=vcalloc ( S->nseq, sizeof (int)); used_seq_list=vcalloc ( S->nseq, sizeof (int)); name=declare_char (S->nseq, (MAXNAMES+1)); seq= declare_char (S->nseq, S->max_len+1); /*compute the normalization factor*/ for (sum=0,d=0; d< S->nseq; d++) { for (score=0,c=0; cnseq; c++) { if ( c!=d) score=MAX(score, 100-sim_weight[c][d]); } sum+=score; } sum=sum/S->nseq; /*chose the first sequence */ for ( best_score=0,a=0; a< S->nseq; a++) { for (score=0, b=0; b< S->nseq; b++) { score+=100-sim_weight[a][b]; } if ( score>best_score) { best_seq=a; best_score=score; } } last_chosen=chosen=((best_score/S->nseq)*100)/sum; nchosen=last_nchosen=1; seq_list[0]=best_seq; used_seq_list[best_seq]=1; sprintf ( name[0],"%s", S->name[seq_list[0]]); sprintf ( seq[0],"%s", S->seq[seq_list[0]]); nchosen=last_nchosen=1; fprintf ( stderr, "\nTRIM:\n"); fprintf ( stderr, "\n1-Chosen Sequences\n"); /*Assemble the list of sequences*/ for (a=1; a< S->nseq; a++) { for (best_score=0,b=0; b< S->nseq; b++) { if (used_seq_list[b]); else { score=pw_weight[seq_list[0]][b]+1; for (c=0; c=best_score) { best_seq=b; best_score=score; } } } seq_list[a]=best_seq; used_seq_list[best_seq]=1; for ( chosen=0,d=0; d< S->nseq; d++) { for (score=0, c=0; c<=a; c++) { if ( seq_list[c]!=d) score=MAX(score, 100-sim_weight[seq_list[c]][d]); } chosen+=score; } chosen=((chosen/S->nseq)*100)/sum; nchosen=a+1; condition1= (int)chosen<=(int)percent || !percent; condition2=(nchosen)<=max_nseq || !max_nseq; if (condition1 && condition2) { fprintf ( stderr, "\tADD %s (set score: %.2f %%)\n", S->name[seq_list[a]], chosen); sprintf ( name[a],"%s", S->name[seq_list[a]]); sprintf ( seq[a],"%s", S->seq[seq_list[a]]); } else { break; } last_chosen=chosen; last_nchosen=nchosen; } NS=fill_sequence_struc (last_nchosen,seq,name); NA=seq2aln(NS,NULL,1); fprintf ( stderr, "\n2-Informations:\n"); fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln"); fprintf ( stderr, "\tweight_mode...: %s\n" ,weight_mode); fprintf ( stderr, "\tpercent_weight: %.2f%% (max=%d%%)\n",last_chosen,percent); fprintf ( stderr, "\tn_seq.........: %d\n" ,NS->nseq); fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NS->nseq*100)/S->nseq); return NA; } float ** get_weight ( Alignment *A, Sequence *S, char *mode) { char *aln_name; char *weight_name; char *seq_name; char command[LONG_STRING]; char program[LONG_STRING]; float **weight; FILE *fp; int c; if ( !mode || !mode[0] || strm (mode, "msa")) { if ( getenv ( "SEQ2MSA_WEIGHT")==NULL)sprintf (program, "%s",SEQ2MSA_WEIGHT); else sprintf ( program, "%s", (getenv ( "SEQ2MSA_WEIGHT"))); } else if ( strm(mode, "pwsim") ||strm(mode, "pwsim_fragment") ) { return seq2pwsim (A, S, mode); } else { if (getenv (mode))sprintf ( program, "%s", (getenv (mode))); else fprintf ( stderr, "\nERROR: %s is not a valid mode for weight computation [FATAL:%s]", mode, PROGRAM); } /*MSA weights*/ seq_name=vtmpnam(NULL); aln_name=vtmpnam(NULL); weight_name=vtmpnam(NULL); weight=declare_float (S->nseq+1, 2); if (A) { output_clustal_aln (seq_name,A); output_fasta_seq (aln_name,A); sprintf ( command, "%s %s -i %s -w %s", program, seq_name, aln_name, weight_name); } else { A=seq2aln(S,A,1); output_fasta_seq (seq_name,A); sprintf ( command, "%s %s -w %s", program, seq_name, weight_name); } my_system ( command); fp=vfopen( weight_name, "r"); while ( (c=fgetc(fp))!='$'); c=fgetc(fp); c=0; while ( (fscanf (fp, "%*s %f\n",&(weight[c][1])))==1) {weight[c][0]=c;c++;} vfclose (fp); return weight; } float **seq2pwsim ( Alignment *A, Sequence *S, char *mode) { int a, b, c; float d,t; float **W; Alignment *B; W=declare_float (S->nseq, S->nseq); for (a=0; a< S->nseq; a++) for ( b=a; bnseq; b++) { if ( a==b){d=1;} else if (!A) { B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise"); for (t=0,d=0,c=0; clen_aln; c++) { d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c])); t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c])); } t=(strm ( mode, "pwsim_fragment"))?B->len_aln:t; d=d/((t==0)?1:t); free_aln(B); } else { for (t=0,d=0,c=0; clen_aln; c++) { d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c])); t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c])); } d=d/((t==0)?1:t); } W[a][b]=W[b][a]=(1-d)*100; } return W; } float **seq2pwsim_fragment ( Alignment *A, Sequence *S, char *mode) { int a, b, c; float d,t; float **W; Alignment *B; W=declare_float (S->nseq, S->nseq); for (a=0; a< S->nseq; a++) for ( b=a; bnseq; b++) { if ( a==b){d=1;} else if (!A) { B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise"); for (t=0,d=0,c=0; clen_aln; c++) { d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c])); t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c])); } d=d/((t==0)?1:t); free_aln(B); } else { for (t=0,d=0,c=0; clen_aln; c++) { d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c])); t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c])); } d=d/((t==0)?1:t); } W[a][b]=W[b][a]=(1-d)*100; } return W; } /********************************************************************/ /* */ /* AMINO ACID FUNCTIONS */ /* */ /* */ /* */ /********************************************************************/ //Builds an extended alphabet from a string char** string2alphabet (char *string, int depth, int *falp_size) { int max_s; int a, b,c, l, n; char buf[1000]; char **alp; int alp_size; char ***alp2; int *alp2_size; int *array; char **falp; l=strlen (string); array=vcalloc ( 256, sizeof (int)); max_s=l+1; falp_size[0]=0; falp=declare_char (l+1, 2); alp=declare_char(l,2); alp_size=0; array=vcalloc ( 256, sizeof (int)); for (a=0;a0) { for ( c=0,b=0;b< 26; b++) { if ( matrix[a][b]>0 && matrix[b][b]>0) { buf[c++]=b+'A'; buf[c++]=b+'a'; } } buf[c]='\0'; for ( is_in=0,b=0; b< ngroup[0]; b++)if ( strcmp (buf, group_list[b])==0)is_in=1; if (is_in==0)sprintf ( group_list[ngroup[0]++], "%s", buf); } } free_int (matrix, -1); vfree (matrix_name); return group_list; } char** make_group_aa_upgma (char*matrix, int max_n) { char **group_list; int **mat; int *used; int a, b, ba, bb, best, set, l, n; l=26; group_list=declare_char (l+1, l+1); for (a=0; amax_n) { for (set=0,a=0; abest) { best=mat[a][b]; ba=a; bb=b; set=1; } } for (a=0; ac)?d[a]:c; return c; } int is_in_same_group_aa ( char r1, char r2, int n_group, char **gl, char *mode) { int a; static char **lgl; static int ln_group; char **gl2; int n_group2; /*use mode=idmat for similarity based on id*/ r1=toupper(r1); r2=toupper(r2); if (mode==NULL)return (r1==r2)?1:0; if ( strm (mode, "clean")) { free_char (lgl, -1); lgl=NULL; ln_group=0; return 0; } else if ( strstr (mode, "cov")) { return 1; } if ( lgl==NULL) { lgl=make_group_aa ( &ln_group, mode); } if ( gl==NULL) { gl2=lgl; n_group2=ln_group; } else { gl2=gl; n_group2=n_group; } for ( a=0; a< n_group2; a++) if ( is_in_set ( r1, gl2[a]) && is_in_set ( r2, gl2[a]))return 1; return 0; } Alignment * gene2prot (Alignment *A){return A; } char * test_gene2prot (Constraint_list *CL, int s1) { int a, b,q, nal; int F=-10000000; /*FORBIDEN STATE*/ int AL=0; /*ALLOWED STATE*/ int SPLICE_PENALTY=1000; int FRAME_PENALTY=1000; int START, ORF1, ORF2, ORF3, s5NC; int s3NC,ORF3_G1, ORF3_T2, ORF3_NC, ORF3_A3, ORF3_T4; int U1_G1, U1_T2, U1_NC, U1_A3, U1_T4; int U2_G1, U2_T2, U2_NC, U2_A3, U2_T4; int U1, U2, U3, U4, U5, END; int nstate=0; int **transitions; int **v_tab; int **v_tab_p; int **last_coding; int **last_t4; int *potential; int v; int orf1, orf2, orf3, ncp, p, state, pstate, e, best_state_p=0, best_state_v=0, best_pstate_p=0, best_pstate_v; char *seq, *seq2, *seq3; int l; int *is_coding; int *is_t4; char *codon; static int *entry; int tot=0; seq=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char)); seq2=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char)); seq3=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char)); sprintf ( seq, "%s", (CL->S)->seq[s1]); ungap (seq); l=strlen (seq); for ( a=0; a< l; a++) seq[a]=tolower ( seq[a]); for ( a=0; a< l; a++) seq[a]=(seq[a]=='t')?'u': seq[a]; potential=vcalloc (l+1, sizeof (int)); CL=index_constraint_list ( CL); for (nal=0, a=0; a<(CL->S)->nseq; a++) for ( b=CL->start_index[s1][a]; b< CL->end_index[s1][a];b++) { entry=extract_entry(entry, b, CL); if ( entry[SEQ1]==s1)potential[entry[R1]-1]+=entry[WE]; else if ( entry[SEQ2]==s1)potential[entry[R2]-1]+=entry[WE]; tot+=entry[WE]; nal++; } SPLICE_PENALTY=10000; FRAME_PENALTY=1000; nstate=0; START=nstate++; ORF1=nstate++; ORF2=nstate++; ORF3=nstate++; s5NC=nstate++; s3NC=nstate++; ORF3_G1=nstate++;U1_G1=nstate++;U2_G1=nstate++; ORF3_T2=nstate++;U1_T2=nstate++;U2_T2=nstate++; ORF3_NC=nstate++;U1_NC=nstate++;U2_NC=nstate++; ORF3_A3=nstate++;U1_A3=nstate++;U2_A3=nstate++; ORF3_T4=nstate++;U1_T4=nstate++;U2_T4=nstate++; U1=nstate++; U2=nstate++; U3=nstate++; U4=nstate++; U5=nstate++; END=nstate++; is_coding=vcalloc ( nstate, sizeof (int)); is_coding[ORF1]=is_coding[ORF2]=is_coding[ORF3]=is_coding[U1]=is_coding[U2]=1; is_coding[U3]=is_coding[U4]=is_coding[U5]=1; is_t4=vcalloc ( nstate, sizeof (int)); is_t4[ORF3_T4]=is_t4[U1_T4]=is_t4[U2_T4]=1; transitions=declare_int ( nstate, nstate); for (a=0; a< nstate; a++) for ( b=0; b< nstate; b++)transitions[a][b]=F; transitions[START][ORF1]=AL; transitions[START][s5NC]=AL-FRAME_PENALTY; transitions[s5NC][s5NC]=AL; transitions[s5NC][ORF1]=AL-FRAME_PENALTY; transitions[ORF1][ORF2]=AL; transitions[ORF2][ORF3]=AL; transitions[ORF3][U1]=AL; transitions[ORF3][ORF1]=AL; transitions[ORF3][ORF3_G1]=AL-SPLICE_PENALTY; transitions[ORF3_G1][ORF3_T2]=AL; transitions[ORF3_T2][ORF3_NC]=AL; transitions[ORF3_NC][ORF3_NC]=AL; transitions[ORF3_NC][ORF3_A3]=AL; transitions[ORF3_A3][ORF3_T4]=AL; transitions[ORF3_T4][ORF1]=AL-SPLICE_PENALTY; transitions[U1][U2]=AL; transitions[U1][U1_G1]=AL-SPLICE_PENALTY; transitions[U1_G1][U1_T2]=AL; transitions[U1_T2][U1_NC]=AL; transitions[U1_NC][U1_NC]=AL; transitions[U1_NC][U1_A3]=AL; transitions[U1_A3][U1_T4]=AL; transitions[U1_T4][U3]=AL-SPLICE_PENALTY; transitions[U3][U4]=AL; transitions[U4][ORF1]=AL; transitions[U2][U2_G1]=AL-SPLICE_PENALTY; transitions[U2_G1][U2_T2]=AL; transitions[U2_T2][U2_NC]=AL; transitions[U2_NC][U2_NC]=AL; transitions[U2_NC][U2_A3]=AL; transitions[U2_A3][U2_T4]=AL; transitions[U2_T4][U5]=AL-SPLICE_PENALTY; transitions[U5][ORF1]=AL; transitions[ORF3][s3NC]=AL-FRAME_PENALTY; transitions[ORF3][END]=AL; transitions[s3NC][END]=AL; v_tab=declare_int ( l+1,nstate); v_tab_p=declare_int ( l+1,nstate); last_coding=declare_int ( l+1,nstate); last_t4=declare_int ( l+1,nstate); for (a=0; a< l; a++) potential[a]-=200; codon=vcalloc ( 4, sizeof (char)); best_pstate_p=START; best_pstate_v=0; nal=0; for ( p=1; p<=l; p++) { if (translate_dna_codon (seq+(p-1), 'x')=='x' || p>(l-2))orf1=F; else orf1=potential[p-1]; if (p<2 || translate_dna_codon (seq+(p-2), 'x')=='x' || p>(l-1))orf2=F; else orf2=potential[p-1]; if (p<3 || translate_dna_codon (seq+(p-3), 'x')=='x' || p>l)orf3=F; else orf3=potential[p-1]; if ( best_int (3, 1, &a, orf1, orf2, orf3)!=F)ncp=-best_int (3, 1, &a, orf1, orf2, orf3); else ncp=1000; for ( state=0; state< nstate; state++) { if ( state==ORF1)e=orf1; else if ( state==ORF2)e=orf2; else if ( state==ORF3)e=orf3; else if ( state>=U1 && state<=U3) { e=0; } else if ( state==U4) { codon[2]=seq[p-1]; codon[1]=seq[last_coding[p-1][U3]-1]; codon[0]=seq[last_coding[p-2][U1_T4]-1]; if ( translate_dna_codon (codon, 'x')=='x')e=F; else e=0; } else if ( state==U5) { codon[2]=seq[p-1]; codon[1]=seq[last_coding[p-1][U2_T4]-1]; q=seq[last_coding[p-1][U2_T4]]; codon[0]=seq[last_coding[q-1][U1]-1]; if ( translate_dna_codon (codon, 'x')=='x')e=F; else e=0; } else if (state>=ORF3_G1 && state<=U2_G1)e=(p=ORF3_T2 && state<=U2_T2) { e=(p>1 && seq[p-2]=='g' && seq[p-1]=='u')?ncp:F; } else if ( state>=ORF3_A3 && state<=U2_A3)e=(seq[p-1]=='a')?ncp:F; else if ( state>=ORF3_T4 && state<=U2_T4)e=(seq[p-1]=='u')?ncp:F; else e=ncp; for ( pstate=0; pstatebest_pstate_v) {best_pstate_v=v;best_pstate_p=pstate;} } v_tab[p][state]=best_pstate_v; v_tab_p[p][state]=best_pstate_p; if (!is_coding[state])last_coding[p][state]=last_coding[p-1][best_pstate_p]; else if (is_coding[state])last_coding[p][state]=p; if (!is_t4[state]) { if (is_coding[state] && last_t4[p-1][best_pstate_p]==0)last_t4[p][state]=p; else last_t4[p][state]=last_t4[p-1][best_pstate_p]; } else if (is_t4[state])last_t4[p][state]=p; if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;} } } tot=0; for ( p=l; p>0; p--) { if ( best_state_p>=ORF1 && best_state_p<=ORF3){seq2[tot++]=tolower (seq[p-1]);} else if ( best_state_p>=U1 && best_state_p<=U5){seq2[tot++]=tolower (seq[p-1]);} if (best_state_p==ORF1)seq[p-1]=toupper (seq[p-1]); else if (best_state_p==ORF2 || best_state_p==ORF3)seq[p-1]=tolower (seq[p-1]); else if ( best_state_p==ORF3_NC || best_state_p==U1_NC || best_state_p==U2_NC) seq[p-1]='.'; else if ( best_state_p==U1 || best_state_p==U2 || best_state_p==U3 || best_state_p==U4 || best_state_p==U5) seq[p-1]=best_state_p-U1+'1'; else seq[p-1]=toupper (seq[p-1]); best_state_p=v_tab_p[p][best_state_p]; } for ( a=0, b=tot-1; b>=0; b--, a++) seq3[a]=seq2[b]; fprintf ( stderr, "\n%s\n", seq); fprintf ( stderr, "\nN coding=%d\n", tot); for ( a=0; a< tot; a+=3) { b=translate_dna_codon (seq3+a, 'x'); fprintf ( stderr, "%c",b); if ( b=='x'){fprintf ( stderr, "\n");myexit (EXIT_SUCCESS);} } fprintf ( stderr, "\n"); myexit (EXIT_SUCCESS); return 0; } Alignment * dna_aln2_3frame_cdna_aln(Alignment *A,int *ns,int **l_s) { Alignment *B; int a; B=realloc_aln2 (NULL,6,strlen(A->seq_al[l_s[0][0]])+strlen(A->seq_al[l_s[1][0]])); for ( a=0; a< 3; a++) { B->seq_al[a]=translate_dna_seq (A->seq_al[l_s[0][0]]+a, 0, 'o',B->seq_al[a]); B->seq_al[a+3]=translate_dna_seq (A->seq_al[l_s[1][0]]+a, 0, 'o',B->seq_al[a+3]); } for ( a=1; a<3; a++) { if ( strlen(B->seq_al[a])seq_al[0])) B->seq_al[a]=strcat ( B->seq_al[a], "x"); if ( strlen(B->seq_al[a+3])seq_al[3])) B->seq_al[a+3]=strcat ( B->seq_al[a+3], "x"); } B->nseq=6; B->len_aln=strlen (B->seq_al[0]); return B; } //JM_ADD //For normal distribution scan #ifndef PI #define PI 3.141592653589793238462643 #endif double normal(double x, double mean, double std) { return (1/(std*sqrt(2.0*PI)))*exp((-0.5*(x-mean)*(x-mean))/(std*std)); } int ** get_sim_aln_array_normal_distribution ( Alignment *A, char *mode, int *STD, int *CENTER) { int **w; int a, b; w=declare_int ( A->nseq, A->nseq); for ( a=0; a< A->nseq-1; a++) { for ( b=a+1; b< A->nseq; b++) { w[a][b]=w[b][a]=generic_get_seq_sim_normal_distribution ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode, STD, CENTER); } } return w; } int generic_get_seq_sim_normal_distribution ( char *seq1, char *seq2, int*cache, char *mode, int *STD, int *CENTER) { return get_seq_sim_distribution ( seq1,seq2,GAP_LIST, mode, STD, CENTER); } int get_seq_sim_distribution ( char *string1, char *string2, char *ignore, char *in_mode, int *STD, int *CENTER) { int len1; int a; int pos0, gap=0; int p1, p2; int r=0,r1=0,r2=0; char *p; char mode[1000]; double sim; sprintf ( mode, "%s", in_mode); /*mode: __ mat: idscore to get the alignment done any legal cw matrix sim_mode: sim1->identities/matches sim2->identities/min len */ if ( (p=strstr (mode, "_"))!=NULL) { p[0]='\0'; p++; } if (strstr (mode, "idscore")) { static int **mat; if (!mat) mat=read_matrice ("blosum62mt"); return idscore_pairseq (string1, string2, -12, -1, mat,mode); } len1=strlen (string1); for ( sim=pos0=0,a=0; a< len1; a++) { r1=string1[a]; r2=string2[a]; p1=1-is_in_set (r1, ignore); p2=1-is_in_set (r2, ignore); if (p1 && p2) { pos0++; if (is_in_same_group_aa(r1,r2,0, NULL, mode)) { sim += normal(a, *CENTER, *STD); } } else if (p1+p2==1) { gap++; } } if ( p==NULL || strm (p, "sim1") || strm (p, "sim")) { r=(pos0==0)?0:(sim*MAXID); } /* else if ( strm (p, "sim2")) { r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2); } else if ( strm (p, "sim3")) { r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2); } else if ( strm (p, "gap1")) { r=(len1==0)?MAXID:(gap*MAXID)/len1; r=MAXID-r; } else if ( strm (p, "logid")) { r=logid_score (pos0, sim); }*/ return r; } Alignment *aln2clean_pw_aln (Alignment *A, OveralnP *F)// char *mode, int t, int f, int p1,int p2, int p3, char *fsa_mode) { int **C, **T; int a, b, c; Alignment *B; if (F->t==0)F->t=2; C=declare_int ( A->nseq, A->len_aln); T=declare_int ( A->nseq, A->len_aln); B=copy_aln (A, NULL); for (a=0; a< A->nseq;a++) { for (b=0; bnseq; b++) { int *w; w=pw_aln2clean_aln_weight (A->seq_al[a], A->seq_al[b], 1,F);//f,p1, p2, p3, fsa_mode); for (c=0; clen_aln; c++) { if (A->seq_al[a][c]=='-')continue; C[a][c]+=w[c]; T[a][c]++; } vfree (w); } } for (a=0; anseq; a++) { for (b=0; blen_aln; b++) { int c; c=A->seq_al[a][b]; if ( c=='-'); else if (T[a][b]==0); else { int r; r=(C[a][b]*10)/T[a][b]; r=(r==10)?9:r; if (!F->mode || strm (F->mode, "number")) B->seq_al[a][b]='0'+r; else if ( F->mode && strm (F->mode, "unalign")) B->seq_al[a][b]='0'+r; else if ( F->mode && strm (F->mode, "lower") ) { if (r<=F->t)B->seq_al[a][b]=tolower (B->seq_al[a][b]); else B->seq_al[a][b]=toupper (B->seq_al[a][b]); } } } } if ( F->mode && strm (F->mode, "unalign")) { A=unalign_aln (A, B, F->t); free_aln (B); B=copy_aln (A, NULL); } free_int (C, -1); free_int (T, -1); return B; } char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *F); char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *F); int * pw_aln2clean_aln_weight ( char *seq1, char *seq2, int w, OveralnP *F) { char **aln; int *weight; int l, a; if ( (l=strlen (seq1)) !=strlen (seq2)) { HERE ("\n%s\n%s\n", seq1, seq2); printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Comparing unaligned sequences [FATAL:%s]", PROGRAM); } aln=declare_char (2, l+1); sprintf ( aln[0], "%s", seq1); sprintf ( aln[1], "%s", seq2); aln=pw_aln2clean_pw_aln (aln, F); weight=vcalloc (l+1, sizeof (int)); for (a=0; amodel, "fsa2"))return pw_aln2clean_pw_aln_fsa2 (aln,F); else if ( strm (F->model, "fsa1"))return pw_aln2clean_pw_aln_fsa1 (aln,F); else return pw_aln2clean_pw_aln_fsa1 (aln,F); } char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *FO) { int a, b, c, d, l, id; int c1, c2, e0, e1,tb, obs; int T0, T1,T2; int **mat, **tran, **p, **t, *s, *ids; int ns, ps, cs; int S, M1, M2, m1, m2,B1, B2,G1,G2, K; int F=-9999999; int MID_EXON_FACTOR=50; int best; static int **smat; int model_type=1; int *translate; if ( getenv ("MID_EXON_FACTOR"))MID_EXON_FACTOR=atoi (getenv ("MID_EXON_FACTOR")); if (!smat)smat=read_matrice ( "blosum62mt"); l=strlen (aln[0]); if ( l!=strlen (aln[1])) { printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings"); } s=vcalloc (l, sizeof (int)); ids=vcalloc (l, sizeof (int)); //record the id level of each posotion for (b=0; b=2){id++; s[a]=1;} else {s[a]=0;} b++; } } if (b==0) { vfree(s);vfree (ids); return aln; } FO->p1=(FO->p1==0)?5:FO->p1; FO->p2=(FO->p2==0)?15:FO->p2; FO->p3=(FO->p3==0)?0:FO->p3; FO->p4=(FO->p4==0)?100:FO->p4; T1=100*(float)id/(float)b; T2=(FO->f==0)?30:T1*(float)((float)FO->f/(float)100); T2=MAX(T2,20); //0: unaligned //1: aligned //2: gap //3: exon boundary ns=0; S=ns++; M1=ns++;//1 matched aligned m1=ns++;//2 mmatched aligned M2=ns++;//3 matched unaligned m2=ns++;//4 mmatched unaligned B1=ns++;//5 transition aligned B2=ns++;//6 transition unaligned mat=declare_int (ns, 4); tran=declare_int (ns, ns); p=declare_int (l+1, ns); t=declare_int (l+1, ns); //emission Values mat[M1][0]=F; //non id mat[M1][1]=T1;//id mat[M1][2]=0; //gap mat[M1][3]=F; //transition mat[M2][0]=F; mat[M2][1]=T2; mat[M2][2]=0; mat[M2][3]=F; mat[m1][0]=100-T1; mat[m1][1]=F; mat[m1][2]=0; mat[m1][3]=F; mat[m2][0]=100-T2; mat[m2][1]=F; mat[m2][2]=0; mat[m1][3]=F; mat[B1][0]=F; mat[B1][1]=F; mat[B1][2]=F; mat[B1][3]=0; mat[B2][0]=F; mat[B2][1]=F; mat[B2][2]=F; mat[B2][3]=0; //transition values tran[S][m1]=0; tran[S][m2]=0; tran[S][M1]=0; tran[S][M2]=0; tran[S][B1]=0; tran[S][B2]=0; tran[M1][m1]= 0; tran[M1][m2]=-FO->p4; tran[M1][M1]=+FO->p2; tran[M1][M2]= F; tran[M1][S ]= F; tran[M1][B1]= 0; tran[M1][B2]=-FO->p1; tran[M2][m1]= F; tran[M2][m2]=+FO->p3; tran[M2][M1]= F; tran[M2][M2]= 0; tran[M2][S] = F; tran[M2][B1]= F; tran[M2][B2]= 0; tran[m1][m1]= 0; tran[m1][m2]= F; tran[m1][M1]= 0; tran[m1][M2]= F; tran[m1][S] = F; tran[m1][B1]= 0; tran[m1][B2]=-FO->p1; tran[m2][m1]= F; tran[m2][m2]= 0; tran[m2][M1]= -FO->p4; tran[m2][M2]= +FO->p3; tran[m2][S] = F; tran[m2][B1]= F; tran[m2][B2]= 0; tran[B1][m1]= 0; tran[B1][m2]= F; tran[B1][M1]= 0; tran[B1][M2]= F; tran[B1][S]= F; tran[B1][B1]= F; tran[B1][B2]= F; tran[B2][m1]= -FO->p1; tran[B2][m2]= 0; tran[B2][M1]= -FO->p1; tran[B2][M2]= 0; tran[B2][S]= F; tran[B2][B1]= F; tran[B2][B2]= F; translate=vcalloc (ns, sizeof (int)); translate[M1]=1; translate[m1]=1; translate[M2]=0; translate[m2]=0; translate[B1]=1; translate[B2]=0; for (a=1;a<=l; a++) { obs=s[a-1]; for (cs=0; cs=best){t[a][cs]=ps;best=p[a][cs]=c;} } } } for (a=0; a=best){tb=a;best=p[l][a];} } for (a=l; a>0; a--) { int v; int p2; p2=a-1; aln[0][p2]=aln[1][p2]=translate[tb]; tb=t[a][tb]; } free_int (p, -1); vfree(s); free_int (t, -1); free_int (mat, -1); free_int (tran, -1); vfree (translate); return aln; } char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *FO) { int a, b, c, d, l, id; int c1, c2, e0, e1,tb, obs; int T0, T1,T2; int **mat, **tran, **p, **t, **s; int ns, ps, cs; int S, M1, M2, m1, m2, K; int F=-9999999; int best; static int **smat; int *translate; if (!smat)smat=read_matrice ( "blosum62mt"); l=strlen (aln[0]); if ( l!=strlen (aln[1])) { printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings"); } s=declare_int (l+1, 2); for (id=0,b=0,a=0;a=2){id++; s[b][0]=1;} else {s[b][0]=0;} s[b][1]=a; b++; } } if (b==0) { free_int (s, -1); return aln; } FO->f=(FO->f==0)?30:FO->f; FO->p1=(FO->p1==0)?90:FO->p1; FO->p2=(FO->p2==0)?15:FO->p2; FO->p3=(FO->p3==0)?0:FO->p3; l=b;//length of the ungapped aln T1=100*(float)id/(float)b; T2=FO->f;//T1*f; //0: unaligned //1: aligned ns=0; S=ns++; M1=ns++;//1 matched aligned m1=ns++;//2 mmatched aligned M2=ns++;//3 matched unaligned m2=ns++;//4 mmatched unaligned mat=declare_int (ns, 2); tran=declare_int (ns, ns); p=declare_int (l+1, ns); t=declare_int (l+1, ns); mat[M1][0]=F; mat[M1][1]=T1; mat[M2][0]=F; mat[M2][1]=T2; mat[m1][0]=100-T1; mat[m1][1]=F; mat[m2][0]=100-T2; mat[m2][1]=F; tran[S][m1]=0; tran[S][m2]=0; tran[S][M1]=0; tran[S][M2]=0; tran[M1][m1]= 0; tran[M1][m2]=-FO->p1;// -P; tran[M1][M1]=+FO->p2; tran[M1][M2]= F; tran[M1][S] = F; tran[M2][m1]= F; tran[M2][m2]=+FO->p3; tran[M2][M1]= F; tran[M2][M2]= 0; tran[M2][S]= F; tran[m1][m1]= 0; tran[m1][m2]= F; tran[m1][M1]= 0; tran[m1][M2]= F; tran[m1][S]= F; tran[m2][m1]= F; tran[m2][m2]= 0; tran[m2][M1]=-FO->p1; tran[m2][M2]=+FO->p3; tran[m2][S]= F; translate=vcalloc (ns, sizeof (int)); translate[M1]=1; translate[m1]=1; translate[M2]=0; translate[m2]=0; translate[S]=1; for (a=1;a<=l; a++) { obs=s[a-1][0]; for (cs=0; cs=best){t[a][cs]=ps;best=p[a][cs]=c;} } } } for (a=0; a=best){tb=a;best=p[l][a];} } for (a=l; a>0; a--) { int p2=s[a-1][1]; aln[0][p2]=aln[1][p2]=translate[tb]; tb=t[a][tb]; } free_int (p, -1); free_int (s, -1); free_int (t, -1); free_int (mat, -1); free_int (tran, -1); vfree (translate); return aln; } float* analyze_overaln ( Alignment *iA, Alignment *iB, char *mode, int filter, int f, int p1,int p2, int p3) { Alignment *C, *D; Alignment *A, *B; OveralnP *F; F=vcalloc (1, sizeof (OveralnP)); F->p1=p1; F->p2=p2; F->p3=p3; F->f=f; F->t=filter; sprintf (F->mode, "%s", mode); float *r; A=copy_aln (iA, NULL); B=copy_aln (iB, NULL); C=aln2gap_cache (A,0); A=filter_aln_upper_lower (A, C, 0, 0); D=aln2clean_pw_aln (B, F); r=aln2pred (A,D,mode); free_aln (C); free_aln (D); free_aln (A); free_aln (B); return r; } float* aln2pred ( Alignment *A, Alignment*B, char *mode) { int a, b, c, d, i, l, salp, s, n; static char **list, *buf1, *buf2, *alp, *alp_lu; static int ***r; int T, N; int fp, fn, tn, tp; int tfp, tfn, ttn, ttp; float sp, sn, sen2, best, result; int print=1; float *fresult; fresult=vcalloc ( 3, sizeof (float)); if ( mode && strstr (mode, "case")) { A=aln2case_aln (A,"u","l"); B=aln2case_aln (B,"u","l"); } if (mode && strstr (mode, "printaln")) { Sequence *S; Alignment *C; S=aln2seq (A); C=copy_aln (B, NULL); for (a=0; anseq; a++) { i=name_is_in_list (C->name[a], S->name, S->nseq, 100); if ( i==-1) for (b=0; blen_aln; b++) C->seq_al[a][b]='-'; else for (d=0,b=0; blen_aln; b++) { if ( !is_gap (C->seq_al[a][b])) { if (C->seq_al[a][b]==S->seq[i][d])C->seq_al[a][b]=toupper(C->seq_al[a][b]); d++; } } } print_aln (C); } vfree (alp);vfree (alp_lu); alp=vcalloc ( 256, sizeof (char)); alp_lu=vcalloc ( 256, sizeof (char)); for (c=0; c<2; c++) { Alignment *AL; AL=(c==0)?A:B; for (salp=0,a=0; anseq; a++) { for (b=0; blen_aln; b++) { c=AL->seq_al[a][b]; if (!is_gap(c) && !alp[c]) { salp++; alp_lu[salp]=c; alp[c]=salp; } } } } vfree (buf1); vfree(buf2); buf1=vcalloc ( A->len_aln+1, sizeof (char)); buf2=vcalloc ( B->len_aln+1, sizeof (char)); free_arrayN ((void **)r, 3); r=declare_arrayN(3, sizeof (int),A->nseq,salp+1,salp+1); free_char ( list, -1); list=declare_char ( A->nseq, 100); for (n=0,a=0; a< A->nseq; a++) { for ( b=0; bnseq; b++) { if ( strm (A->name[a], B->name[b])) { sprintf ( buf1, "%s", A->seq_al[a]); sprintf ( buf2, "%s", B->seq_al[b]); ungap (buf1); ungap (buf2); if ((l=strlen (buf1))!=strlen (buf2))continue; else { sprintf ( list[n], "%s", A->name[a]); for (c=0; c%s S=%c sp=%6.2f sn=%6.2f sen2=%6.2f best=%6.2f\n", list[a],alp_lu[s],sp, sn, sen2, best); } rates2sensitivity (ttp, ttn, tfp, tfn, &sp, &sn, &sen2, &best); if (mode && strstr (mode, "printstat"))fprintf ( stdout, ">TOT S=%c sp=%6.2f sn=%6.2f re=%6.2f best=%6.2f\n", alp_lu[s],sp, sn, sen2, best); if ( mode && strstr (mode, type)) { fresult[0]=sn; fresult[1]=sp; fresult[2]=sen2; } } return fresult; } Alignment * mark_exon_boundaries (Alignment *A, Alignment *E) { char *buf, *buf2; int a, b, c, i, l; buf2=vcalloc ( E->len_aln+1, sizeof (char)); buf =vcalloc ( E->len_aln+1, sizeof (char)); for (a=0; a< A->nseq; a++) { i=name_is_in_list (A->name[a], E->name, E->nseq, 100); if ( i==-1) continue; sprintf (buf, "%s", E->seq_al[i]); ungap (buf); l=strlen (buf); //clean buf2 for (c=0, b=0; b=1)buf2[c-1]=tolower(buf2[c-1]); else if (buf[b]=='j' &&clen_aln; b++) { if (!is_gap(A->seq_al[a][b])) { A->seq_al[a][b]=buf2[c++]; } } } vfree (buf); vfree (buf2); return A; } /*********************************COPYRIGHT NOTICE**********************************/ /*© Centro de Regulacio Genomica */ /*and */ /*Cedric Notredame */ /*Tue Oct 27 10:12:26 WEST 2009. */ /*All rights reserved.*/ /*This file is part of T-COFFEE.*/ /**/ /* T-COFFEE is free software; you can redistribute it and/or modify*/ /* it under the terms of the GNU General Public License as published by*/ /* the Free Software Foundation; either version 2 of the License, or*/ /* (at your option) any later version.*/ /**/ /* T-COFFEE is distributed in the hope that it will be useful,*/ /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/ /* GNU General Public License for more details.*/ /**/ /* You should have received a copy of the GNU General Public License*/ /* along with Foobar; if not, write to the Free Software*/ /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/ /*............................................... |*/ /* If you need some more information*/ /* cedric.notredame@europe.com*/ /*............................................... |*/ /**/ /**/ /* */ /*********************************COPYRIGHT NOTICE**********************************/