7 #include "io_lib_header.h"
\r
8 #include "util_lib_header.h"
\r
9 #include "dp_lib_header.h"
\r
10 #include "define_header.h"
\r
/* Looks up the "#=GC SS_cons" row among the names of A and returns whatever
 * name_is_in_list reports for it. Callers in this file compare the result
 * with -1 to detect a missing row.
 */
12 int aln_has_stockholm_structure (Alignment *A)
\r
14 return name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100);
\r
/* Returns the result of aln_has_stockholm_structure (A). When the first
 * lookup yields -1, add_alifold2aln (A, NULL) is called first and the lookup
 * is then repeated.
 * NOTE(review): A is reassigned locally only; if add_alifold2aln hands back a
 * different pointer, the caller's Alignment pointer is not updated -- confirm
 * that this is safe.
 */
17 int get_aln_stockholm_structure (Alignment *A)
\r
20 if ((i=aln_has_stockholm_structure(A))==-1)
\r
21 A=add_alifold2aln (A, NULL);
\r
22 return aln_has_stockholm_structure(A);
\r
/* For a pair entry l[a] (the loop over a is on lines not shown here): when
 * neither column l[a][0] nor column l[a][1] holds a gap in row s of A,
 * fields 2 and 3 of the entry receive the pos[s] values of those two columns.
 * What is stored otherwise, and the return value, are not visible in this
 * excerpt.
 */
24 int ** update_RNAfold_list (Alignment *A, int **pos, int s, int **l)
\r
29 if (!is_gap(A->seq_al[s][l[a][0]]) && !is_gap (A->seq_al[s][l[a][1]]))
\r
31 l[a][2]=pos[s][l[a][0]];
\r
32 l[a][3]=pos[s][l[a][1]];
\r
43 Alignment *compare_RNA_fold ( Alignment *A, Alignment *B)
\r
49 int tot_ol=0, tot_l=0;
\r
51 i1=get_aln_stockholm_structure (A);
\r
52 i2=get_aln_stockholm_structure (B);
\r
54 l1=vienna2list (A->seq_al[i1]);
\r
55 l2=vienna2list (B->seq_al[i2]);
\r
57 pos1=aln2pos_simple(A, A->nseq);
\r
58 pos2=aln2pos_simple(B, B->nseq);
\r
62 for (a=0; a< A->nseq; a++)
\r
65 int ol=0, ll1=0, ll2=0;
\r
66 if ( A->name[a][0]=='#')continue;
\r
67 i=name_is_in_list (A->name[a], B->name, B->nseq, 100);
\r
70 l1=update_RNAfold_list (A,pos1,a, l1);
\r
71 l2=update_RNAfold_list (B,pos2,i, l2);
\r
72 lu=declare_char (A->len_aln, B->len_aln);
\r
78 if (l2[b][2]==-1 || l2[b][3]==-1);
\r
82 lu[l2[b][2]][l2[b][3]]=1;
\r
92 if (l1[b][2]==-1 || l1[b][3]==-1);
\r
96 if (lu[l1[b][2]][l1[b][3]]==1)
\r
98 A->seq_al[a][l1[b][0]]='6';
\r
99 A->seq_al[a][l1[b][1]]='6';
\r
104 A->seq_al[a][l1[b][0]]='0';
\r
105 A->seq_al[a][l1[b][1]]='0';
\r
111 free_char (lu, -1);
\r
116 fprintf ( stdout, "@@ Seq: %s Overalp: %.2f Al1: %.2f Al2: %.2f \n", A->name[a], (float)(ol*200)/(ll1+ll2), (float)(ol*100)/ll1,(float)(ol*100)/ll2);
\r
119 fprintf ( stdout, "@@ Seq: Tot Overalp: %.2f \n", (float)(tot_ol*200)/(tot_l));
\r
123 int is_neutral(char c1, char c2);
\r
124 int is_watson (char c1, char c2);
\r
125 int is_watson2 (char c1, char c2);
\r
/* Order-independent pairing test: returns 1 as soon as is_watson2 accepts
 * (c1, c2), otherwise returns the verdict of is_watson2 for (c2, c1).
 */
126 int is_watson (char c1, char c2)
\r
130 if ( is_watson2 (c1, c2)) return 1;
\r
131 else return is_watson2 (c2, c1);
\r
/* Ordered pairing test. On the lines shown, 'g' followed by 'c', or 'a'
 * followed by 't' or 'u', yields 1; only lower-case letters are compared.
 * The fall-through result is on a line not visible in this excerpt.
 */
133 int is_watson2 (char c1, char c2)
\r
136 if ( c1=='g' && c2=='c')return 1;
\r
137 else if (c1=='a' && (c2=='t' || c2=='u'))return 1;
\r
/* Returns 1 for every pair accepted by is_watson and, in addition, for the
 * g-t / g-u pair in either order. The fall-through result is on a line not
 * visible in this excerpt.
 */
140 int is_neutral (char c1, char c2)
\r
145 if (is_watson (c1, c2)) return 1;
\r
146 else if (c1=='g' && (c2=='t' || c2=='u'))return 1;
\r
147 else if ((c1=='t' || c1=='u') && c2=='g')return 1;
\r
151 int ** vienna2list ( char *seq)
\r
156 list=declare_int (l+1, 8);
\r
157 for (i=0,a=0; a<l; a++)
\r
162 for (i2=0,b=a+1; b<l && i2>=0; b++)
\r
164 if (seq[b]=='(')i2++;
\r
165 else if (seq[b]==')')i2--;
\r
/* Writes A in clustal format to a temporary file, runs the external
 * RNAalifold command on it with standard output redirected to a second
 * temporary file (standard error is discarded), and returns the result of
 * alifold2aln on that second file.
 * NOTE(review): the value returned by printf_system is not inspected on the
 * lines shown, so a failing command is only noticed, if at all, inside
 * alifold2aln.
 */
175 Alignment *aln2alifold(Alignment *A)
\r
180 tmp1=vtmpnam (NULL);
\r
181 tmp2=vtmpnam (NULL);
\r
182 output_clustal_aln (tmp1,A);
\r
183 printf_system ("RNAalifold %s >%s 2>/dev/null", tmp1, tmp2);
\r
184 return alifold2aln (tmp2);
\r
187 Alignment *add_alifold2aln (Alignment *A, Alignment *ST)
\r
190 int r1, rr1, r2, rr2;
\r
191 int watson, comp,tot;
\r
196 int ncomp=0, nwatson=0;
\r
197 int cons_l, fold_l;
\r
205 T=copy_aln (A, NULL);
\r
206 tmp1=vtmpnam (NULL);
\r
207 tmp2=vtmpnam (NULL);
\r
209 for (a=0; a<A->len_aln; a++)
\r
211 for (f=0,b=0; b<A->nseq && f==0; b++)
\r
213 if (is_gap (A->seq_al[b][a]))f=1;
\r
219 for (b=0; b<A->nseq; b++)T->seq_al[b][a]='-';
\r
222 ST=aln2alifold (T);
\r
226 //add or Replace the structure
\r
227 l=strlen (ST->seq_al[0]);
\r
229 if ( l!=A->len_aln)
\r
231 HERE ("\n%s\n%s\n", ST->seq_al[0], A->seq_al[0]);
\r
232 printf_exit ( EXIT_FAILURE, stderr, "ERROR the predicted structure and the multiple alignment do not have the same length [FATAL:%s]\n", PROGRAM);
\r
236 for (a=0; a< l; a++)if (ST->seq_al[0][a]==STOCKHOLM_CHAR)ST->seq_al[0][a]='.';
\r
237 if ((i=name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100))!=-1)
\r
239 sprintf (A->seq_al[i], "%s", ST->seq_al[0]);
\r
243 A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
\r
244 sprintf (A->name[A->nseq], "#=GC SS_cons");
\r
245 sprintf (A->seq_al[A->nseq], "%s", ST->seq_al[0]);
\r
250 Alignment * alifold2analyze (Alignment *A, Alignment *ST, char *mode)
\r
256 s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100);
\r
260 A=add_alifold2aln (A,ST);
\r
261 s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100);
\r
264 list=vienna2list (A->seq_al[s]);
\r
265 list=alifold_list2cov_list (A, list);
\r
267 usegap=0; //do not use gaped positions by default
\r
268 if (mode && strstr (mode, "usegap"))usegap=1;//count positions with gaps
\r
272 A=alifold2cov_stat (A, list,usegap);
\r
276 if ( strstr (mode, "stat")) A=alifold2cov_stat (A, list, usegap);
\r
277 if ( strstr (mode, "list")) A=alifold2cov_list (A, list, usegap);
\r
278 if ( strstr (mode, "aln")) A=alifold2cov_aln (A, list, usegap);
\r
279 if ( strstr (mode, "color") )
\r
282 C=copy_aln (A, NULL);
\r
283 C=alifold2cov_cache (C, list, usegap);
\r
284 A=alifold2cov_aln (A, list, usegap);
\r
285 if ( strstr ( mode, "ps"))
\r
286 output_color_ps (A, C, "stdout");
\r
288 output_color_html (A, C, "stdout");
\r
289 exit (EXIT_SUCCESS);
\r
296 int ** alifold_list2cov_list (Alignment *A, int **list)
\r
298 int a,b,c,d,p1,p2,s;
\r
299 int r1, rr1, r2, rr2;
\r
300 int neutral,watson, comp,tot, occupancy;
\r
305 int ncomp=0, nwatson=0, nneutral=0, ncomp_wc=0;
\r
306 int cons_l, fold_l;
\r
311 for (nseq=0,a=0; a< A->nseq; a++)if ( A->name[a][0]!='#')nseq++;
\r
312 max=((nseq*(nseq-1))/2);
\r
323 for (c=0; c<A->nseq-1; c++)
\r
325 if (A->name[c][0]=='#')continue;
\r
326 r1=tolower(A->seq_al[c][p1]);
\r
327 r2=tolower(A->seq_al[c][p2]);
\r
328 if (is_gap(r1) || is_gap(r2))continue;
\r
329 for (d=c+1; d<A->nseq; d++)
\r
331 if (A->name[d][0]=='#')continue;
\r
332 rr1=tolower(A->seq_al[d][p1]);
\r
333 rr2=tolower(A->seq_al[d][p2]);
\r
334 if (is_gap(rr1) || is_gap(rr2))continue;
\r
335 if (is_watson (r1, r2))watson++;
\r
336 if (is_watson (rr1, rr2))watson++;
\r
337 if (is_neutral (r1, r2))neutral++;
\r
338 if (is_neutral (rr1, rr2))neutral++;
\r
339 if (r1!=rr1 && r2!=rr2)comp++;
\r
344 watson=(watson*100)/(occupancy*2);
\r
345 comp=(comp*100)/occupancy;
\r
346 neutral=(neutral*100)/(occupancy*2);
\r
347 occupancy=(occupancy*100)/max;
\r
348 list[a][3]=neutral;
\r
351 list[a][6]=occupancy;
\r
353 if (list[a][3]<100)
\r
355 if (list[a][5]>0)list[a][7]='I';//compensated incompatible pair
\r
356 else list[a][7]='i'; //non compensated incompatible pair
\r
360 list[a][7]='N';//Neutral pair
\r
361 if (list[a][4]==100)
\r
363 list[a][7]='W';//Watson and Crick
\r
364 if ( list[a][5]>0)list[a][7]='C'; //Watson and crick compensated
\r
366 else if ( list[a][5]>0)
\r
368 list[a][7]='c';//compensated
\r
376 Alignment *alifold2cov_aln (Alignment *inA,int **list, int ug)
\r
382 A=copy_aln (inA, NULL);
\r
383 A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
\r
384 sprintf (A->name[A->nseq], "#=GC SS_analyze");
\r
385 sprintf (A->seq_al[A->nseq], "%s", A->seq_al[A->nseq-1]);
\r
390 if (list[a][6]<100 && !ug);
\r
394 A->seq_al[A->nseq-1][list[a][0]]=s;
\r
395 A->seq_al[A->nseq-1][list[a][1]]=s;
\r
401 Alignment *alifold2cov_stat (Alignment *A,int **list, int ug)
\r
403 int fold=0,watson=0, comp=0, compwc=0, incomp=0, neutral=0;
\r
411 if (list[a][6]<100 && !ug);
\r
415 watson +=(s=='W')?1:0;
\r
416 compwc +=(s=='C')?1:0;
\r
417 comp +=(s=='c')?1:0;
\r
418 neutral+=(s=='N')?1:0;
\r
419 incomp +=(s=='I')?1:0;
\r
423 fprintf ( stdout, "@@ TOT Nseq:%d tot_len: %d fold: %d neutral: %d watson: %d CorWC: %d cor: %d CorIncompatible: %d\n",A->nseq-1, A->len_aln,fold, neutral,watson, compwc,comp,incomp);
\r
426 Alignment *alifold2cov_cache (Alignment *inA, int **list, int ug)
\r
431 A=copy_aln (inA, NULL);
\r
436 if (list[a][6]<100 && !ug);
\r
440 if (s=='C')v=9; //red
\r
441 else if ( s=='c')v=7; //orange
\r
442 else if ( s=='W')v=5; //Yellow
\r
443 else if ( s=='N')v=2; //green
\r
444 else if ( s=='I')v=0; //blue;
\r
445 for (b=0;b<A->nseq; b++)
\r
447 if (A->name[b][0]=='#');
\r
450 for (c=0; c<2; c++)
\r
452 A->seq_al[b][list[a][c]]='0'+v;
\r
462 Alignment *alifold2cov_list (Alignment *A,int **list, int ug)
\r
470 if (list[a][6]<100 && !ug);
\r
473 fprintf ( stdout, "@@ WC Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
\r
474 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
\r
475 fprintf (stdout,"\n");
\r
479 fprintf ( stdout, "@@ Neural Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
\r
480 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
\r
481 fprintf (stdout,"\n");
\r
485 fprintf ( stdout, "@@ WC pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
\r
486 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
\r
487 fprintf (stdout,"\n");
\r
491 fprintf ( stdout, "@@ Neutral pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
\r
492 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
\r
493 fprintf (stdout,"\n");
\r
497 fprintf ( stdout, "@@ incompatible pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
\r
498 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
\r
499 fprintf (stdout,"\n");
\r
/* Keeps n columns of A chosen at random, each source column used at most
 * once. Every column index is stored next to a random key in pos, pos is
 * reordered by sort_int (presumably on the key in field 1 -- confirm against
 * sort_int), and the first n entries select which columns of the copy B are
 * written back into A. n==0, or n larger than A->len_aln, selects all
 * columns. Each row of A is then terminated at position n.
 * NOTE(review): the key uses rand()%(1000*A->len_aln); an alignment of
 * length 0 would make the divisor 0. No update of A->len_aln is visible on
 * the lines shown.
 */
508 Alignment *aln2sample (Alignment *A, int n)
\r
514 B=copy_aln (A, NULL);
\r
518 pos=declare_int (A->len_aln, 2);
\r
519 for (a=0; a<A->len_aln; a++){pos[a][0]=a;pos[a][1]=rand()%(1000*A->len_aln);}
\r
521 sort_int (pos, 2, 1, 0, A->len_aln-1);
\r
523 n=(n==0)?A->len_aln:(MIN (n, (A->len_aln)));
\r
524 for (a=0; a<n; a++)
\r
525 for (b=0; b<A->nseq; b++)
\r
526 A->seq_al[b][a]=B->seq_al[b][pos[a][0]];
\r
527 for (b=0; b<A->nseq; b++)
\r
528 A->seq_al[b][n]='\0';
\r
532 free_int (pos, -1);
\r
/* Fills the first n columns of A with columns drawn at random, with
 * repetition allowed, from the copy B taken before the rewrite. n==0 means
 * A->len_aln columns; any other n first passes A through
 * realloc_aln (A, n+1). Each row is terminated at position n.
 * NOTE(review): rand()%A->len_aln has a zero divisor when the alignment is
 * empty.
 */
535 Alignment *aln2bootstrap (Alignment *A, int n)
\r
540 if (n==0)n=A->len_aln;
\r
541 else A=realloc_aln (A, n+1);
\r
543 B=copy_aln (A, NULL);
\r
544 for (a=0; a<n; a++)
\r
546 p=rand ()%A->len_aln;
\r
547 for (b=0; b<A->nseq; b++)
\r
548 A->seq_al[b][a]=B->seq_al[b][p];
\r
550 for ( b=0; b<A->nseq; b++)A->seq_al[b][n]='\0';
\r
/* Randomises A according to the letters found in smode:
 *   "S"  A is passed through aln2scramble_seq;
 *   "C"  within every column, the non-gap residues are collected with a
 *        random key, reordered by sort_int and written back into the
 *        non-gap cells of that column;
 *   "R"  the same is done once over all non-gap cells of the alignment,
 *        visited column by column.
 * The gap pattern is left unchanged by the "C" and "R" passes, which only
 * overwrite non-gap cells. The string "SCR" is installed as smode on a path
 * whose condition is not visible here, and smode equal to "NO" returns A
 * untouched.
 */
559 Alignment * aln2random_aln (Alignment *A, char *smode)
\r
562 int a, b, n, **res;
\r
569 smode=vcalloc (4, sizeof (char));
\r
570 sprintf ( smode, "SCR");//Sequences, Column Residues
\r
572 else if ( strm (smode, "NO"))return A;
\r
577 if ( strstr ( smode, "S"))
\r
579 A=aln2scramble_seq (A);
\r
581 if ( strstr ( smode, "C"))
\r
584 res=declare_int (A->nseq, 2);
\r
585 for (a=0; a< A->len_aln; a++)
\r
587 for (n=0,b=0;b<A->nseq; b++)
\r
589 if ( !is_gap(A->seq_al[b][a]))
\r
591 res[n][0]=A->seq_al[b][a];
\r
/* NOTE(review): the only assignment to max visible in this excerpt is in the
   "R" section below -- confirm max is set to a non-zero value before this use. */
592 res[n][1]=rand()%max;
\r
595 sort_int (res, 2, 1, 0, n-1);
\r
597 for (n=0,b=0;b<A->nseq; b++)
\r
599 if ( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]=res[n++][0];
\r
/* NOTE(review): the second argument is -a here but -1 in the "R" section;
   confirm that -a is intended. */
602 free_int (res, -a);
\r
606 //Redistributes the residues randomly without changing the gap pattern
\r
607 if ( strstr ( smode, "R"))
\r
609 max=A->len_aln*A->nseq;
\r
610 res=declare_int (max, 2);
\r
612 for (n=0,a=0; a< A->len_aln; a++)
\r
614 for (b=0;b<A->nseq; b++)
\r
616 if ( !is_gap(A->seq_al[b][a]))
\r
618 res[n][0]=A->seq_al[b][a];
\r
619 res[n][1]=rand()%max;
\r
625 sort_int (res, 2, 1, 0, n-1);
\r
626 for (n=0,a=0; a< A->len_aln; a++)
\r
628 for (b=0;b<A->nseq; b++)
\r
630 if ( !is_gap(A->seq_al[b][a]))
\r
632 A->seq_al[b][a]=res[n++][0];
\r
638 free_int (res, -1);
\r
643 Alignment *score_aln2score_ascii_aln (Alignment *A, Alignment *C)
\r
645 //Convert the output of T-Coffee evaluate into a printable score_ascii alignment*/
\r
646 //A and C must be sorted
\r
647 //sets to 0 lone residues
\r
650 for (a=0; a<A->nseq; a++)
\r
651 for (b=0; b<A->len_aln; b++)
\r
654 int rC=C->seq_al[a][b];
\r
655 int rA=A->seq_al[a][b];
\r
656 if ( !strm (A->name[a], C->name[a])){HERE ("Unsorted aln in score_aln2score_ascii"); exit (EXIT_FAILURE);}
\r
658 if ( rA=='x' || rA=='X')C->seq_al[a][b]='9';
\r
659 else if ( rC >='0' && rC<='9');
\r
660 else if ( rC<10)C->seq_al[a][b]='0'+rC;
\r
661 else if ( rC==NO_COLOR_RESIDUE && !is_gap(rA)) C->seq_al[a][b]='0';
\r
662 else if ( rC==NO_COLOR_RESIDUE && is_gap(rA))C->seq_al[a][b]='-';
\r
/* Works on B, a copy of A. For every column, nr counts the non-gap cells of
 * A; each non-gap cell of B then becomes the character '0'+val when it is
 * the only residue of its column (nr==1) and '1' otherwise. Gap cells of B
 * are not modified on the lines shown.
 */
666 Alignment*aln2gap_cache (Alignment *A, int val)
\r
671 B=copy_aln (A, NULL);
\r
672 for (b=0; b<A->len_aln; b++)
\r
674 for (nr=0,a=0; a<A->nseq; a++)nr+=!is_gap (A->seq_al[a][b]);
\r
675 for (a=0; a<A->nseq; a++)if (!is_gap(A->seq_al[a][b]))B->seq_al[a][b]=(nr==1)?'0'+val:'1';
\r
680 Alignment* aln2case_aln (Alignment *B, char *upper, char *lower)
\r
682 int a, b, c, up, lo;
\r
685 A=copy_aln (B, NULL);
\r
687 up=(upper)?upper[0]:'u';
\r
688 lo=(lower)?lower[0]:'l';
\r
690 for (a=0; a<A->nseq; a++)
\r
691 for (b=0; b<A->len_aln; b++)
\r
696 else A->seq_al[a][b]=(isupper (c))?up:lo;
\r
700 Alignment *aln2scale (Alignment *A, char *coffset)
\r
707 if (coffset)offset=atoi(coffset);
\r
710 sprintf (s, "%d", A->len_aln+offset);
\r
713 A=realloc_aln2 (A, A->nseq+n, A->len_aln+1);
\r
714 s1=vcalloc ( n+1, sizeof (char));
\r
715 s2=vcalloc ( n+1, sizeof (char));
\r
717 for (a=0; a<n; a++)
\r
719 if (a==0)s2[a]='1';
\r
720 else strcat (s2, "0");
\r
721 sprintf (A->name[A->nseq+a], "%s", s2);
\r
724 for (a=0; a<A->len_aln; a++)
\r
726 sprintf (s1, "%d", a+1+offset);
\r
727 s2=invert_string (s1);
\r
730 for (b=0; b<=n; b++)
\r
735 A->seq_al[A->nseq+b][a]=v;
\r
/* Turns a 0/non-0 flag array of length len into the list of indices whose
 * flag is set. The list is allocated with len entries and nl[0] is advanced
 * once per stored index.
 * NOTE(review): counting starts from the current value of nl[0]; no reset is
 * visible on the lines shown -- confirm whether one exists or whether the
 * caller must zero it.
 */
746 int * pos2list (int * pos, int len, int *nl)
\r
751 list=vcalloc (len, sizeof (int));
\r
752 for (a=0; a<len; a++)if (pos[a])list[nl[0]++]=a;
\r
/* Inverse of an index list: allocates len entries with vcalloc and sets
 * entry list[a] to 1 for each of the nl listed indices. The indices are not
 * range-checked against len.
 */
755 int *list2pos (int *list, int nl, int len)
\r
758 pos=vcalloc (len, sizeof (int));
\r
759 for (a=0; a<nl; a++)pos[list[a]]=1;
\r
763 int **aln2resindex ( Alignment *A, Alignment *B, FILE *fp)
\r
769 list=vcalloc (A->nseq+((B)?B->nseq:0), sizeof (int));
\r
770 pos=aln2pos_simple_2 (A);
\r
774 for ( a=0; a<B->nseq; a++)
\r
776 list[a]=name_is_in_list(B->name[a], A->name, A->nseq, 100);
\r
781 for ( a=0; a<A->nseq; a++)
\r
787 fprintf ( fp, "#");
\r
788 for ( b=0; b<n; b++)
\r
791 if ( s!=-1)fprintf (fp, " %s",A->name[s]);
\r
793 fprintf (fp, "\n");
\r
795 for ( a=0; a<A->len_aln; a++)
\r
797 for ( b=0; b<n; b++)
\r
801 else if (pos[s][a]<0)
\r
802 fprintf (fp, "%4d", -1);
\r
804 fprintf (fp, "%4d", pos[s][a]);
\r
806 fprintf (fp, "\n");
\r
811 int **index_seq_res ( Sequence *S1, Sequence *S2, int **name_index)
\r
813 /*Index the residues of S1 according to S2
\r
814 index[seq1 of S1][z]->x, where x is the position of residue z of seq1/S1 in S2->seq[index[Seq1/S1]]
\r
818 char *seq1=NULL, *seq2=NULL;
\r
819 Alignment *Profile;
\r
821 index=vcalloc ( S1->nseq, sizeof (int*));
\r
823 for (a=0; a< S1->nseq; a++)
\r
825 int len1, len2, b, c;
\r
829 if (name_index[a][0]==-1)
\r
831 else if (name_index[a][1]==-1)
\r
833 seq2=S2->seq[name_index[a][0]];
\r
835 else if ((Profile=seq2R_template_profile (S2, name_index[a][0])) !=NULL)
\r
837 seq2=Profile->seq_al[name_index[a][1]];
\r
840 len1=strlen (seq1);len2=strlen (seq2);
\r
841 index[a]=vcalloc (len2, sizeof(int));
\r
844 for (c=0,b=0; b<len2; b++)if( !is_gap(seq2[b]))index[a][c++]=b;
\r
845 //index[a]=get_res_index ( seq1, seq2);
\r
850 int **index_seq_name ( Sequence *S1, Sequence *S2)
\r
852 /*Index the names of S1 according to S2
\r
853 index[seq1 of S1][0]->x if seq1 is the xth sequence of S2
\r
854 ->-1 if seq1 is nowhere to be found
\r
855 index[seq1 of S1][1]->z if seq1 is the zth sequence within the xth profile of S2
\r
859 Alignment *Profile;
\r
860 index=declare_int (S1->nseq, 2);
\r
863 for ( a=0; a<S1->nseq; a++)
\r
865 index[a][0]=index[a][1]=-1;
\r
866 x=name_is_in_list (S1->name[a],S2->name,S2->nseq,100);
\r
867 if ( x!=-1){index[a][0]=x;index[a][1]=-1;}
\r
868 for ( b=0; b<S2->nseq; b++)
\r
870 if ((Profile=seq2R_template_profile (S2,b)))
\r
872 z=name_is_in_list (S1->name[a],Profile->name,Profile->nseq,100);
\r
873 if ( z!=-1){index[a][0]=b;index[a][1]=z;b=S2->nseq;}
\r
/* Allocates an array of n1 entries; entry a receives the value reported by
 * name_is_in_list for l1[a] searched in the n2 names of l2.
 */
883 int *get_name_index (char **l1, int n1, char **l2, int n2)
\r
887 /*return Array[Index_L1]=Index_L2 */
\r
888 r=vcalloc ( n1, sizeof (int));
\r
889 for ( a=0; a< n1; a++)
\r
890 r[a]=name_is_in_list (l1[a],l2,n2,100);
\r
894 int* get_res_index (char *seq0, char *seq1)
\r
898 if ( !seq0 || !seq1) return NULL;
\r
901 coor=vcalloc ( strlen (seq0)+1, sizeof (int));
\r
902 if (!strm (seq0, seq1))
\r
904 int r0, r1 , isr0, isr1;
\r
907 A=align_two_sequences (seq0,seq1,"pam250mt",-5,-1, "myers_miller_pair_wise");
\r
909 for ( a=0; a< A->len_aln; a++)
\r
911 r0=A->seq_al[0][a];r1=A->seq_al[1][a];
\r
916 if (isr0 && isr1)coor[l0-1]=l1-1;
\r
917 else if (isr0) coor[l0-1]=-1;
\r
926 for ( a=0;a< l0; a++)
\r
933 int change_residue_coordinate ( char *in_seq1, char *in_seq2, int v)
\r
935 /*Expresses the coordinate of a residue in seq1, in the coordinate system of seq2*/
\r
938 static char *seq1, *seq2;
\r
942 if ( seq1 !=in_seq1 || seq2 !=in_seq2)
\r
944 int r0, r1 , isr0, isr1;
\r
950 seq1=in_seq1, seq2=in_seq2;
\r
951 A=align_two_sequences (seq1,seq2,"pam250mt", -14, -2, "myers_miller_pair_wise");
\r
953 coor=vcalloc ( A->len_aln, sizeof (int));
\r
954 for ( a=0; a< A->len_aln; a++)
\r
956 r0=A->seq_al[0][a];r1=A->seq_al[1][a];
\r
963 if (isr0 && isr1)coor[l0-1]=l1-1;
\r
964 else if (isr0) coor[l0-1]=-1;
\r
/* Builds a table of nseq rows and 3 fields. Fields 0 and 1 copy the same
 * fields of coor; field 2 of every row receives the single value
 * return_min_int (coor, nseq, 2). S is not used on the lines shown.
 */
972 int ** minimise_repeat_coor (int **coor, int nseq, Sequence *S)
\r
976 new_coor=declare_int ( nseq, 3);
\r
977 min=return_min_int (coor, nseq, 2);
\r
978 for ( a=0; a< nseq; a++)
\r
980 new_coor[a][0]=coor[a][0];
\r
981 new_coor[a][1]=coor[a][1];
\r
982 new_coor[a][2]=min;
\r
986 int ** get_nol_seq ( Constraint_list *CL, int **coor, int nseq, Sequence *S)
\r
988 int a, s, p, l, nl;
\r
992 new_coor=declare_int ( nseq+1, 3);
\r
995 buf=get_undefined_list ( CL);
\r
999 for ( a=0; a< nseq; a++)buf[coor[a][0]][coor[a][1]]=1;
\r
1002 for ( a=0; a< nseq; a++)
\r
1006 l=strlen(S->seq[s]);
\r
1008 while ( p<=l && !buf[s][p++])nl++;
\r
1010 new_coor[a][1]=coor[a][1];
\r
1011 new_coor[a][2]=nl;
\r
1013 free_int ( buf, -1);
\r
1019 int compare_pos_column( int **pos1,int p1, int **pos2,int p2, int nseq)
\r
1026 for ( a=0; a< nseq; a++)
\r
1032 if (v1>0 || v2>0)
\r
1034 if ( v1!=v2)return 0;
\r
/* Alphabet of the sequences of S: forwards S->seq and S->nseq to
 * array2alphabet with an empty list of forbidden characters.
 */
1042 char *seq2alphabet (Sequence *S)
\r
1044 return array2alphabet (S->seq, S->nseq, "");
\r
/* Alphabet of the aligned rows of A: forwards A->seq_al and A->nseq to
 * array2alphabet with an empty list of forbidden characters, so gap
 * characters present in the rows are not filtered out here.
 */
1047 char *aln2alphabet (Alignment *A)
\r
1049 return array2alphabet (A->seq_al, A->nseq, "");
\r
/* Collects the distinct characters used by the n strings of array. Each
 * character is folded with tolower and counted in a 256-entry table; the
 * table is then scanned in increasing code order and every counted character
 * that does not occur in forbiden is appended to a 257-byte, zero-filled
 * buffer.
 * NOTE(review): array[a][b] is a plain char; where char is signed, a byte
 * above 127 is negative, which is outside the domain of tolower and outside
 * the table -- confirm inputs are 7-bit or cast to unsigned char.
 */
1052 char *array2alphabet (char **array, int n, char *forbiden)
\r
1058 hasch=vcalloc (256, sizeof (int));
\r
1059 alphabet=vcalloc ( 257, sizeof (char));
\r
1062 for ( a=0; a<n; a++)
\r
1064 l=strlen (array[a]);
\r
1065 for ( b=0; b<l; b++)
\r
1066 hasch[tolower(array[a][b])]++;
\r
1069 for ( a=0, b=0; a< 256; a++)
\r
1071 if (hasch[a] && !strrchr(forbiden,a))alphabet[b++]=a;
\r
1080 //***************************************************************
\r
1083 //***************************************************************
\r
1085 char* alnpos2hmmtop_pred (Alignment *A,Alignment *Pred, int pos, int mode)
\r
1087 static char *result;
\r
1088 static Alignment *Cache;
\r
1089 static int *score;
\r
1094 score=vcalloc (256, sizeof (int));
\r
1095 result=vcalloc (100, sizeof (char));
\r
1098 if (!Pred && !Cache)
\r
1100 Cache=aln2hmmtop_pred (A);
\r
1102 if (!Pred) Pred=Cache;
\r
1105 for (tot=0,a=0; a<A->nseq; a++)
\r
1108 s=Pred->seq_al[a][pos];
\r
1111 score[tolower(s)]++;
\r
1116 if ( score['h']>score['i'] && score['h']>score['o'])cons='h';
\r
1118 else if ( score['i']>score['o'])cons='i';
\r
1120 if (tot==0) return "";
\r
1123 if (mode==VERBOSE)sprintf (result, " H: %3d I: %3d O: %3d P: %c", (score['h']*100)/tot, (score['i']*100)/tot, (score['o']*100)/tot, cons);
\r
1124 else if (mode == SHORT)sprintf ( result, "%c", cons);
\r
1125 score['h']=score['o']=score['i']=0;
\r
/* Produces PA, a copy of A in which residues are replaced by a per-residue
 * prediction. For each row, the aligned string is copied into buf, buf is
 * handed to seq2tmstruc, and the returned string is consumed one character
 * per non-gap cell of the row.
 * NOTE(review): pred is indexed by residue count, so it is assumed to hold
 * at least one character per residue; any ungapping of buf and the release
 * of pred and buf would be on lines not visible here.
 */
1130 Alignment * aln2hmmtop_pred (Alignment *A)
\r
1136 PA=copy_aln (A, NULL);
\r
1137 buf=vcalloc ( A->len_aln+1, sizeof (char));
\r
1139 for ( a=0; a< A->nseq; a++)
\r
1141 sprintf (buf, "%s", A->seq_al[a]);
\r
1142 pred=seq2tmstruc (buf);
\r
1143 for (c=0,b=0; b<A->len_aln; b++)
\r
1145 if (!is_gap (PA->seq_al[a][b]))PA->seq_al[a][b]=pred[c++];
\r
/* Runs the external script fasta_seq2hmmtop_fasta.pl on one sequence. seq is
 * written as a single FASTA record named "seq1" to a temporary file, the
 * script is invoked with the hmmtop.arch and hmmtop.psv files located under
 * get_mcoffee_4_tcoffee(), and the first sequence of the script's output
 * file is copied into a freshly allocated buffer.
 * NOTE(review): the closing of fp and the return statement are on lines not
 * shown; the result of printf_system is not inspected on the visible lines.
 */
1153 char * seq2tmstruc ( char *seq)
\r
1155 static Sequence *S;
\r
1156 char *seqfile, *predfile, *buf;
\r
1159 seqfile=vtmpnam (NULL);
\r
1160 predfile=vtmpnam (NULL);
\r
1162 fp=vfopen (seqfile, "w");
\r
1163 fprintf ( fp, ">seq1\n%s", seq);
\r
1167 printf_system ( "fasta_seq2hmmtop_fasta.pl -in=%s -out=%s -arch=%s/%s -psv=%s/%s", seqfile, predfile, get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv");
\r
1168 S=get_fasta_sequence (predfile, NULL);
\r
1169 buf=vcalloc ( strlen (S->seq[0])+1, sizeof (char));
\r
1170 sprintf ( buf, "%s", S->seq[0]);
\r
1172 free_sequence (S, S->nseq);
\r
/* Installs the default BLAST settings through set_string_variable and
 * set_int_variable. The three string settings take the value of the
 * matching *_4_TCOFFEE environment variable when it is set, and otherwise
 * "EBI", "pdb" and "uniprot". The similarity and coverage bounds for both
 * the prot and pdb families are set to the full range 0..100.
 * The return value is on a line not visible in this excerpt.
 */
1177 char * set_blast_default_values()
\r
1179 set_string_variable ("blast_server", (getenv ("blast_server_4_TCOFFEE"))?getenv ("blast_server_4_TCOFFEE"):"EBI");
\r
1180 set_string_variable ("pdb_db", (getenv ("pdb_db_4_TCOFFEE"))?getenv ("pdb_db_4_TCOFFEE"):"pdb");
\r
1181 set_string_variable ("prot_db", (getenv ("prot_db_4_TCOFFEE"))?getenv ("prot_db_4_TCOFFEE"):"uniprot");
\r
1182 set_int_variable ("prot_min_sim", 0);
\r
1183 set_int_variable ("prot_max_sim", 100);
\r
1185 set_int_variable ("prot_min_cov", 0);
\r
1186 set_int_variable ("prot_max_cov", 100);
\r
1188 set_int_variable ("pdb_min_sim", 0);
\r
1189 set_int_variable ("pdb_max_sim", 100);
\r
1190 set_int_variable ("pdb_min_cov", 0);
\r
1191 set_int_variable ("pdb_max_cov", 100);
\r
/* Resets the BLAST defaults, passes S through seq2template_seq with the
 * "PDB" template mode, and returns seq2P_pdb_id for sequence 0 only.
 */
1196 char * seq2pdb (Sequence *S)
\r
1198 set_blast_default_values();
\r
1200 S=seq2template_seq (S, "PDB", NULL);
\r
1201 return seq2P_pdb_id(S,0);
\r
1204 Alignment * seq2blast ( Sequence *S)
\r
1207 set_blast_default_values();
\r
1211 S=seq2template_seq (S, "BLAST", NULL);
\r
1212 A=seq2R_template_profile(S,0);
\r
1213 sprintf ( A->name[0], "%s", S->name[0]);
\r
1218 for (a=0; a< S->nseq; a++)
\r
1222 NS=fill_sequence_struc(1, &(S->seq[a]), &(S->name[a]));
\r
1223 NS=seq2template_seq (NS, "BLAST", NULL);
\r
1224 A=seq2R_template_profile(NS,0);
\r
1225 sprintf ( name, "%s.prf", S->name[a]);
\r
1227 output_fasta_aln (name,A);
\r
1228 fprintf (stdout, "\nOUTPUT %s\n", name);
\r
1230 exit (EXIT_SUCCESS);
\r
/* Runs name_list2unique_name_list on the names of S and, when it returns a
 * non-zero value a, emits a warning that refers to entry a-1.
 * NOTE(review): the warning is formatted after name_list2unique_name_list
 * has returned; if that call rewrote the names, the message may show the
 * new name rather than the duplicated one -- confirm.
 */
1238 Sequence * seq2unique_name_seq ( Sequence *S)
\r
1241 if ((a=name_list2unique_name_list (S->nseq, S->name)))
\r
1243 add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]);
\r
/* Alignment counterpart of seq2unique_name_seq: runs
 * name_list2unique_name_list on the row names and, when it returns a
 * non-zero value a, emits a warning that refers to entry a-1.
 * NOTE(review): the warning is formatted after name_list2unique_name_list
 * has returned; if that call rewrote the names, the message may show the
 * new name rather than the duplicated one -- confirm.
 */
1247 Alignment * aln2unique_name_aln ( Alignment *S)
\r
1250 if ((a=name_list2unique_name_list (S->nseq, S->name)))
\r
1252 add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]);
\r
/* Detects a repeated name and rewrites the list through an external script.
 * The pairwise scan stops at the first pair of equal names and records
 * duplicate = (index of the first member of the pair) + 1. On the rewrite
 * path, every name is written as a FASTA header with a dummy sequence, the
 * script fasta_aln2fasta_aln_unique_name.pl is run on that file, and each
 * name[a] is replaced by the a-th name read back from the script's output.
 * The condition guarding the rewrite and the return statement are on lines
 * not visible in this excerpt.
 */
1258 int name_list2unique_name_list (int n, char **name)
\r
1263 for (a=0; a<n-1; a++)
\r
1264 for (b=a+1; b<n; b++)
\r
1266 if ( strm (name[a], name[b]))
\r
1267 {duplicate=a+1;b=a=n;}
\r
1272 char *tmp1, *tmp2;
\r
1276 tmp1=vtmpnam (NULL);
\r
1277 tmp2=vtmpnam (NULL);
\r
1278 fp=vfopen (tmp1, "w");
\r
1279 for (a=0; a< n; a++)fprintf ( fp, ">%s\naggggg\n", name[a]);
\r
1281 printf_system ("fasta_aln2fasta_aln_unique_name.pl %s > %s", tmp1, tmp2);
\r
1282 S=get_fasta_sequence (tmp2, NULL);
\r
1283 for (a=0; a<n; a++)
\r
/* NOTE(review): the buffer holds characters but is sized with sizeof (int);
   this over-allocates rather than under-allocates -- sizeof (char) looks
   like what was meant. */
1285 name[a]=vrealloc (name [a], sizeof (int)*(strlen (S->name[a])+1));
\r
1286 sprintf ( name[a], "%s", S->name [a]);
\r
1288 free_sequence(S, -1);
\r
/* Filters every sequence of S in place and refreshes S->len.
 * With alp==NULL, a character that belongs to neither AA_ALPHABET nor
 * DNA_ALPHABET is dropped. With alp given, a character that occurs in alp
 * is dropped, i.e. alp lists the characters to remove. All other characters
 * are kept in their original order.
 * The assignment of c from the sequence is on a line not visible here.
 */
1293 Sequence* seq2clean_seq (Sequence *S, char *alp)
\r
1295 int a, b, c, d, l;
\r
1297 for (a=0; a< S->nseq; a++)
\r
1299 l=strlen (S->seq[a]);
\r
1300 for (d=0,b=0; b<l; b++)
\r
1303 if ( alp==NULL && !strchr (AA_ALPHABET, c) && !strchr (DNA_ALPHABET, c));
\r
1304 else if (alp && strchr (alp, c));
\r
1305 else S->seq[a][d++]=c;
\r
1307 S->seq[a][d]='\0';
\r
1308 S->len[a]=strlen (S->seq[a]);
\r
1312 int ** seq2aln_pos (Alignment *A, int *ns, int **l_s)
\r
1315 int a, b,c, d,l, p , g;
\r
1318 l=MAX(strlen (A->seq_al[l_s[0][0]]), strlen (A->seq_al[l_s[1][0]]));
\r
1319 code=declare_int ((A->S)->nseq,l+1);
\r
1321 for (c=0; c<2; c++)
\r
1323 l=strlen (A->seq_al[l_s[c][0]]);
\r
1324 for (d=0; d<ns[c]; d++)
\r
1326 a=A->order[l_s[c][d]][0];
\r
1327 for (p=0, b=0; b<l; b++)
\r
1329 g=is_gap (A->seq_al[l_s[c][d]][b]);
\r
1330 if (!g){p++; code[a][p]=b+1;}
\r
1337 Alignment *local_maln2global_maln (char *seq, Alignment *A)
\r
1339 /*inputs a BLAST alignment where the master sequence may be partial
\r
1340 outputs the same alignment, while making sure the profile is perfectly in sync with its master sequence
\r
1344 int start, end, rend;
\r
1345 char qname[100], *p;
\r
1346 Alignment *B=NULL;
\r
1348 sprintf ( qname, "%s", A->name[0]);
\r
1349 p=strtok (qname, "_");
\r
1350 if ( !strm (p, "QUERY"))
\r
1352 fprintf ( stderr, "\nUnappropriate format for the alignment [%s:FATAL]", PROGRAM);
\r
1353 myexit (EXIT_FAILURE);
\r
1356 start=atoi(strtok (NULL, "_"));
\r
1357 end=atoi(strtok (NULL, "_"));
\r
1358 rend=strlen (seq);
\r
1360 B=copy_aln (A,NULL);
\r
1361 if ( start>1 || end<rend )A=realloc_aln (A,rend+1);
\r
1363 for (a=0; a<start-1; a++)
\r
1365 A->seq_al[0][a]=seq[a];
\r
1366 for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-';
\r
1369 for (c=0,a=start-1; a< end; a++, c++)
\r
1371 A->seq_al[0][a]=seq[a];
\r
1372 for ( b=1; b< A->nseq; b++)
\r
1374 A->seq_al[b][a]=B->seq_al[b][c];
\r
1377 for ( a=end; a<rend; a++)
\r
1379 A->seq_al[0][a]=seq[a];
\r
1380 for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-';
\r
1382 for ( a=0; a< A->nseq; a++) A->seq_al[a][rend]='\0';
\r
/* Builds one seq2inv_pos table per aligned row of A and stores the A->nseq
 * resulting pointers in a newly allocated array.
 * NOTE(review): the array holds int* values but is sized with
 * sizeof (char*); the two sizes normally coincide, but sizeof (int*) would
 * state the intent.
 */
1389 int ** aln2inv_pos ( Alignment *A)
\r
1392 pos=vcalloc (A->nseq, sizeof (char*));
\r
1393 for (a=0; a< A->nseq; a++)pos[a]=seq2inv_pos (A->seq_al[a]);
\r
/* Maps residue rank to column for one aligned string. A first pass counts
 * the non-gap characters, a table with one extra entry is allocated, and a
 * second pass stores at pos[k] the 1-based column of the k-th residue, with
 * k starting at 1. Entry 0 keeps the zero written by vcalloc. The length l1
 * is set on a line not visible in this excerpt.
 */
1396 int * seq2inv_pos ( char *seq)
\r
1398 /*returns a list where each value gives the index of the corresponding residue in seq*/
\r
1399 /*Numbering: 1 to L : Analogy to the aln2pos*/
\r
1405 for ( l2=a=0; a< l1; a++)l2+=1-is_gap(seq[a]);
\r
1406 pos=vcalloc (l2+1, sizeof (int));
\r
1407 for ( l2=a=0; a< l1; a++)if (!is_gap(seq[a]))pos[++l2]=a+1;
\r
/* Obtains the position matrix of A from aln2pos_simple and duplicates it
 * with duplicate_int, using A->nseq rows and the row length reported by
 * read_size_int for the first row. aln2pos_simple is then called again with
 * (NULL, 0).
 * NOTE(review): that second call presumably releases or resets a buffer
 * kept inside aln2pos_simple -- confirm against its definition. The return
 * statement is on a line not visible here.
 */
1412 int ** aln2pos_simple_2 (Alignment *A)
\r
1416 pos1=aln2pos_simple (A, A->nseq);
\r
1417 pos2=duplicate_int (pos1, A->nseq,read_size_int (pos1[0],sizeof (int)));
\r
1418 pos1=aln2pos_simple (NULL, 0);
\r
1421 int ** aln2pos_simple (Alignment *A, int n_nseq, ...)
\r
1424 function documentation: start
\r
1425 int ** aln2pos_simple (Alignment *A, int n_nseq, ...)
\r
1427 ####with two parameter only: Alignment *A, int n_nseq
\r
1429 this function turns A into pos, a matrix where each residue is replaced by its index according to the complete sequence.
\r
1430 the indices in pos are computed using A->order[x][1], which contains the index of the first residue of seq x of A
\r
1432 n_nseq MUST not be null
\r
1434 ####with more than two param:
\r
1435 int ** aln2pos_simple (Alignment *A, int n_nseq, int *ns, int **ls)
\r
1436 n_nseq must be set to 0 for the param 3 and four to be read
\r
1438 ns[x]=number seq in group
\r
1439 ls[x]=list of the sequences in group x ( size=ns[x])
\r
1441 The computation of the indices is only carried out on the specified residues
\r
1444 in pos, the numbering of the residues goes from 1 to L:
\r
1445 pos[0][0]=3, means that the first position of the first sequence
\r
1446 in the alignment contains residue #3 from sequence A->order[0][0];
\r
1448 function documentation: end
\r
1451 int a, b,c, p, g,l;
\r
1474 list=vcalloc(n_nseq, sizeof (int));
\r
1475 for ( a=0; a< n_nseq; a++)list[a]=a;
\r
1479 va_start (ap, n_nseq);
\r
1480 ns=va_arg(ap, int * );
\r
1481 ls=va_arg(ap, int **);
\r
1483 list=vcalloc ( ns[0]+ns[1], sizeof (int));
\r
1485 for ( a=0; a< ns[0]; a++)list[n_nseq++]=ls[0][a];
\r
1486 for ( a=0; a< ns[1]; a++)list[n_nseq++]=ls[1][a];
\r
1489 max_nseq=MAX(read_size_int(A->order,sizeof (int*)),return_max_int (A->order, read_size_int(A->order,sizeof (int*)),0))+1;
\r
1490 n_len=get_longest_string ( A->seq_al,A->max_n_seq, NULL, NULL)+1;
\r
1493 T=declare_int (max_nseq, n_len);
\r
1494 for ( c=0; c< n_nseq; c++)
\r
1497 l=strlen ( A->seq_al[a]);
\r
1499 for ( p=A->order[a][1],b=0; b<l; b++)
\r
1501 g=1-is_gap(A->seq_al[a][b]);
\r
1503 T[a][b]=(g==1)?p:-(1+p);
\r
1504 if ( A->seq_al[a][b]==UNDEFINED_RESIDUE)T[a][b]=0;
\r
1505 if ( A->seq_cache && T[a][b]>0)T[a][b]=A->seq_cache[A->order[a][0]][T[a][b]];
\r
1513 Alignment ** split_seq_in_aln_list ( Alignment **aln, Sequence *S, int n_seq, char **seq_list)
\r
1516 char * long_seq=NULL;
\r
1518 int **translation;
\r
1524 if ( aln==NULL)return NULL;
\r
1525 translation=declare_int ( S->nseq,2);
\r
1527 for (len=0,a=0; a< S->nseq; a++)
\r
1529 if((b=name_is_in_list (S->name[a],seq_list, n_seq, 100))!=-1)
\r
1531 l=strlen(S->seq[a])+1;
\r
1532 long_seq=vrealloc(long_seq,(len+l+1)*sizeof(char));
\r
1533 long_seq=strcat(long_seq, S->seq[a]);
\r
1534 long_seq=strcat(long_seq, "*");
\r
1536 translation[a][0]=b;
\r
1537 translation[a][1]=len;
\r
1540 else translation[a][0]=-1;
\r
1543 long_seq[len-1]='\0';
\r
1546 table=declare_int ( len+1, 2);
\r
1548 for ( b=0,a=0; a< S->nseq; a++)
\r
1550 if ( translation[a][0]!=-1)
\r
1553 while (long_seq[b]!='\0' && long_seq[b]!='*')
\r
1555 table[b+1][1]=c++;
\r
1556 table[b+1][0]=translation[a][0];
\r
1560 table[b][0]=translation[a][0];
\r
1565 for ( a=0; a< (aln[-1])->nseq; a++)
\r
1567 for ( b=0; b< (aln[a])->nseq; b++)
\r
1570 (aln[a])->order[b][0]=table[(aln[a])->order[b][1]][0];
\r
1571 (aln[a])->order[b][1]=table[(aln[a])->order[b][1]][1];
\r
1572 sprintf ( (aln[a])->name[b],"%s_%d_%d", S->name[(aln[a])->order[b][0]],a+1,b+1);
\r
1575 free_int (translation, -1);
\r
1576 free_int (table, -1);
\r
1582 Sequence * fill_sequence_struc ( int nseq, char **sequences, char **seq_name)
\r
1586 int shortest, longuest;
\r
1590 shortest=longuest=0;
\r
1594 shortest=get_shortest_string( sequences, nseq, NULL, NULL);
\r
1595 longuest=get_longest_string (sequences, nseq, NULL, NULL);
\r
1597 else if ( nseq==1)
\r
1599 shortest=longuest=strlen (sequences[0]);
\r
1607 S=declare_sequence (shortest, longuest,nseq);
\r
1610 if (sequences)S->seq=copy_char ( sequences, S->seq, nseq, -1);
\r
1611 else S->seq=declare_char (S->nseq, 1);
\r
1613 S->name=copy_char ( seq_name, S->name,nseq, -1);
\r
1615 ungap_array (S->seq,nseq);
\r
1616 for ( a=0; a< S->nseq; a++)S->len[a]=strlen(S->seq[a]);
\r
1621 Alignment * thread_profile_files2aln (Alignment *A, char *template_file, Fname *F)
\r
1627 if (!A->S)A->S=aln2seq (A);
\r
1628 if (template_file)A->S=seq2template_seq (A->S, template_file,F);
\r
1629 for ( a=0; a< A->nseq; a++)
\r
1631 P=seq2R_template_profile (A->S, a);
\r
1635 sprintf ( P->name[0], "%s", A->name[a]);
\r
1639 return expand_aln (A);
\r
1645 Alignment * expand_aln (Alignment *A)
\r
1647 /*This function expands the profiles within an alignment*/
\r
1651 Alignment *MAIN=NULL, *SUB=NULL;
\r
1655 Alignment *Profile;
\r
1661 list=vcalloc (A->nseq, sizeof (int));
\r
1662 for ( a=0; a< A->nseq; a++)
\r
1664 Profile=seq2R_template_profile (A->S, A->order[a][0]);
\r
1665 if (Profile && Profile->expand)
\r
1667 new_nseq+=Profile->nseq;
\r
1672 list[n_sub_seq++]=a;
\r
1676 if ( n_sub_seq==A->nseq){vfree(list);return A;}
\r
1677 else if (n_sub_seq==0){MAIN=copy_aln (A, MAIN);MAIN->nseq=0;}
\r
1680 MAIN=extract_sub_aln (A, n_sub_seq, list);
\r
1685 for ( a=0; a< A->nseq; a++)
\r
1687 Profile=seq2R_template_profile (A->S, A->order[a][0]);
\r
1688 if ( Profile && Profile->expand)
\r
1690 SUB=copy_aln (Profile,SUB);
\r
1691 SUB=realloc_aln2(SUB, SUB->nseq, A->len_aln+1);
\r
1693 for ( e=0,b=0; b< A->len_aln; b++)
\r
1695 if ( is_gap(A->seq_al[a][b]))
\r
1696 {for (d=0; d< SUB->nseq; d++)SUB->seq_al[d][b]='-';}
\r
1699 for(d=0; d<SUB->nseq; d++)SUB->seq_al[d][b]=Profile->seq_al[d][e];
\r
1704 MAIN=stack_aln(MAIN, SUB);
\r
/* expand_number_aln: counterpart of expand_aln for EA, an alignment that is
 * indexed in parallel with A (TODO confirm what EA holds; the name suggests
 * numeric/colour codes). Returns EA unchanged when either argument is NULL or
 * when every row is listed; aborts through myexit() when EA has fewer rows
 * than A. Expanded cells receive either EA's own symbol for that column or
 * NO_COLOR_RESIDUE; score_seq is propagated to every expanded row and the
 * alignment score of EA is copied onto the result.
 * NOTE(review): the row index a runs over EA->nseq but is also used to index
 * A->seq_al; only EA->nseq<A->nseq is rejected above, so EA->nseq>A->nseq
 * looks like an out-of-range read - confirm.
 * NOTE(review): the column loop uses b<=EA->len_aln, i.e. it also visits the
 * terminator column, unlike expand_aln - confirm this is intended. */
1711 Alignment * expand_number_aln (Alignment *A,Alignment *EA)
\r
1713 /*This function expands the profiles within an alignment*/
\r
1717 Alignment *MAIN=NULL, *SUB=NULL, *C=NULL;
\r
1721 Alignment *Profile;
\r
1723 if ( !EA || !A)return EA;
\r
1725 if ( EA->nseq<A->nseq)
\r
1727 fprintf (stderr, "\n[ERROR:expand_number_aln] Using as a master an expanded aln (%d %d) [FATAL:%s]", EA->nseq, A->nseq,PROGRAM);
\r
1731 myexit (EXIT_FAILURE);
\r
1735 list=vcalloc (EA->nseq, sizeof (int));
\r
1736 for ( a=0; a< EA->nseq; a++)
\r
1738 Profile=seq2R_template_profile (EA->S, EA->order[a][0]);
\r
1739 if (Profile && Profile->expand)new_nseq+=Profile->nseq;
\r
1743 list[n_sub_seq++]=a;
\r
1747 if ( n_sub_seq==EA->nseq){vfree(list);return EA;}
\r
1748 else if (n_sub_seq==0){MAIN=copy_aln (EA, MAIN);MAIN->nseq=0;}
\r
1751 MAIN=extract_sub_aln (EA, n_sub_seq, list);
\r
1756 C=extract_sub_aln (EA,1, list);
\r
1761 for ( a=0; a< EA->nseq; a++)
\r
1763 Profile=seq2R_template_profile (EA->S, EA->order[a][0]);
\r
1764 if ( Profile && Profile->expand)
\r
1766 SUB=copy_aln (Profile,SUB);
\r
1767 SUB=realloc_aln2(SUB, SUB->nseq, EA->len_aln+1);
\r
1769 for ( e=0,b=0; b<= EA->len_aln; b++)
\r
1771 if (is_gap(A->seq_al[a][b]))
\r
1773 for ( d=0; d<SUB->nseq; d++)
\r
1774 SUB->seq_al[d][b]=NO_COLOR_RESIDUE;
\r
1778 for ( d=0; d<SUB->nseq; d++)
\r
1781 if ( is_gap (Profile->seq_al[d][e]))
\r
1783 SUB->seq_al[d][b]=NO_COLOR_RESIDUE;
\r
1785 else SUB->seq_al[d][b]=EA->seq_al[a][b];
\r
1790 for (d=0; d< SUB->nseq; d++)SUB->score_seq[d]=EA->score_seq[a];
\r
1792 MAIN=stack_aln(MAIN, SUB);
\r
1796 MAIN=stack_aln(MAIN, C);
\r
1798 MAIN->score=MAIN->score_aln=EA->score_aln;
\r
/* probabilistic_rm_aa: overwrites, in every row of A, the window
 * [pos-right, pos+left) with '*' and marks column pos itself with '~'.
 * pos==0 requests a random position; when random_len is set the window
 * extents are redrawn with rand()%len for every row. A warning is issued when
 * the window leaves the alignment. A->S is released at the end.
 * NOTE(review): rand()%(A->len_aln-(2*len+len)) divides by zero (or by a
 * negative value) when len_aln <= 3*len, and rand()%len does so for len==0;
 * no guard is visible in this listing - confirm. */
1808 Alignment * probabilistic_rm_aa ( Alignment *A, int pos, int len)
\r
1822 if (pos==0)pos= (rand()%(A->len_aln-(2*len+len))) +len;
\r
1825 for ( a=0; a< A->nseq; a++)
\r
1827 if (random_len)left =rand()%len;
\r
1829 if (random_len)right=rand()%len;
\r
1831 if ( (pos-right)<0 || (pos+left)>A->len_aln)
\r
1833 add_warning ( stderr, "\nWarning: probabilistic_rm_aa, pos out of range [%s]\n", PROGRAM);
\r
1836 for ( b=pos-right; b<pos+left; b++)A->seq_al[a][b]=(b==pos)?'~':'*';
\r
1840 free_sequence ( A->S, A->nseq);
\r
/* remove_gap_column: drops alignment columns in which one of the selected
 * sequences has a gap. `mode` is a ':'-separated list; a token is turned into
 * a 0-based index with atoi(p+1)-1 (presumably tokens of the form "#<n>") or
 * looked up by name in A->name. One visible line selects every sequence
 * (presumably the fallback when the list is empty).
 * Columns are compacted in place from index a to index cl.
 * NOTE(review): strtok() modifies `mode`; the loop as shown passes `mode` on
 * every iteration, so it only terminates if an omitted line resets mode to
 * NULL - confirm. */
1846 Alignment * remove_gap_column ( Alignment *A, char *mode)
\r
1855 seq_list =vcalloc ( A->nseq, sizeof (int));
\r
1856 while ( (p=strtok(mode, ":")))
\r
1861 seq_list[nseq++]=atoi(p+1)-1;
\r
1863 else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
\r
1865 seq_list[nseq++]=a;
\r
1871 for ( a=0; a< A->nseq; a++)seq_list[a]=a;
\r
/* a<=len_aln: the terminating column is processed as well */
1875 for ( cl=0,a=0; a<=A->len_aln; a++)
\r
1877 for (keep_col=1, b=0; b< nseq && keep_col; b++)
\r
1879 keep_col=(is_gap(A->seq_al[seq_list[b]][a]))?0:keep_col;
\r
1884 for ( b=0; b< A->nseq; b++)
\r
1886 A->seq_al[b][cl]=A->seq_al[b][a];
\r
1892 for ( b=0; b< A->nseq; b++)
\r
1894 A->seq_al[b][cl]='-';
\r
/* ungap_sub_aln: compacts, in place, the ns rows of A listed in ls.
 * For each column the gaps among the listed rows are counted in t; retained
 * columns are copied from index a to index c (the test on t sits on a line
 * not shown here - presumably columns that are all-gap in the subset are
 * dropped). Listed rows are terminated at c afterwards.
 * The length is taken from the first listed row only. */
1906 Alignment * ungap_sub_aln (Alignment *A, int ns, int *ls)
\r
1912 len=strlen ( A->seq_al[ls[0]]);
\r
1914 for ( c=0,a=0; a<len; a++)
\r
1916 for ( t=0,b=0; b<ns; b++)
\r
1917 t+=is_gap(A->seq_al[ls[b]][a]);
\r
1921 for ( b=0; b<ns; b++)
\r
1922 A->seq_al[ls[b]][c]=A->seq_al[ls[b]][a];
\r
1926 for ( b=0; b<ns; b++)A->seq_al[ls[b]][c]='\0';
\r
/* ungap_seq: refreshes the per-sequence lengths of S together with the
 * global S->max_len / S->min_len. Returns NULL when S is NULL.
 * Fix: min_len used to be updated with MAX(), which made it track the
 * longest sequence (identical to max_len); it now uses MIN().
 * NOTE(review): the gap removal itself is presumably done on lines that are
 * not part of this listing - confirm. */
1930 Sequence * ungap_seq ( Sequence *S)
\r
1934 if ( !S)return NULL;
\r
/* both bounds start from the first sequence and are widened in the loop */
1936 S->max_len=S->min_len=strlen (S->seq[0]);
\r
1937 for ( a=0; a< S->nseq; a++)
\r
1940 S->len[a]=strlen (S->seq[a]);
\r
1941 S->max_len=MAX(S->max_len,S->len[a]);
\r
1942 S->min_len=MIN(S->min_len,S->len[a]);
\r
/* unalign_aln: uses C, an alignment holding per-residue digit scores, to
 * unalign the stretches of A scoring <= t.
 * Step 1: every residue of A is upper-cased, then lower-cased where the
 * matching cell of C is a digit <= t.
 * Step 2: each row of C is scanned for runs of such digits (pos/len) and
 * every run is handed to unalign_aln_pos().
 * Step 3: C is threaded with S via thread_seq_struc2aln(), A is resized to
 * C's length and C's rows are copied into A; S is freed.
 * NOTE(review): S is not declared or assigned on any line of this listing;
 * it is presumably built from A before step 1 - confirm. */
1947 Alignment * unalign_aln (Alignment *A, Alignment *C, int t)
\r
1949 int a, b, pos, len;
\r
1952 for (a=0; a<A->nseq; a++)
\r
1953 for (b=0; b<A->len_aln; b++)
\r
1955 int res=C->seq_al[a][b];
\r
1956 A->seq_al[a][b]=toupper(A->seq_al[a][b]);
\r
1957 if ((isdigit (res) && (res-'0')<=t))
\r
1958 A->seq_al[a][b]=tolower(A->seq_al[a][b]);
\r
1962 for (pos=-1, a=0; a<C->nseq; a++)
\r
1965 while ( C->seq_al[a][b])
\r
1967 int res=C->seq_al[a][b];
\r
1968 if ((isdigit (res) && (res-'0')<=t))
\r
/* pos==-1 means no run is currently open */
1970 if (pos==-1){pos=b;len=1;}
\r
1976 C=unalign_aln_pos(C,a,pos, len);
\r
/* flush a run that reaches the end of the row */
1981 if ( pos!=-1){C=unalign_aln_pos(C,a,pos, len);pos=-1;}
\r
1984 thread_seq_struc2aln (C, S);
\r
1985 A=realloc_aln2 (A, A->nseq, C->len_aln+1);
\r
1986 A->len_aln=C->len_aln;
\r
1987 for (a=0; a<A->nseq; a++)sprintf ( A->seq_al[a], "%s", C->seq_al[a]);
\r
1990 free_sequence (S, -1);
\r
/* unalign_aln_pos: moves the l symbols of row s starting at column p out of
 * their columns: they are saved in buf and replaced by '-', insert_gap_col()
 * is called at p with width l, and the saved symbols are written back at
 * p..p+l-1.
 * NOTE(review): no release of buf is visible in this listing - confirm it is
 * freed on an omitted line. */
1993 Alignment * unalign_aln_pos (Alignment *A, int s, int p, int l)
\r
2000 buf=vcalloc (l+1, sizeof (char));
\r
2001 for (a=0; a<l; a++)
\r
2003 buf[a]=A->seq_al[s][p+a];
\r
2004 A->seq_al[s][p+a]='-';
\r
2008 A=insert_gap_col (A,p, l);
\r
2009 for (a=0; a<l; a++)
\r
2011 A->seq_al[s][p+a]=buf[a];
\r
/* insert_gap_col: inserts the string returned by generate_null(l) at column p
 * of every row of A. A is returned unchanged when it is NULL or when p is
 * outside 1..len_aln-1. Each row is rebuilt in buf as
 * <row[0..p-1]><gap><row[p]><row[p+1..]> and copied back.
 * NOTE(review): `p<0 || p<=0` is redundant (p<=0 covers both); p==0 is
 * therefore rejected too.
 * NOTE(review): gap is obtained before the guard, so the early return does
 * not release it if generate_null() allocates - confirm. */
2016 Alignment * insert_gap_col (Alignment *A, int p, int l)
\r
2022 gap=generate_null(l);
\r
2023 if ( !A || p>=A->len_aln || p<0 || p<=0)return A;
\r
2025 buf=vcalloc (A->len_aln+l+1, sizeof (char));
\r
2026 A=realloc_aln2(A,A->nseq, A->len_aln+l+1);
\r
2027 for (a=0; a<A->nseq; a++)
\r
/* cut the row at p so that it can be printed as a prefix */
2029 c=A->seq_al[a][p];
\r
2030 A->seq_al[a][p]='\0';
\r
2031 sprintf ( buf, "%s%s%c%s", A->seq_al[a],gap,c,A->seq_al[a]+p+1);
\r
2032 sprintf (A->seq_al[a], "%s", buf);
\r
/* unalign_residues: rebuilds rows si1 and si2 of A into two fresh buffers of
 * 2*l+1 bytes each. Columns where either symbol is a gap or upper case are
 * copied as they are ('.' rewritten to '-'); the handling of the remaining
 * columns is on lines missing from this listing (the 2*l sizing suggests a
 * column may be split in two - TODO confirm). The new buffers replace
 * A->seq_al[si1]/[si2] and len_aln is recomputed from the first one.
 * NOTE(review): l is not assigned on any visible line, and the old row
 * buffers are not visibly freed - confirm. */
2038 Alignment * unalign_residues (Alignment *A, int si1, int si2)
\r
2040 char *s1, *s2, *ns1, *ns2;
\r
2041 int l, a, b,r1, r2;
\r
2043 s1=A->seq_al[si1];s2=A->seq_al[si2];
\r
2046 ns1=vcalloc (2*l+1, sizeof (char));
\r
2047 ns2=vcalloc (2*l+1, sizeof (char));
\r
2049 for (b=a=0; a< l; a++)
\r
2051 r1=s1[a]; r2=s2[a];
\r
2052 if (is_gap(r1) || is_gap(r2) || isupper (r1) || isupper(r2))
\r
2054 ns1[b]=(r1=='.')?'-':r1;
\r
2055 ns2[b]=(r2=='.')?'-':r2;
\r
2070 A->seq_al[si1]=ns1;
\r
2071 A->seq_al[si2]=ns2;
\r
2074 A->len_aln=strlen (ns1);
\r
/* degap_aln: calls ungap() on every row of A. */
2077 Alignment *degap_aln (Alignment *A)
\r
2079 //Remove all the gaps
\r
2081 for ( a=0; a< A->nseq; a++)ungap (A->seq_al[a]);
\r
/* ungap_aln_n: removes, in place, the columns of A selected by the gap
 * threshold p. t counts the gaps of a column and gp is that count as an
 * integer percentage of A->nseq.
 *   p>0 : the column is skipped when gp>=p (p==100: gap-only columns,
 *         p==1: any column containing a gap).
 *   p<0 : see the note below.
 * Columns that are not skipped are copied from index a to index c and every
 * row is terminated at c. An alignment without sequences is returned as is.
 * NOTE(review): in the p<0 branch `gp<=p` compares a non-negative percentage
 * with a negative threshold, so only the p==-100 and p==-1 special cases can
 * select anything; `gp<=-p` may have been intended, but that would also
 * change what p==-100 removes - confirm before changing. */
2085 Alignment *ungap_aln_n ( Alignment *A, int p)
\r
2087 /*remove all the columns of gap-only within an alignment*/
\r
2092 if ( A->nseq==0)return A;
\r
2094 for ( c=0,a=0; a< A->len_aln; a++)
\r
2096 for ( t=0,b=0; b<A->nseq; b++)
\r
2097 t+=is_gap(A->seq_al[b][a]);
\r
2098 gp=(t*100)/A->nseq;
\r
2099 if (p>0 && (gp>=p || (t==A->nseq && p==100) || (t && p==1)));//Remove columns containing more than p% gaps
\r
2100 else if (p<0 && (gp<=p || (t==0 && p==-100) ||(t && p==-1)));//remove columns containing less than p% gaps
\r
2103 for ( b=0; b<A->nseq; b++)
\r
2104 A->seq_al[b][c]=A->seq_al[b][a];
\r
2108 for ( b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
\r
/* ungap_aln: shorthand for ungap_aln_n(A, 100). */
2113 Alignment *ungap_aln ( Alignment *A)
\r
2115 return ungap_aln_n (A, 100);
\r
/* ungap_aln (second definition): hand-written column compaction - gaps are
 * counted per column in t, kept columns are copied from index a to index c
 * and every row is terminated at c.
 * NOTE(review): the same name is defined a few lines above as a wrapper
 * around ungap_aln_n(); this copy is presumably disabled by a comment or
 * preprocessor block whose delimiters are not part of this listing -
 * confirm. */
2118 Alignment *ungap_aln ( Alignment *A)
\r
2122 for ( c=0,a=0; a< A->len_aln; a++)
\r
2124 for ( t=0,b=0; b<A->nseq; b++)
\r
2125 t+=is_gap(A->seq_al[b][a]);
\r
2129 for ( b=0; b<A->nseq; b++)
\r
2130 A->seq_al[b][c]=A->seq_al[b][a];
\r
2134 for ( b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
\r
/* remove_end: trims both ends of A and returns extract_aln(A, left, right+1).
 * The first loop walks columns from the left, the second from the right; in
 * each, d counts the rows that are not gapped in the column. The rule that
 * turns d into left/right is on lines missing from this listing (presumably
 * the first column from each side with enough non-gap rows - TODO confirm). */
2142 Alignment *remove_end (Alignment *A)
\r
2147 for (a=0; a< A->len_aln; a++)
\r
2149 for ( b=0, d=0; b< A->nseq; b++)
\r
2150 if ( !is_gap(A->seq_al[b][a]))d++;
\r
2154 for (a=A->len_aln-1; a>0; a--)
\r
2156 for ( b=0, d=0; b< A->nseq; b++)
\r
2157 if ( !is_gap(A->seq_al[b][a]))d++;
\r
2162 return extract_aln(A, left, right+1);
\r
/* condense_aln: repeatedly merges neighbouring columns until A->len_aln stops
 * changing (plen holds the previous length). For a column pair (a, a+1) the
 * gap states of both cells are read per row; when n==A->nseq and
 * m!=A->nseq the non-gap residues of column a+1 are moved into column a and
 * replaced by '-'. How n and m are counted is on lines missing from this
 * listing (presumably n counts rows with a gap in at least one of the two
 * columns - TODO confirm). */
2165 Alignment* condense_aln (Alignment *A)
\r
2167 /* condense complementarz columns:
\r
2173 int a, b, plen, n,m, r1, r2;
\r
2176 while ( A->len_aln !=plen)
\r
2179 for ( a=0; a< A->len_aln-1; a++)
\r
2181 for ( n=m=b=0; b< A->nseq; b++)
\r
2183 r1=is_gap(A->seq_al[b][a]);
\r
2184 r2=is_gap(A->seq_al[b][a+1]);
\r
2189 if ( n==A->nseq && m!=A->nseq)
\r
2191 for (b=0; b< A->nseq; b++)
\r
2193 if (!is_gap(A->seq_al[b][a+1]))
\r
2195 A->seq_al[b][a]=A->seq_al[b][a+1];
\r
2196 A->seq_al[b][a+1]='-';
\r
// compress_aln: in-place removal of columns made only of '-'.
// d is set to 1 as soon as one row holds something other than '-' in the
// column; such columns are copied from index a to index c (the test on d is
// on a line missing from this listing) and every row is terminated at c.
// Only the literal '-' counts as a gap here; is_gap() is not used.
2210 void compress_aln ( Alignment *A)
\r
2213 /*remove all the columns of gap-only within an alignment*/
\r
2218 for (c=0, a=0; a< A->len_aln; a++)
\r
2220 for ( b=0, d=0; b< A->nseq; b++)
\r
2221 if ( A->seq_al[b][a]!='-'){d=1; break;}
\r
2225 for (b=0; b< A->nseq; b++)
\r
2226 A->seq_al[b][c]=A->seq_al[b][a];
\r
2232 for ( a=0; a< A->nseq; a++)
\r
2233 A->seq_al[a][c]='\0';
\r
/* seq_coor2aln: fills A with nseq fragments cut out of S. coor[a] is read as
 * {sequence index, start, end}: the fragment is
 * extract_char(S->seq[coor[a][0]], coor[a][1]-1, coor[a][2]), it is named
 * "Repeat_<a>_<sequence index>", and A->order[a][1] receives coor[a][1]-1.
 * NOTE(review): A is sized for nseq rows but A->file is filled for S->nseq
 * entries; this overruns if S->nseq>nseq unless realloc_alignment2 reserves
 * more - confirm. */
2236 Alignment *seq_coor2aln ( Sequence *S, Alignment *A, int **coor, int nseq)
\r
2241 A=realloc_alignment2(A, nseq, return_maxlen ( S->seq, S->nseq)+1);
\r
2242 for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]);
\r
2243 for ( a=0; a< nseq; a++)
\r
2245 sprintf (A->name[a], "Repeat_%d_%d", a, coor[a][0]);
\r
2246 buf=extract_char ( S->seq[coor[a][0]], coor[a][1]-1, coor[a][2]);
\r
2247 sprintf ( A->seq_al[a],"%s", buf);
\r
2250 A->order[a][1]=coor[a][1]-1;
\r
/* strings2aln: variadic constructor. After nseq the caller passes nseq pairs
 * of char* arguments, name first and sequence second. The pairs are copied
 * into freshly declared arrays (sequences sized to the longest input, names
 * to MAXNAMES+1), turned into a Sequence with fill_sequence_struc(), and
 * the temporary copies are released before seq2aln(S, NULL, 1) builds the
 * alignment.
 * NOTE(review): names are copied with an unbounded sprintf into MAXNAMES+1
 * bytes; a longer name overflows - confirm callers respect the limit. */
2256 Alignment *strings2aln (int nseq,...)
\r
2258 /*strings2aln(nseq, <name1>, <seq1>, <name2>, <seq2>....)*/
\r
2260 char **list, **list2;
\r
2261 char **name, **name2;
\r
2266 va_start(ap, nseq);
\r
2267 list=vcalloc (nseq, sizeof (char*));
\r
2268 name=vcalloc (nseq, sizeof (char*));
\r
2269 for ( a=0; a< nseq; a++)
\r
2271 name[a]=va_arg(ap,char*);
\r
2272 list[a]=va_arg(ap,char*);
\r
2277 for ( max=0,a=0; a< nseq; a++)
\r
2279 max=(strlen (list[a])>max)?strlen(list[a]):max;
\r
2281 list2=declare_char (nseq, max+1);
\r
2282 name2=declare_char (nseq, MAXNAMES+1);
\r
2284 for ( a=0; a< nseq; a++)
\r
2286 sprintf ( list2[a], "%s", list[a]);
\r
2287 sprintf ( name2[a], "%s", name[a]);
\r
2291 S=fill_sequence_struc(nseq,list2,name2);
\r
2293 free_char (list2, -1);
\r
2294 free_char (name2, -1);
\r
2297 A=seq2aln(S,NULL, 1);
\r
/* seq2aln: copies the sequences of S into A (resized to S->nseq rows of
 * S->max_len+1 bytes): file names, max/min length, comments, names and
 * residues. Every row is ungapped to measure A->len[a]; when rm_gap is 0 or
 * NO_PAD the original (gapped) string is then copied back. Unless rm_gap is
 * NO_PAD the rows are finally padded to a common length with padd_aln(). */
2300 Alignment *seq2aln ( Sequence *S, Alignment *A,int rm_gap)
\r
2304 A=realloc_alignment2(A, S->nseq, S->max_len+1);
\r
2305 for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]);
\r
2307 A->max_len=S->max_len;
\r
2308 A->min_len=S->min_len;
\r
2310 for ( a=0; a< S->nseq; a++)
\r
2315 sprintf ( A->seq_comment[a], "%s", S->seq_comment[a]);
\r
2316 sprintf ( A->aln_comment[a], "%s", S->aln_comment[a]);
\r
2318 sprintf ( A->name[a], "%s", S->name[a]);
\r
2319 sprintf ( A->seq_al[a], "%s", S->seq[a]);
\r
2321 ungap ( A->seq_al[a]);
\r
2322 A->len[a]=strlen ( A->seq_al[a]);
\r
/* restore the gapped string when gaps must be kept */
2324 if ( rm_gap==0 || rm_gap==NO_PAD)sprintf ( A->seq_al[a], "%s", S->seq[a]);
\r
2327 if (rm_gap!=NO_PAD)padd_aln (A);
\r
/* padd_aln: pads the rows of A with '-' through padd_string() and sets
 * A->len_aln to the length of the first row. */
2332 Alignment *padd_aln ( Alignment *A)
\r
2334 A->seq_al=padd_string (A->seq_al, A->nseq, '-');
\r
2335 A->len_aln=strlen (A->seq_al[0]);
\r
/* padd_string: appends to each of the n strings a filler of length
 * (longest length - own length), obtained from generate_null().
 * NOTE(review): strcat() writes in place, so every string buffer must already
 * be able to hold max_len+1 bytes; no reallocation is visible here.
 * NOTE(review): the `pad` argument is not used on any line of this listing;
 * the filler character is whatever generate_null() produces - confirm. */
2339 char **padd_string ( char **string, int n,char pad)
\r
2341 /*Pads a the strings so that they all have the same length*/
\r
2346 max_len=get_longest_string (string,n, NULL, NULL);
\r
2347 for (a=0; a<n; a++)
\r
2349 buf=generate_null (max_len-strlen (string[a]));
\r
2350 strcat ( string[a], buf);
\r
/* trim_aln_with_seq: turns S and P into profiles, aligns them with
 * align_two_aln() (blosum62mt, penalties -8/-1, myers_miller_pair_wise) and
 * gives every row of the result a unique temporary name drawn from a static
 * counter. R starts as a copy of that alignment; columns where row 0 is
 * gapped are skipped and the remaining ones are packed into R from index a
 * to index c, after which R's rows are terminated at c. */
2356 Alignment * trim_aln_with_seq ( Alignment *S, Alignment *P)
\r
2360 static int seqindex;
\r
2361 P=aln2profile (P);
\r
2362 S=aln2profile (S);
\r
2364 A=align_two_aln (S,P, "blosum62mt",-8,-1, "myers_miller_pair_wise");
\r
2365 for (a=0; a<A->nseq; a++) sprintf (A->name[a], "tmpname_%d", seqindex++);
\r
2367 R=copy_aln (A, NULL);
\r
2368 for (c=0, a=0; a< A->len_aln; a++)
\r
/* empty statement: gapped columns of row 0 are dropped */
2370 if ( is_gap (A->seq_al[0][a]));
\r
2373 for ( b=0; b<A->nseq; b++)
\r
2374 R->seq_al[b][c]=A->seq_al[b][a];
\r
2378 for ( a=0; a< A->nseq; a++)R->seq_al[a][c]='\0';
\r
/* add_align_seq2aln: appends the already aligned string `seq`, named
 * seq_name, as a new row of A.
 * - One visible branch declares a new alignment sized for `seq` and stores it
 *   in row A->nseq (presumably taken when A is NULL - the condition is not
 *   part of this listing).
 * - A sequence whose length differs from A->len_aln is a fatal error.
 * - Otherwise A grows by one row and the name/sequence are copied in.
 * The increment of A->nseq is not visible in this listing. */
2389 Alignment * add_align_seq2aln ( Alignment *A, char *seq, char *seq_name)
\r
2393 A=declare_aln (NULL);
\r
2394 A=realloc_aln2 ( A, 1, strlen (seq)+1);
\r
2396 sprintf ( A->name[A->nseq], "%s", seq_name);
\r
2397 sprintf ( A->seq_al[A->nseq], "%s", seq);
\r
2401 else if ( strlen (seq)!=A->len_aln)
\r
2403 fprintf ( stderr, "\nError: Attempt to stack incompatible aln and aligned sequence[FATAL]\n");
\r
2404 myexit (EXIT_FAILURE);
\r
2410 A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
\r
2411 sprintf ( A->name[A->nseq], "%s", seq_name);
\r
2412 sprintf ( A->seq_al[A->nseq], "%s", seq);
\r
/* aln2number: replaces A->seq_al by the result of char_array2number(). */
2419 Alignment *aln2number (Alignment *A)
\r
2421 A->seq_al=char_array2number(A->seq_al, A->nseq);
\r
/* seq2number: replaces A->seq by the result of char_array2number(). */
2424 Sequence *seq2number (Sequence *A)
\r
2426 A->seq=char_array2number(A->seq, A->nseq);
\r
/* aln2seq: shorthand for aln2seq_main(A, RM_GAP). */
2430 Sequence * aln2seq (Alignment *A)
\r
2432 return aln2seq_main(A, RM_GAP);
\r
/* aln2seq_main: builds a Sequence from the rows of A. Returns NULL when A is
 * NULL or holds no sequence. The new structure is sized on the longest row;
 * file names, residues, comments and names are copied row by row, and the
 * residues are ungapped when mode==RM_GAP.
 * Fix: the sequence/alignment comments were passed to sprintf() as the format
 * string, so a '%' inside a comment was interpreted as a conversion
 * (undefined behaviour); they are now copied through "%s" like the other
 * fields. */
2434 Sequence * aln2seq_main (Alignment *A, int mode)
\r
2440 if ( !A) return NULL;
\r
2441 else if ( A->nseq==0)return NULL;
\r
2442 for (maxlen=0,a=0; a<A->nseq; a++)maxlen=MAX(maxlen, strlen (A->seq_al[a]));
\r
2445 LS=declare_sequence ( maxlen+1, maxlen+1, A->nseq);
\r
2447 for ( a=0; a< LS->nseq; a++)
\r
2449 sprintf (LS->file[a],"%s", A->file[a]);
\r
2451 sprintf ( LS->seq[a], "%s", A->seq_al[a]);
\r
2453 if (mode==RM_GAP)ungap ( LS->seq[a]);
\r
2455 LS->len[a]=strlen ( LS->seq[a]);
\r
2457 sprintf ( LS->seq_comment[a], "%s", A->seq_comment[a]);
\r
2458 sprintf ( LS->aln_comment[a], "%s", A->aln_comment[a]);
\r
2459 sprintf ( LS->name[a], "%s", A->name[a]);
\r
/* keep_residues_in_seq: applies keep_residues_in_aln() to the sequences of S
 * by going through a temporary alignment (seq2aln with rm_gap=1), then
 * ungaps each filtered row and copies it back into S->seq[a].
 * NOTE(review): the temporary alignment A is not visibly freed in this
 * listing - confirm. */
2464 Sequence *keep_residues_in_seq ( Sequence *S, char *list, char replacement)
\r
2466 Alignment *A=NULL;
\r
2469 A=seq2aln (S, A,1);
\r
2470 A=keep_residues_in_aln ( A, list, replacement);
\r
2471 for ( a=0; a< A->nseq; a++)
\r
2473 ungap (A->seq_al[a]);
\r
2474 sprintf ( S->seq[a], "%s", A->seq_al[a]);
\r
/* aln2short_aln: rewrites every row of A into a shorter string built in a
 * scratch buffer of the row's own length and copied back over the row.
 * cl counts symbols since the last emission: a residue found in `list`
 * emits new[0] first when cl is non-zero, and new[0] is also emitted each
 * time cl reaches `spacer`. What is written for the listed residue itself and
 * how cl is incremented are on lines missing from this listing - TODO
 * confirm. Neither the release of buf nor an update of A->len_aln is visible
 * here. */
2481 Alignment *aln2short_aln ( Alignment *A, char *list, char *new, int spacer)
\r
2483 int a, b, r, cl, l;
\r
2486 for ( a=0; a< A->nseq; a++)
\r
2488 buf=vcalloc ( strlen (A->seq_al[a])+1, sizeof (char));
\r
2490 for (l=0,cl=0, b=0; b< A->len_aln; b++)
\r
2492 r=A->seq_al[a][b];
\r
2494 else if ( is_in_set (r, list))
\r
2496 if (cl){cl=0; buf[l++]=new[0];}
\r
2501 if ( cl==spacer){buf[l++]=new[0];cl=0;}
\r
2508 sprintf (A->seq_al[a], "%s", buf);
\r
/* keep_residues_in_aln: shorthand for filter_keep_residues_in_aln() with no
 * structure alignment, use_cons=0 and value=-1. */
2514 Alignment *keep_residues_in_aln ( Alignment *A, char *list, char replacement)
\r
2516 return filter_keep_residues_in_aln (A,NULL, 0, -1, list, replacement);
\r
/* filter_keep_residues_in_aln: builds a conversion table for
 * filter_aln_convert(): one "<c><c>" entry per character of `list` (each
 * listed residue maps to itself) followed by a final "#<replacement>" entry
 * (presumably the catch-all rule for every other symbol - the meaning of '#'
 * is defined by convert(), not shown here). The table is freed after use.
 * n is presumably strlen(list); its assignment is not part of this listing. */
2518 Alignment *filter_keep_residues_in_aln ( Alignment *A,Alignment *ST, int use_cons, int value, char *list, char replacement)
\r
2524 sl=declare_char (n+1, 256);
\r
2525 for (a=0; a< n; a++)
\r
2526 sprintf ( sl[a], "%c%c", list[a], list[a]);
\r
/* a==n here: the extra slot receives the replacement rule */
2527 sprintf ( sl[a],"#%c", replacement);
\r
2528 A=filter_aln_convert (A, ST,use_cons,value, n+1, sl);
\r
2529 free_char (sl, -1);
\r
/* filter_convert_aln: variadic front end of filter_aln_convert(). The n
 * trailing arguments are char* conversion rules; they are gathered into a
 * temporary array and forwarded unchanged. */
2534 Alignment *filter_convert_aln ( Alignment *A,Alignment *ST, int use_cons, int value, int n, ...)
\r
2540 sl=vcalloc ( n,sizeof(char*));
\r
2541 for ( a=0; a< n; a++)
\r
2543 sl[a]=va_arg(ap, char * );
\r
2546 A=filter_aln_convert (A,ST,use_cons,value, n,sl);
\r
/* filter_aln: filter_aln_convert() in DELETE mode, without consensus. */
2551 Alignment * filter_aln ( Alignment *A, Alignment *ST, int value)
\r
2553 return filter_aln_convert (A, ST,0,value,DELETE, NULL);
\r
/* filter_aln_switchcase: filter_aln_convert() in SWITCHCASE mode.
 * NOTE(review): the use_cons parameter is ignored (0 is always forwarded),
 * whereas filter_aln_upper_lower/filter_aln_lower_upper forward it - confirm
 * whether this is deliberate. */
2555 Alignment * filter_aln_switchcase ( Alignment *A, Alignment *ST,int use_cons, int value)
\r
2557 return filter_aln_convert (A, ST,0,value,SWITCHCASE, NULL);
\r
/* filter_aln_upper_lower: filter_aln_convert() in LOWER mode (selected
 * residues are lower-cased). */
2559 Alignment * filter_aln_upper_lower ( Alignment *A, Alignment *ST,int use_cons, int value)
\r
2561 return filter_aln_convert (A, ST,use_cons,value, LOWER, NULL);
\r
/* filter_aln_lower_upper: filter_aln_convert() in UPPER mode (selected
 * residues are upper-cased). */
2563 Alignment * filter_aln_lower_upper ( Alignment *A, Alignment *ST,int use_cons, int value)
\r
2566 return filter_aln_convert (A, ST,use_cons,value, UPPER, NULL);
\r
/* STseq2STaln: when ST exists but its length differs from A's, re-threads ST
 * onto the columns of A. For every sequence of A that is found by name in
 * S_T, the ungapped strings from S_T and S_A are compared and a length
 * mismatch is fatal. ST is then overwritten with a copy of A and filled via
 * thread_seq_struc2aln(ST, S_T).
 * S_T and S_A are presumably the sequences extracted from ST and A; their
 * assignments are not part of this listing. */
2568 Alignment * STseq2STaln ( Alignment *A, Alignment *ST)
\r
2572 if (ST && ST->len_aln !=A->len_aln)
\r
2574 Sequence *S_T, *S_A;
\r
2579 for (a=0; a< A->nseq; a++)
\r
2581 i=name_is_in_list (A->name[a], S_T->name,S_T->nseq, 100);
\r
2585 s1=(S_T)->seq[i];ungap(s1);
\r
2586 s2=(S_A)->seq[a];ungap(s2);
\r
2588 if ( strlen (s1)!=strlen(s2))
\r
2590 fprintf ( stderr, "%s\n%s\n", s1, s2);
\r
2591 printf_exit (EXIT_FAILURE, stderr, "ERROR: Sequence %s has different length in the alignment and in the structure Alignment [FATAL:%s]\n", A->name[a], PROGRAM);
\r
2595 ST=copy_aln (A, ST);
\r
2596 thread_seq_struc2aln (ST,S_T);
\r
/* merge_annotation: folds the annotation alignment ST into one row of A.
 * ST is first re-threaded onto A with STseq2STaln(). The target row s is 0
 * when seq is NULL, otherwise the row named `seq`; the failure branch below
 * reports a missing name and exits. For every non-gap column of row s the
 * symbols of all ST rows are visited and the cell is overwritten by r when
 * it is not a digit yet, or when it is a digit smaller than r.
 * Fix: the error message has two %s conversions but only PROGRAM was passed,
 * which is undefined behaviour; the sequence name is now supplied for the
 * first one. */
2601 Alignment * merge_annotation ( Alignment *A, Alignment *ST, char *seq)
\r
2605 ST=STseq2STaln (A, ST);
\r
2606 if ( seq==NULL)s=0;
\r
2608 s=name_is_in_list ( seq, A->name, A->nseq, 100);
\r
2612 add_warning ( stderr, "\nERROR: %s is not in your MSA [FATAL: %s]", seq, PROGRAM);
\r
2613 myexit (EXIT_FAILURE);
\r
2616 for (a=0; a<A->len_aln; a++)
\r
2620 t=A->seq_al[s][a];
\r
2621 if (is_gap (t))continue;
\r
2622 for (b=0; b<A->nseq; b++)
\r
/* t is re-read because the cell may have been overwritten by a previous row */
2624 t=A->seq_al[s][a];
\r
2625 r=ST->seq_al[b][a];
\r
2628 if (!isdigit(t) || (isdigit (t) && t<r))
\r
2629 A->seq_al[s][a]=r;
\r
/* filter_aln_convert: core filter shared by the filter_* helpers. For each
 * residue of A a "structure value" st is derived:
 *   - 11 when value==10 or when there is no ST;
 *   - from the consensus row of ST ("con", "cons" or "Cons") when use_cons;
 *   - otherwise from the ST row carrying the same name as the A row.
 * Digits in ST are converted to their numeric value, other symbols are used
 * as they are. When st==value, value==-1 or st==NO_COLOR_RESIDUE the residue
 * is transformed: with a NULL symbol_list, n_symbol selects UPPER, LOWER,
 * SWITCHCASE or DELETE ('-'); the remaining visible statement runs the
 * residue through convert() with the n_symbol rules of symbol_list.
 * A->residue_case is set to KEEP_CASE.
 * NOTE(review): the consensus lookups pass ST->nseq+1 as the list size, one
 * more than the other lookups - confirm the name array really has that extra
 * slot.
 * NOTE(review): what happens when the per-row lookup returns -1 is on lines
 * missing from this listing. */
2638 Alignment * filter_aln_convert ( Alignment *A, Alignment *ST,int use_cons, int value, int n_symbol,char **symbol_list)
\r
2645 ST=STseq2STaln (A, ST);
\r
2646 if ( ST && use_cons)
\r
2648 cons=name_is_in_list ("con", ST->name,ST->nseq+1, 100);
\r
2649 if ( cons==-1)cons=name_is_in_list ("cons", ST->name,ST->nseq+1, 100);
\r
2650 if ( cons==-1)cons=name_is_in_list ("Cons", ST->name,ST->nseq+1, 100);
\r
2654 fprintf (stderr, "WARNING: Could Not Use the Consensus Sequence [WARNING:%s]\n", PROGRAM);
\r
2658 A->residue_case=KEEP_CASE;
\r
2659 for ( a=0; a< A->nseq; a++)
\r
2661 if(value!=10 && ST && !use_cons)
\r
2663 c=name_is_in_list (A->name[a], ST->name, ST->nseq,100);
\r
2667 for ( b=0; b< A->len_aln; b++)
\r
2669 if ( value==10 || !ST)st=11;
\r
2670 else if ( ST && use_cons)
\r
2672 st=(isdigit(ST->seq_al[cons][b]))?ST->seq_al[cons][b]-'0':ST->seq_al[cons][b];
\r
2674 else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b];
\r
2677 if ( st==value || value==-1 || st==NO_COLOR_RESIDUE)
\r
2679 if ( n_symbol==UPPER && !symbol_list)A->seq_al[a][b]=toupper (A->seq_al[a][b]);
\r
2680 else if ( n_symbol==LOWER && !symbol_list)A->seq_al[a][b]=tolower (A->seq_al[a][b]);
\r
2681 else if ( n_symbol==SWITCHCASE && !symbol_list)
\r
/* non-alphabetic symbols are left untouched */
2683 if ( !isalpha(A->seq_al[a][b]));
\r
2684 else if (isupper (A->seq_al[a][b]))A->seq_al[a][b]=tolower (A->seq_al[a][b]);
\r
2685 else if (islower (A->seq_al[a][b]))A->seq_al[a][b]=toupper (A->seq_al[a][b]);
\r
2687 else if ( n_symbol==DELETE && !symbol_list)A->seq_al[a][b]='-';
\r
2690 A->seq_al[a][b]=convert(A->seq_al[a][b],n_symbol,symbol_list);
\r
/* sar_aln2motif: exploratory routine that searches for the sequence motif
 * best separating the sequences flagged 'I' in column c of B from the others.
 * - A is split column by column into I (rows with B[b][c]=='I') and O (all
 *   other rows); the column test using `pos` is on lines missing from this
 *   listing. Any gap met in A, or an empty I/O set, makes the function
 *   return 0 (a null pointer for the char** return type).
 * - An alphabet is derived for each I column, every motif built from these
 *   alphabets is generated, and each one is scored by counting matches in I
 *   (tp) and in O (fp) and passing the rates to rates2sensitivity().
 * - The best motif is printed on stdout together with both sub-alignments
 *   and a count of single-character, non-"*" positions.
 * NOTE(review): vcalloc(sizeof (char**), I->len_aln) has its two arguments in
 * the opposite order from the other vcalloc calls (same total size).
 * NOTE(review): I, O, alp and alp_size are static and re-created on every
 * call; no release of I/O/alp_size is visible in this listing - confirm. */
2700 char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c);
\r
2701 char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c)
\r
2703 static Alignment *I;
\r
2704 static Alignment *O;
\r
2707 float tp,tn,fp,fn,best, sp, sn, sen2;
\r
2708 float best_pred=-1;
\r
2713 static char ***alp;
\r
2714 static int *alp_size;
\r
2716 char ***motif_list;
\r
2722 I=copy_aln(A, NULL);
\r
2723 O=copy_aln(A, NULL);
\r
2728 I->nseq=O->nseq=I->len_aln=O->len_aln=0;
\r
2729 for (a=0; a<A->len_aln; a++)
\r
2733 for (i=o=0,b=0; b<A->nseq; b++)
\r
2736 if ( is_gap(A->seq_al[b][a]))return 0;
\r
2737 if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
\r
2738 else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
\r
2745 if (O->len_aln==0 || I->len_aln==0) return 0;
\r
2748 for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
\r
2749 for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
\r
2751 alp=vcalloc ( sizeof (char**), I->len_aln);
\r
2752 alp_size= vcalloc ( I->len_aln, sizeof (int));
\r
2753 for (a=0; a<I->len_aln; a++)
\r
2756 alp[a]=string2alphabet ( (col=aln_column2string (I,a)),2, &alp_size[a]);
\r
2762 motif_list=generate_array_string_list (I->len_aln, alp, alp_size, &n, NULL, OVERLAP);
\r
2763 best_pred=best_motif=0;
\r
2764 for (a=0; a<n; a++)
\r
2769 for (b=0; b<I->nseq; b++)
\r
2771 if (match_motif (I->seq_al[b], motif_list[a]))tp++;
\r
2774 for (b=0; b<O->nseq; b++)
\r
2776 if (match_motif (O->seq_al[b], motif_list[a]))fp++;
\r
2779 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
\r
2781 if (best> best_pred)
\r
2788 output_Alignment_without_header ( I, stdout);
\r
2789 fprintf ( stdout, "\n");
\r
2790 output_Alignment_without_header ( O, stdout);
\r
2793 fprintf ( stdout, "\nMotifCompound %d pred: %.2f motif: ", c, best_pred);
\r
2794 for (n1=0, a=0; a<I->len_aln; a++)
\r
2798 m=motif_list[best_motif][a];
\r
2799 fprintf ( stdout, "[%s]-", m);
\r
2801 n1+=(l==1 && !strm ("*",m) )?1:0;
\r
2803 fprintf (stdout, "SCORE: %d", n1);
\r
2805 for (a=0; a<n; a++)vfree (motif_list[a]);
\r
2806 vfree (motif_list);
\r
2807 free_arrayN((void ***) alp, 3);
\r
/* explore_weight_matrix: recursive enumeration of weight vectors. `array`
 * holds one weight per column of A and n is the column being filled. When n
 * reaches A->len_aln the vector is printed together with
 * sar_aln2r(A,B,array,0); otherwise the function recurses on n+1 for each of
 * the `range` candidate values (the assignment array[n]=a is presumably on a
 * line missing from this listing). This makes range^len_aln evaluations. */
2816 void explore_weight_matrix (Alignment *A, Alignment *B, int range, int n, int *array);
\r
2817 void explore_weight_matrix (Alignment *A, Alignment *B, int range, int n, int *array)
\r
2820 if ( n==A->len_aln)
\r
2822 fprintf ( stdout, "\n W:");
\r
2823 for (a=0; a<A->len_aln; a++)fprintf ( stdout, "%d", array[a]);
\r
2824 fprintf ( stdout, " %.4f",(float)sar_aln2r(A,B,array,0));
\r
2829 for ( a=0; a<range; a++)
\r
2832 explore_weight_matrix (A, B, range, n+1, array);
\r
/* search_best_combo_sar_aln: slides a window of w columns over B; for each
 * start a the window is copied into the scratch alignment S (rows terminated
 * after w symbols) and search_best_combo(A, S) is run on it, followed by a
 * progress line on stdout. w is set on a line missing from this listing, as
 * is any update of S->len_aln. */
2836 float search_best_combo(Alignment *A, Alignment *B);
\r
2837 void search_best_combo_sar_aln(Alignment *A, Alignment *B);
\r
2838 void search_best_combo_sar_aln(Alignment *A, Alignment *B)
\r
2845 S=copy_aln (B, NULL);
\r
2847 for ( a=0; a<B->len_aln-w;a++)
\r
2849 for (b=0; b<B->nseq; b++)
\r
2851 for (c=0; c<w; c++)
\r
2853 S->seq_al[b][c]=B->seq_al[b][a+c];
\r
2855 S->seq_al[b][c]='\0';
\r
2858 s=search_best_combo (A, S);
\r
2859 fprintf ( stdout,"\nP: XXXX \nP: XXXXX A=%d / %d", a, B->len_aln);
\r
/* search_best_combo: exploratory search for the set of columns of A (kept in
 * `list`, nl entries) whose residues best correlate, through sar_aln2r(),
 * with the activity alignment B (columns of 'I'/'O' symbols). Returns the
 * best score found.
 *   combo_mode 1 (the value hard-coded here): greedy - up to `max` rounds,
 *     each adding the window start that maximises the score given the columns
 *     already selected.
 *   combo_mode 2: every combination written by generate_array_int_list() is
 *     read back from a temporary file and scored, always together with the
 *     hard-coded preset columns.
 * The selected columns are then reported in several ways: a motif per
 * compound, a FASTA dump in "aln.aln", per-compound sn/sn2/sp figures based
 * on summed substitution scores, "compound.fasta", and finally a weight
 * exploration on the reduced alignment.
 * Many statements (initial values of nl, max, w, n_preset, several braces and
 * conditions) are not part of this listing, so the notes below are hedged.
 * NOTE(review): `best_pos==list[nl-1]` reads list[-1] if nl is still 0 on the
 * first round - confirm nl's initial value.
 * NOTE(review): preset columns 353 and 361 are written into arrays sized
 * A->len_aln; this overruns for shorter alignments.
 * NOTE(review): in the two ref_score/score loops r1 and r2 are read from the
 * same cell (row b in both), so each residue is compared with itself - one of
 * the two was probably meant to use the other loop index.
 * NOTE(review): M[r1-'A'][r2-'A'] assumes upper-case letters. */
2865 float search_best_combo(Alignment *A, Alignment *B)
\r
2867 int a, b, c, d, best_pos,nl, max;
\r
2868 float best_score, score;
\r
2872 int combo_mode=1; //1: greedy 2: consider all thw w combinations;
\r
2879 pos=vcalloc ( A->len_aln, sizeof (int));
\r
2880 list=vcalloc (A->len_aln, sizeof (int));
\r
2883 if ( combo_mode==1)
\r
2885 for (a=0; a< max; a++)
\r
2887 for (best_score=-9999,best_pos=0,b=0; b< A->len_aln-w; b++)
\r
/* score = columns already chosen + candidate window starting at b */
2889 for (c=0; c<nl; c++)pos[list[c]]=1;
\r
2890 for (c=0; c<w; c++)pos[b+c]=1;
\r
2891 score=sar_aln2r(A,B,pos,0);
\r
2892 if ( score>best_score)
\r
2897 for (c=0; c<w; c++)pos[b+c]=0;
\r
/* stop when the round selects the same position as the previous one */
2899 if (best_pos==list[nl-1])break;
\r
2900 list[nl++]=best_pos;
\r
2901 for (b=0; b<nl; b++) pos[list[b]]=1;
\r
2902 fprintf ( stdout, "\n%2d P: %d S:%.3f Delta= %d", nl,best_pos, best_score, (int)sar_aln2delta(A,B, pos,0));
\r
2903 for (b=0; b<nl; b++) pos[list[b]]=0;
\r
2907 for (a=0; a<nl; a++) pos[list[a]]=1;
\r
2908 fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));
\r
2911 else if ( combo_mode==2)
\r
2917 int *preset, n_preset;
\r
2919 tmpf=vtmpnam (NULL);
\r
2921 generate_array_int_list (max, 0,A->len_aln-1, 1,NULL, tmpf);
\r
/* debugging aid: keeps a copy of the combination file in the working directory */
2922 printf_system ( "cp %s testfile", tmpf);
\r
2923 buf=vcalloc ( 1000, sizeof (char));
\r
2924 fp=vfopen (tmpf, "r");
\r
2925 best_score=-99999;
\r
2928 preset=vcalloc (A->len_aln, sizeof (int));
\r
2929 preset[n_preset++]=353;
\r
2930 preset[n_preset++]=361;
\r
2931 //preset[n_preset++]=365;
\r
2932 //preset[n_preset++]=187;
\r
2933 //preset[n_preset++]=397;
\r
2934 //preset[n_preset++]=492;
\r
2937 while ( (buf=vfgets ( buf, fp))!=NULL)
\r
2940 array=string2num_list (buf);
\r
/* array[] is used from index 1 to max */
2942 for (a=1; a<=max; a++)
\r
2946 for ( a=0; a<n_preset; a++)pos[preset[a]]=1;
\r
2948 score=sar_aln2r(A,B,pos,0);
\r
2950 if ( score>best_score)
\r
2953 fprintf ( stdout, "\n");
\r
2954 for (a=0; a<n_preset; a++)fprintf (stdout, "%2d ", preset[a]);
\r
2955 for (a=1; a<=max; a++)fprintf (stdout, "%2d ", array[a]);
\r
2956 fprintf ( stdout, " R: %.3f", best_score);
\r
2957 for (nl=0,a=0; a<n_preset; a++)list[nl++]=preset[a];
\r
2958 for (a=1; a<=max; a++)list[nl++]=array[a];
\r
2960 //if ( score!=0)HERE ("R=%.2f", score);
\r
2961 for (b=1; b<=max; b++)
\r
2965 fprintf ( stdout, "\n");
\r
2967 //for (a=0; a<max; a++)fprintf (stdout, "%2d ", array[best_pos][a]);
\r
2968 //fprintf ( stdout, " R: %.3f", best_score);
\r
2970 for (c=0; c<B->len_aln; c++)
\r
2972 sar_aln2motif (A,B,pos, c);
\r
2976 HERE ("***************");
\r
2977 fp2=vfopen ("aln.aln", "w");
\r
2978 for (a=0; a<A->nseq; a++)
\r
2980 fprintf (fp2, ">%s\n", A->name[a]);
\r
2981 for ( b=0; b<nl; b++)fprintf (fp2, "%c", A->seq_al[a][list[b]]);
\r
2982 fprintf ( fp2, "\n");
\r
2985 HERE ("Output aln.aln");
\r
2988 float tp=0, tn=0, fp=0, fn=0, pp2=0,pp=0, sn,sn2, sp;
\r
2989 int **result,**result2,**compound_score, *ref_score,n2,n, s, p, c;
\r
2990 Alignment *AI, *AO;
\r
2993 compound_score=declare_int (B->len_aln, 2);
\r
2994 ref_score=vcalloc (nl, sizeof (int));
\r
2996 result=declare_int (B->len_aln*A->nseq*A->nseq, 2);
\r
2997 result2=declare_int (B->len_aln*A->nseq*A->nseq, 2);
\r
2999 for (n2=c=0; c< B->len_aln; c++)
\r
3004 if (!M)M=read_matrice ("blosum62mt");
\r
3005 for (n=0,a=0; a<A->nseq-1; a++)
\r
3007 for (b=a+1; b<A->nseq;b++)
\r
/* s = summed substitution score of the pair (a,b) over the selected columns */
3009 for (s=0,p=0; p<nl; p++)
\r
3013 r1=A->seq_al[a][list[p]];
\r
3014 r2=A->seq_al[b][list[p]];
\r
3015 if ( !is_gap (r1) && !is_gap(r2))s+=M[r1-'A'][r2-'A'];
\r
3017 result2[n2][0]=result[n][0]=s;
\r
3019 sar1=B->seq_al[a][c];sar2=B->seq_al[b][c];
\r
3021 if (sar1=='I' && sar1==sar2)
\r
3023 result2[n2][1]=result[n][1]=1;
\r
3027 else if ( sar1==sar2 && sar1=='O')
\r
3033 result2[n2][1]=result[n][1]=0;
\r
3036 //else if ( s1==s2=='O')result[n][1]=-1;
\r
3040 if (pp==0)continue;
\r
3041 sort_int_inv (result, 2, 0, 0, n-1);
\r
3044 for (tp=0,a=0; a<n; a++)
\r
3047 if ((pp-tp) == (a-tp))break;
\r
3056 fprintf ( stdout, "\nCompound %3d sn: %.3f sn2: %.3f sp: %.3f MIN: %.3f",c,sn, sn2,sp, MIN((MIN(sn,sn2)),sp));
\r
3057 compound_score[c][0]=c;
\r
3058 compound_score[c][1]=1000*MIN((MIN(sn,sn2)),sp);
\r
3061 sort_int_inv (compound_score,2, 1, 0, B->len_aln-1);
\r
3063 fp2=vfopen ("compound.fasta", "w");
\r
3064 for (d=0; d<nl; d++)
\r
3067 for (n=0,a=0;a<A->nseq; a++)
\r
3068 for (b=0; b<A->nseq; b++)
\r
3070 r1= A->seq_al[b][list[d]];
\r
3071 r2= A->seq_al[b][list[d]];
\r
3072 if (is_gap(r1) || is_gap(r2))continue;
\r
3075 ref_score[d]+=M[r1-'A'][r2-'A'];
\r
3081 AO=copy_aln (A, NULL);
\r
3082 AI=copy_aln (A,NULL);
\r
3083 AO->len_aln=AI->len_aln=nl;
\r
3084 for (a=0; a<A->nseq; a++)AO->seq_al[a][nl]=AI->seq_al[a][nl]='\0';
\r
3086 for (a=0; a<B->len_aln; a++)
\r
3088 fprintf (stdout, "\n>%4d %4d ", compound_score[a][0], compound_score[a][1]);
\r
3089 for (b=0; b<B->nseq; b++) fprintf (stdout, "%c", B->seq_al[b][compound_score[a][0]]);
\r
3090 fprintf ( stdout, "\n");
\r
/* AI collects the rows that are not 'O' for this compound */
3092 for (AI->nseq=0,b=0; b<B->nseq; b++)
\r
3094 if (B->seq_al[b][compound_score[a][0]]=='O')continue;
\r
3095 fprintf ( stdout, "\n\t");
\r
3096 for (c=0; c<nl; c++)
\r
3098 fprintf ( stdout, "%c", A->seq_al[b][list[c]]);
\r
3099 AI->seq_al[AI->nseq][c]=A->seq_al[b][list[c]];
\r
3103 fprintf ( stdout, "\n\t");
\r
3104 for (d=0; d<nl; d++)
\r
3106 for (score=0,n=0,b=0; b<B->nseq; b++)
\r
3108 if (B->seq_al[b][compound_score[a][0]]=='O')continue;
\r
3109 for (c=0; c<B->nseq; c++)
\r
3111 if (B->seq_al[c][compound_score[a][0]]=='O')continue;
\r
3115 r1= A->seq_al[b][list[d]];
\r
3116 r2= A->seq_al[b][list[d]];
\r
3117 if (is_gap(r1) || is_gap(r2))continue;
\r
3118 else score+=M[r1-'A'][r2-'A'];
\r
3124 if ((float)score/(float)ref_score[d]>1.2)fprintf ( stdout, "*");
\r
3125 else fprintf ( stdout, " ");
\r
/* AO collects the rows that are not 'I' for this compound */
3127 for (AO->nseq=0,b=0; b<B->nseq; b++)
\r
3129 if (B->seq_al[b][compound_score[a][0]]=='I')continue;
\r
3130 fprintf ( stdout, "\n\t");
\r
3131 for (c=0; c<nl; c++)
\r
3133 AO->seq_al[AO->nseq][c]=A->seq_al[b][list[c]];
\r
3134 fprintf ( stdout, "%c", A->seq_al[b][list[c]]);
\r
3138 simI=aln2sim (AI, "blosum62mt"); simO=aln2sim (AO, "blosum62mt");
\r
3139 fprintf ( stdout, "\nDELTA: I: %d O: %d %d",simI,simO, simI-simO);
\r
3143 for ( a=0; a<B->nseq; a++)
\r
3146 fprintf ( fp2, ">%s\n", B->name[a]);
\r
/* only the better-scoring half of the compounds is written */
3147 for (b=0; b<B->len_aln/2; b++)
\r
3148 fprintf ( fp2, "%c", B->seq_al[a][compound_score[b][0]]);
\r
3149 fprintf (fp2, "\n");
\r
3152 HERE ("OUTPUT compound.fasta");
\r
3157 sort_int_inv (result, 2, 0, 0, n-1);
\r
3160 for (tp=0,a=0; a<n; a++)
\r
3163 if ((pp-tp) == (a-tp))break;
\r
3172 fprintf ( stdout, "\nTOT: sn: %.3f sn2: %.3f sp: %.3f MIN: %.3f",sn, sn2,sp, MIN((MIN(sn,sn2)),sp));
\r
3175 HERE ("Delta= %d", delta);
\r
3179 C=copy_aln(A, NULL);
\r
3180 for (a=0; a< nl; a++)
\r
3181 for (b=0; b<A->nseq; b++)
\r
3182 C->seq_al[b][a]=A->seq_al[b][list[a]];
\r
3184 array=vcalloc (C->len_aln, sizeof (int));
\r
3185 explore_weight_matrix (C, B, 6,0, array);
\r
3188 return best_score;
\r
/* count_misc: scratch/debug driver stringing together several experiments on
 * a sequence alignment A and an activity alignment B ('I'/'O' symbols): it
 * runs search_best_combo(), prints correlation figures for various column
 * masks, explores weights, lists residue pairs shared between sequences
 * together with an average get_sar_sim2() score, and finally correlates
 * pairwise sequence identity with pairwise activity agreement via
 * return_r().
 * Many lines (conditions, braces, initialisations) are not part of this
 * listing, so the notes below are hedged.
 * NOTE(review): the statement ending in exit (0) terminates the program
 * unless it sits inside a block that is not visible here; everything after it
 * may be dead code.
 * NOTE(review): n10 tests r2 against the digit '0' while the neighbouring
 * counters test the letter 'O' - looks like a typo.
 * NOTE(review): score/e divides by zero when fewer than two sequences share
 * the residue pair (e stays 0).
 * NOTE(review): done[r1][r2] indexes a 256x256 table with char values;
 * negative char values would index out of range. */
3192 void count_misc (Alignment *A, Alignment *B)
\r
3194 int **done, a, b, c, d, e,f, g, *list, n, score;
\r
3195 double **slist, *r;
\r
3199 search_best_combo (A,B);
\r
3201 pos=vcalloc (A->len_aln+1, sizeof (int));
\r
3210 fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));exit (0);
\r
3212 for (a=0; a< A->len_aln-w; a++)
\r
3214 for (c=0; c<w; c++)
\r
3226 fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a+1, (float)sar_aln2r(A,B,pos,0));
\r
3227 for (c=0; c<w; c++)
\r
3234 for (a=0; a<w; a++) pos[a]=1;
\r
3235 for (a=w; a< A->len_aln-1; a++)
\r
3239 fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a, (float)sar_aln2r(A,B,pos,0));
\r
3248 explore_weight_matrix (A, B,3, 0,pos);
\r
/* exhaustive scan over six column indices: len_aln^6 evaluations */
3251 for (a=0; a<A->len_aln; a++)
\r
3252 for ( b=0; b<A->len_aln; b++)
\r
3253 for (c=0; c<A->len_aln; c++)
\r
3254 for (d=0; d<A->len_aln; d++)
\r
3255 for (f=0; f<A->len_aln; f++)
\r
3256 for (g=0; g<A->len_aln; g++)
\r
3266 fprintf ( stdout, "\n%d %d %d %d %d %d %.3f", a, b,c,d,f, g, sar_aln2r(A,B, pos,0));
\r
3273 slist=declare_double (A->nseq*A->nseq*10, 2);
\r
3274 done=declare_int (256, 256);
\r
3275 list=vcalloc ( A->nseq, sizeof (int));
\r
3277 for (a=0; a<A->len_aln-1; a++)
\r
3279 for (b =0; b<256; b++)for (c=0; c<256; c++)done[b][c]=0;
\r
3281 for (b=0; b<A->nseq-1; b++)
\r
3284 r1=A->seq_al[b][a];
\r
3285 r2=A->seq_al[b][a+1];
\r
3286 if (done[r1][r2])continue;
\r
3290 fprintf ( stdout, "\n%3d %c%c: %s ",a+1, r1, r2, A->name[b]);
\r
3291 for ( c=b+1; c<A->nseq; c++)
\r
3293 if (r1==A->seq_al[c][a] && r2==A->seq_al[c][a+1])
\r
3295 fprintf ( stdout, "%s ", A->name[c]);
\r
3302 for (e=0,score=0,c=0; c<n-1; c++)
\r
3303 for (d=c+1; d<n; d++,e++)
\r
3304 score+=get_sar_sim2(B->seq_al[list[c]], B->seq_al[list[d]]);
\r
3305 fprintf ( stdout, " Score=%d", score/e);
\r
3309 for (score=0,e=0,a=0; a<A->nseq-1; a++)
\r
3310 for (b=a+1; b<A->nseq; b++,e++)
\r
3312 score+=get_sar_sim2(B->seq_al[a], B->seq_al[b]);
\r
3314 fprintf (stdout,"AVG=%d", score/e);
\r
3315 for (n=0,a=0; a< A->nseq-1; a++)
\r
3319 if (!M)M=read_matrice ("blosum62mt");
\r
3322 for (b=a+1; b<A->nseq; b++)
\r
3324 int n11, n01, n10, n00, n1;
\r
/* sim = percentage of identical columns between rows a and b */
3326 for (sim=d=0;d<A->len_aln; d++)
\r
3329 r1=A->seq_al[a][d];
\r
3330 r2=A->seq_al[b][d];
\r
3331 sim+=(r1==r2)?1:0;
\r
3332 //sim +=(M[r1-'A'][r2-'A']>0)?1:0;
\r
3335 sim=(100*sim)/(A->len_aln);//+rand()%10;
\r
3336 for (n1=n00=n11=n10=n01=score=0, d=0; d<B->len_aln; d++)
\r
3339 r1=B->seq_al[a][d];
\r
3340 r2=B->seq_al[b][d];
\r
3341 n11+=(r1=='I' && r2=='I');
\r
3342 n00+=(r1=='O' && r2=='O');
\r
3343 n10+=(r1=='I' && r2=='0');
\r
3344 n01+=(r1=='O' && r2=='I');
\r
3345 n1+=(r1=='I' || r2=='I');
\r
/* score = percentage of compounds on which both rows agree */
3347 score =((n11+n00)*100)/B->len_aln;
\r
3349 //score=get_sar_sim2(B->seq_al[a], B->seq_al[b]);
\r
3351 fprintf ( stdout, "\nSIM: %d SC: %d", sim, score);
\r
3352 slist[n][0]=(double)sim;
\r
3353 slist[n][1]=(double)score;
\r
3357 r=return_r(slist, n);
\r
3358 fprintf ( stdout, "\nR= %.4f", (float)r[0]);
\r
/* aln2ngap: sums is_gap() over every cell of A (all columns, all rows) into
 * ngap. */
3362 int aln2ngap ( Alignment *A)
\r
3365 for (a=0; a< A->len_aln; a++)
\r
3366 for (b=0; b<A->nseq; b++) ngap+=is_gap (A->seq_al[b][a]);
\r
/* count_in_aln: for each of the n_symbol character sets in symbol_list, adds
 * to table[d] the number of residues of A that belong to the set, restricted
 * to residues whose structure value st equals `value` (value==-1 accepts
 * everything). st is 11 when value==10 or ST is NULL, otherwise it is read
 * from the ST row that has the same name as the A row (digits converted to
 * their numeric value). table is allocated here when NULL is passed.
 * A->residue_case is set to KEEP_CASE.
 * NOTE(review): when no ST row carries the name, the search loop leaves
 * c==ST->nseq and ST->seq_al[c] is read one row past the end - confirm
 * callers guarantee a match. */
3369 int * count_in_aln ( Alignment *A, Alignment *ST, int value, int n_symbol,char **symbol_list, int *table)
\r
3374 if (!table)table=vcalloc (n_symbol, sizeof (int));
\r
3376 A->residue_case=KEEP_CASE;
\r
3377 for ( a=0; a< A->nseq; a++)
\r
3379 if(value!=10 && ST)for ( c=0; c< ST->nseq; c++)if ( strm(ST->name[c], A->name[a]))break;
\r
3380 for ( b=0; b< A->len_aln; b++)
\r
3382 if ( value==10 || !ST)st=11;
\r
3383 else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b];
\r
3384 if ( st==value || value==-1)
\r
3386 for ( d=0; d<n_symbol; d++)table[d]+=is_in_set ( A->seq_al[a][b], symbol_list[d]);
\r
/* dna_aln2cons_seq: builds a consensus string seq (A->len_aln+1 bytes) from
   rows 0 and 1 of A, keeping per-column a/g/c/t/internal-gap counts in
   function-static buffers that persist between calls.
   - First call (column_count still NULL): allocates column_count, the two
     count-pointer tables, the profile A->P and the two work strings, all
     sized by MAX_EST_SIZE.
   - Later call with len_aln > MAX_EST_SIZE: releases the old tables, rebuilds
     them size_increment larger and copies the previous A->P counts across.
   Per column the counts are mirrored into A->P->count and the index written
   by best_int() (0..3) is mapped to 'a','g','c','t' in seq.
   At the end old_tot_count and new_tot_count are swapped.
   NOTE(review): growth is one size_increment regardless of how far len_aln
   exceeds MAX_EST_SIZE -- confirm that inputs never exceed that.
   NOTE(review): A->P is only created in the first-call branch shown here;
   confirm what later calls with a different A rely on. */
3393 char *dna_aln2cons_seq ( Alignment *A)
\r
3396 static int **column_count;
\r
3397 static int **old_tot_count;
\r
3398 static int **new_tot_count;
\r
3399 static char *string1, *string2;
\r
3402 int NA=0, NG=1, NC=2, NT=3, IGAP=4;
\r
3403 static int MAX_EST_SIZE=10000;
\r
3404 static int size_increment=1000;
\r
3406 int overlap=0, best_overlap=0;
\r
3409 seq=vcalloc ( A->len_aln+1, sizeof (char));
\r
3411 if (!column_count )
\r
3413 column_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
\r
3414 for ( a=0; a< MAX_EST_SIZE; a++)
\r
3415 column_count[a]=vcalloc (5, sizeof (int));
\r
3417 old_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
\r
3418 new_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
\r
3419 A->P=declare_profile( "agct-",MAX_EST_SIZE);
\r
3420 string1=vcalloc (MAX_EST_SIZE, sizeof (char));
\r
3421 string2=vcalloc (MAX_EST_SIZE, sizeof (char));
\r
3423 else if (A->len_aln>MAX_EST_SIZE)
\r
3425 if ( column_count)
\r
/* NOTE(review): column_count is advanced with ++ further down, so by the time
   this branch runs it may no longer point at the start of its allocation;
   indexing and freeing it here would then be invalid -- confirm. */
3427 for ( a=0; a< MAX_EST_SIZE; a++)
\r
3428 vfree(column_count[a]);
\r
3429 vfree(column_count);
\r
3430 vfree(old_tot_count);
\r
3431 vfree(new_tot_count);
\r
3436 column_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
\r
3437 for ( a=0; a< MAX_EST_SIZE+ size_increment; a++)
\r
3438 column_count[a]=vcalloc (5, sizeof (int));
\r
3440 old_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
\r
3441 new_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
\r
3443 for (a=0; a< MAX_EST_SIZE; a++)
\r
/* Hands the next unused row of column_count to old_tot_count[a] and refills
   it from the profile counts that are about to be freed. */
3445 old_tot_count[a]=*(column_count++);
\r
3446 for ( b=0; b<5; b++)old_tot_count[a][b]=(A->P)->count[b][a];
\r
3448 free_int ( (A->P)->count, -1);
\r
3450 (A->P)->count=declare_int (5, MAX_EST_SIZE+ size_increment);
\r
3451 (A->P)->max_len=MAX_EST_SIZE+ size_increment;
\r
3452 MAX_EST_SIZE+= size_increment;
\r
/* NOTE(review): no release of the previous string1/string2 is shown before
   they are re-allocated -- confirm. */
3453 string1=vcalloc (MAX_EST_SIZE, sizeof (char));
\r
3454 string2=vcalloc (MAX_EST_SIZE, sizeof (char));
\r
/* NOTE(review): the work strings hold MAX_EST_SIZE bytes; a row of exactly
   MAX_EST_SIZE characters does not trigger the growth branch and would need
   one more byte for the terminator -- confirm. */
3458 sprintf ( string1, "%s",A->seq_al[0]);
\r
3459 sprintf ( string2, "%s",A->seq_al[1]);
\r
3462 string1=mark_internal_gaps(string1,'.');
\r
3463 string2=mark_internal_gaps(string2,'.');
\r
3467 for (b=0,a=0; a< A->len_aln; a++)
\r
3478 best_overlap=MAX(overlap, best_overlap);
\r
/* Column a either reuses the next previously filled row (old_tot_count[b])
   or takes a fresh row from column_count. */
3483 if (!is_gap(r1) && first==1)new_tot_count[a]=old_tot_count[b++];
\r
3484 else if (is_gap(r1) || first==0){new_tot_count[a]=*column_count;column_count++;};
\r
/* Tally for r1: a/g/c/t bump their own counter, a gap bumps nothing; the
   four increments that follow apparently belong to a final else branch
   (any other character). */
3488 if(r1=='a') new_tot_count[a][NA]++;
\r
3489 else if ( r1=='g')new_tot_count[a][NG]++;
\r
3490 else if ( r1=='c')new_tot_count[a][NC]++;
\r
3491 else if ( r1=='t')new_tot_count[a][NT]++;
\r
3492 else if (is_gap(r1));
\r
3495 new_tot_count[a][NA]++;
\r
3496 new_tot_count[a][NG]++;
\r
3497 new_tot_count[a][NC]++;
\r
3498 new_tot_count[a][NT]++;
\r
/* An internal gap ('.') in r1 adds the previous column's total base count to
   this column's IGAP counter. */
3501 if ( a> 0 && a<A->len_aln-1 && r1=='.')
\r
3503 new_tot_count[a][IGAP]+=((new_tot_count[a-1][NA]+new_tot_count[a-1][NG]+new_tot_count[a-1][NC]+new_tot_count[a-1][NT]));
\r
/* Same tally for r2, except that '.' adds one to IGAP and '-' adds nothing. */
3507 if(r2=='a') new_tot_count[a][NA]++;
\r
3508 else if ( r2=='g')new_tot_count[a][NG]++;
\r
3509 else if ( r2=='c')new_tot_count[a][NC]++;
\r
3510 else if ( r2=='t')new_tot_count[a][NT]++;
\r
3511 else if ( r2=='.')new_tot_count[a][IGAP]++;
\r
3512 else if ( r2=='-');
\r
3515 new_tot_count[a][NA]++;
\r
3516 new_tot_count[a][NG]++;
\r
3517 new_tot_count[a][NC]++;
\r
3518 new_tot_count[a][NT]++;
\r
3520 (A->P)->count[0][a]=new_tot_count[a][NA];
\r
3521 (A->P)->count[1][a]=new_tot_count[a][NG];
\r
3522 (A->P)->count[2][a]=new_tot_count[a][NC];
\r
3523 (A->P)->count[3][a]=new_tot_count[a][NT];
\r
3524 (A->P)->count[4][a]=new_tot_count[a][IGAP];
\r
3526 best_int(4,1, &best,new_tot_count[a][NA], new_tot_count[a][NG],new_tot_count[a][NC],new_tot_count[a][NT]);
\r
3527 if( best==0) seq[a]='a';
\r
3528 else if ( best==1)seq[a]='g';
\r
3529 else if ( best==2)seq[a]='c';
\r
3530 else if ( best==3)seq[a]='t';
\r
3536 fprintf ( stderr, "[Best Overlap: %d Residues]", best_overlap);
\r
3537 count_buf=old_tot_count;
\r
3538 old_tot_count=new_tot_count;
\r
3539 new_tot_count=count_buf;
\r
/* aln2cons_maj: per-column consensus over the ns rows of A listed in ls.
   Uses the table aa (1000 ints, allocated on first use) indexed by the
   lower-cased residue; the entries touched by a column are reset to 0 before
   the next column. The result buffer seq holds len+1 bytes, where len is the
   length of the first listed row.
   n_groups and group_list are not referenced in the lines shown here.
   NOTE(review): aa is indexed with tolower() of a plain char; a negative
   char value would index before the table -- confirm inputs are ASCII. */
3545 char *aln2cons_maj ( Alignment *A, int ns, int *ls, int n_groups, char **group_list)
\r
3553 if ( !aa) aa=vcalloc (1000, sizeof (int));
\r
/* NOTE(review): ls[0] is read here, before the fallback below that builds a
   default ls covering every row; if that fallback is meant for ls==NULL this
   line dereferences NULL first -- confirm. */
3555 len=strlen (A->seq_al[ls[0]]);
\r
3556 seq=vcalloc (len+1, sizeof (char));
\r
3561 ls=vcalloc ( A->nseq, sizeof (int));
\r
3562 for ( a=0; a< A->nseq; a++)ls[a]=a;
\r
3566 for ( a=0; a<len; a++)
\r
3568 int best_s=0, best_aa=0, r;
\r
3569 for (b=0; b< ns; b++)
\r
3571 r=tolower(A->seq_al[ls[b]][a]);
\r
3573 if (!is_gap(r) && aa[r]>best_s)
\r
3580 for (best_s=0, best_aa=0,b=0; b< ns; b++)
\r
3582 aa[tolower(A->seq_al[ls[b]][a])]=0;
\r
/* Releases ls only when clean_ls is set, i.e. presumably when it was
   allocated by this function rather than supplied by the caller. */
3585 if ( clean_ls)vfree(ls);
\r
/* aln2cons_seq: per-column consensus over the ns rows of A listed in ls,
   computed by residue group. For each column every non-gap residue is
   lower-cased and tested against each group_list[c] with is_in_set(); the
   group with the highest tally (best_group) contributes the first character
   of its group string to seq[a].
   A default group_list of 26 single-letter groups ('a'..'z') and a default
   ls covering every row are built in branches whose conditions are not shown.
   NOTE(review): group is allocated once per column; its release is not
   visible here -- confirm it is freed inside the loop. */
3591 char *aln2cons_seq ( Alignment *A, int ns, int *ls, int n_groups, char **group_list)
\r
/* NOTE(review): ls[0] is read before the fallback allocation of ls below; if
   that fallback handles ls==NULL, this dereferences NULL first -- confirm. */
3601 len=strlen (A->seq_al[ls[0]]);
\r
3602 seq=vcalloc (len+1, sizeof (char));
\r
3607 ls=vcalloc ( A->nseq, sizeof (int));
\r
3608 for ( a=0; a< A->nseq; a++)ls[a]=a;
\r
3615 group_list=declare_char ( 26, 2);
\r
3616 for ( a=0; a<26; a++)group_list[a][0]=a+'a';
\r
3622 for ( a=0; a<len; a++)
\r
3624 group=vcalloc (n_groups+1, sizeof (int));
\r
3625 for (best_group=0,b=0; b< ns; b++)
\r
3627 if ( !is_gap(A->seq_al[ls[b]][a]))
\r
3629 for (c=0; c< n_groups; c++)
\r
3630 if ( is_in_set (tolower(A->seq_al[ls[b]][a]), group_list[c]))
\r
3632 best_group=(group[c]>group[best_group])?c:best_group;
\r
3635 seq[a]=group_list[best_group][0];
\r
/* The locally built group list and index list are released only when the
   corresponding flags (aa_group, clean_ls) are set. */
3640 if ( aa_group) free_char (group_list, -1);
\r
3642 if ( clean_ls)vfree(ls);
\r
/* aln2conservation: prints a per-column conservation report for one
   reference row of A and then terminates the process.
   seq: name of the reference row; it is looked up in A->name to obtain i.
   threshold: columns whose score pos[a] reaches it are upper-cased in the
   reference row, the others lower-cased.
   pos[a] is c*100/A->nseq; tot averages pos[] over the non-gap positions
   counted in tn; sim comes from aln2sim_mat(A, "idmat") and feeds eval[].
   Output goes to stdout: a FASTA-like header and row, one "# ..." line per
   column, and the average conservation.
   NOTE(review): the function ends in exit(EXIT_SUCCESS), so the declared
   Alignment* result is never delivered to a caller. */
3647 Alignment *aln2conservation ( Alignment *A, int threshold,char *seq)
\r
3649 int a, b, c, d, i, c1, c2;
\r
/* NOTE(review): eval is sized with sizeof(int) but later accumulates log()
   values and is printed with %9.2f; its declaration is not shown -- confirm
   that the element type matches the allocation. */
3657 pos =vcalloc (A->len_aln, sizeof (int));
\r
3658 eval=vcalloc (A->len_aln, sizeof (int));
\r
3659 sim=aln2sim_mat (A, "idmat");
\r
3660 if (seq)i=name_is_in_list (seq, A->name, A->nseq, 100);
\r
/* NOTE(review): the format string contains %s but no argument is passed. */
3663 if ( i==-1) {HERE ("%s is an unknown:sequence [FATAL]"); exit (EXIT_FAILURE);}
\r
3665 for (a=0; a<A->len_aln; a++)
\r
3669 for (c=0,e=a-w; e<=a+w; e++)
\r
/* NOTE(review): only e==len_aln is skipped; if the window half-width w can
   exceed 1, larger e values index past the row -- confirm w. */
3671 if (e<0 || e==A->len_aln)continue;
\r
3672 c1=toupper (A->seq_al[i][e]);
\r
3673 for (b=0; b<A->nseq; b++)
\r
3675 c2=toupper (A->seq_al[b][a]);
\r
3679 s=(double)((double)sim[i][b]/(double)(100));
\r
3684 s=(double)(((double)100-(double)sim[i][b])/(double)(100));
\r
3686 eval[a]+=(s==0)?0:log(s);
\r
3689 pos[a]=(c*100)/A->nseq;
\r
3690 if (!is_gap(c1)){tot+=pos[a]; tn++;}
\r
3692 if (pos[a]>=threshold)A->seq_al[i][a]=toupper (A->seq_al[i][a]);
\r
3693 else A->seq_al[i][a]=tolower (A->seq_al[i][a]);
\r
3695 fprintf (stdout, ">%s %s [i=%d]\n%s\n", A->name[i],A->aln_comment[i],i, A->seq_al[i]);
\r
3696 tot=(tn>0)?(float)tot/(float)tn:0;
\r
3698 for (d=0,a=0; a<A->len_aln; a++)
\r
3700 fprintf (stdout, "# %c %4d", A->seq_al[i][a],pos[a]);
\r
3703 if ( !is_gap (A->seq_al[i][a]))
\r
3705 fprintf (stdout, " LogOdd: %6.2f ", (tot==0 || pos[a]==0)?0:(float)log((float)pos[a]/tot));
\r
3706 fprintf ( stdout, " Pos: %5d E-Val: %9.2f", ++d, eval[a]/(A->nseq));
\r
3708 fprintf ( stdout, "\n");
\r
3710 fprintf ( stdout, "#average conservation: %.2f", tot);
\r
3711 exit (EXIT_SUCCESS);
\r
/* aln2cons_seq_mat: consensus over all rows of A; forwards to
   sub_aln2cons_seq_mat with ns=A->nseq and a NULL index list. */
3713 char *aln2cons_seq_mat ( Alignment *A, char *mat_name)
\r
3715 return sub_aln2cons_seq_mat (A, A->nseq, NULL, mat_name);
\r
/* sub_aln2cons_seq_mat2: same as sub_aln2cons_seq_mat but the rows are given
   by name. The ns names in ls are converted to row indexes with
   name_array2index_array() and the consensus is computed on that index list.
   NOTE(review): the release of list and the return of cons are not shown. */
3717 char *sub_aln2cons_seq_mat2 ( Alignment *A,int ns, char **ls, char *mat_name)
\r
3721 list=name_array2index_array(ls, ns, A->name, A->nseq);
\r
3722 cons=sub_aln2cons_seq_mat ( A,ns, list, mat_name);
\r
/* sub_aln2cons_seq_mat: matrix-based consensus of selected rows of A.
   mat_name: substitution matrix loaded with read_matrice() and freed at the
   end with free_int().
   ls: optional row index list of length ns; NULL means rows 0..ns-1, and an
   entry of -1 is skipped.
   For every column each of the first 20 letters of AA_ALPHABET is tried as
   candidate r1; its score is the sum of mat[r1-'A'][r2-'A'] over the
   non-gap residues r2 of the column, and the best-scoring candidate is kept
   in best_r ('-' when the column had no residue).
   The result buffer seq holds len+1 bytes, len being the length of the first
   selected row. */
3727 char *sub_aln2cons_seq_mat ( Alignment *A,int ns, int *ls, char *mat_name)
\r
3730 char *seq, r1, r2;
\r
3732 int score=0, best_score=0, best_r=0;
\r
3736 mat=read_matrice (mat_name);
\r
3737 len=strlen ( A->seq_al[(ls==NULL)?0:ls[0]]);
\r
3738 seq=vcalloc (len+1, sizeof (char));
\r
3739 for ( a=0; a<len; a++)
\r
3741 for (b=0; b<20; b++)
\r
3743 r1=AA_ALPHABET[b];
\r
3744 for ( naa=0,score=0,c=0; c<ns; c++)
\r
3746 s=(ls==NULL)?c:ls[c];
\r
3747 if ( ls && ls[c]==-1) continue;
\r
3748 else if (is_gap(A->seq_al[s][a]))continue;
\r
/* NOTE(review): r2 is taken from the alignment as is; the matrix index
   r2-'A' assumes a character range that read_matrice() covers -- confirm
   for lower-case or non-letter residues. */
3752 r2=A->seq_al[s][a];
\r
3753 score+=mat[r1-'A'][r2-'A'];
\r
3756 if (naa==0)best_r='-';
\r
3757 if ( b==0 || score>best_score){best_score=score; best_r=r1;}
\r
3761 free_int (mat, -1);
\r
/* seq_list2in_file: writes the input file handed to an external method M.
   The first character of M->seq_type, lower-cased, selects the data kind t.
   One path simply returns seq_list2fasta_file(S, list, file). Otherwise
   `file` is opened for writing, `list` is parsed with string2num_list(), and
   for the entries from index 2 up to n either a FASTA record is written or
   the sequence's template file (P, R or G template, chosen by t) is copied
   byte by byte. The file ends with a "TARGET_SEQ_NAME: " line naming the
   listed sequences.
   A seq_type longer than one character triggers the mixed-type warning.
   NOTE(review): the closing of fp2 after the copy loop is not shown. */
3765 int seq_list2in_file ( TC_method *M, Sequence *S, char *list, char *file)
\r
3767 X_template *T=NULL;
\r
3773 t=tolower(M->seq_type[0]);
\r
3777 return seq_list2fasta_file ( S, list, file);
\r
3788 fp=vfopen ( file, "w");
\r
3789 slist=string2num_list (list);
\r
3792 if (strlen (M->seq_type) >1)
\r
3794 add_warning( stderr, "\nERROR: Mixed seq_type not supported for external methods\n[FATAL:%s]", PROGRAM);
\r
3797 for ( a=2; a<n; a++)
\r
3800 if (t=='p')T=(S->T[s])->P;
\r
3801 else if (t=='r')T=(S->T[s])->R;
\r
3802 else if (t=='g')T=(S->T[s])->G;
\r
3806 fprintf ( fp, ">%s\n%s%s", S->name[s], S->seq[s], LINE_SEPARATOR);
\r
3808 else if ( T && T->template_file && T->template_file[0])
\r
3810 fp2=vfopen (T->template_file, "r");
\r
3811 while ( (c=fgetc (fp2))!=EOF)
\r
3813 fprintf ( fp, "%c", c);
\r
3815 fprintf (fp, "%s", LINE_SEPARATOR);
\r
3820 fprintf (fp, "TARGET_SEQ_NAME: ");
\r
3821 for (a=2; a<n; a++)fprintf ( fp, "%s ", (S->name[slist[a]]));
\r
3822 fprintf ( fp, "%s", LINE_SEPARATOR);
\r
3824 vfclose (fp); vfree (slist);
\r
/* seq_list2fasta_file: writes sequences of S to `file` in FASTA form, each
   header carrying decode_name(name, CODE) followed by a raw name.
   Two output loops are visible: one over all S->nseq sequences, and one over
   the entries of `list`, whose first token is the count n and whose next n
   tokens are sequence indexes. */
3832 int seq_list2fasta_file( Sequence *S, char *list, char *file)
\r
3841 /*Buf is used because cmalloced functions cannot go through strtok*/
\r
3845 fp=vfopen ( file, "w");
\r
3848 for ( a=0; a<S->nseq; a++)
\r
3850 fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[a], CODE),S->name[a], S->seq[a]);
\r
3858 if (buf)vfree(buf);
\r
3859 buf=vcalloc ( strlen (list)+1, sizeof (char));
\r
3860 sprintf ( buf, "%s", list);
\r
/* NOTE(review): list was copied into buf just above so that strtok would work
   on the copy, yet strtok is applied to list itself here, which modifies the
   caller's string -- confirm whether buf was intended.
   NOTE(review): the strtok results are passed to atoi without a NULL check. */
3863 n=atoi(strtok (list,SEPARATORS));
\r
3864 for ( a=0; a< n; a++)
\r
3866 s=atoi(strtok (NULL, SEPARATORS));
\r
/* NOTE(review): the header mixes S->name[s] (decoded) with S->name[a] (raw)
   while the sequence is S->seq[s]; the loop over all sequences above uses the
   same index for all three -- confirm that S->name[a] is intended. */
3867 fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[s], CODE), S->name[a],S->seq[s]);
\r
/* seq2struc: copies every residue of every sequence of S into the last field
   (index n_fields-1) of ST->struc. Residue b of sequence a is stored at
   position b+1, i.e. positions in ST are 1-based here.
   NOTE(review): the return statement is not shown; presumably ST. */
3874 Structure * seq2struc ( Sequence *S, Structure *ST)
\r
3878 for ( a=0; a< S->nseq; a++)
\r
3879 for ( b=0; b< S->len[a]; b++)
\r
3880 ST->struc[a][b+1][ST->n_fields-1]=S->seq[a][b];
\r
/* aln2struc: for every row of A, copies the non-gap characters into the last
   field (index n_fields-1) of ST->struc[a][c], where c is a per-row residue
   counter reset to 0 at the start of each row.
   NOTE(review): the statement that advances c is not shown, so whether the
   first residue lands at position 0 or 1 cannot be told from here. */
3884 void aln2struc (Alignment *A, Structure *ST)
\r
3888 for ( a=0; a< A->nseq; a++)
\r
3889 for (c=0, b=0; b< A->len_aln; b++)
\r
3891 if ( !is_gap (A->seq_al[a][b]))
\r
3893 ST->struc[a][c][ST->n_fields-1]=A->seq_al[a][b];
\r
/* stack_aln: appends the rows of B below the rows of A.
   If either argument is NULL the other one is returned unchanged.
   A is enlarged with realloc_aln2() to A->nseq+B->nseq rows and to the
   longest row of either alignment plus one byte; for every row of B the
   comments, aligned sequence, name, file, order pair, score and length are
   copied. Afterwards A->len_aln is the larger of the two lengths, and nseq,
   score_aln and finished are the sums of both alignments' values.
   B itself is not modified or released in the lines shown.
   NOTE(review): rows are not padded, so rows shorter than the new len_aln
   stay shorter -- confirm callers expect that. */
3898 Alignment *stack_aln (Alignment *A, Alignment *B)
\r
3901 int max_len=0, max_nseq=0;
\r
3902 if ( B==NULL)return A;
\r
3903 if ( A==NULL)return B;
\r
3905 max_nseq=A->nseq+B->nseq;
\r
3906 for (a=0; a< A->nseq; a++)max_len=MAX(strlen(A->seq_al[a]),max_len);
\r
3907 for (a=0; a< B->nseq; a++)max_len=MAX(strlen(B->seq_al[a]),max_len);
\r
3909 A=realloc_aln2 ( A,max_nseq,max_len+1);
\r
3911 for (a=A->nseq,b=0; b< B->nseq; b++, a++)
\r
3913 sprintf ( A->seq_comment[a] , "%s", B->seq_comment[b]);
\r
3914 sprintf ( A->aln_comment[a] , "%s", B->aln_comment[b]);
\r
3916 sprintf ( A->seq_al [a] , "%s", B->seq_al [b]);
\r
3917 sprintf ( A->name [a] , "%s", B->name[b]);
\r
3918 sprintf ( A->file [a], "%s" , B->file[b]);
\r
3919 A->order[a][0]=B->order[b][0];
\r
3920 A->order[a][1]=B->order[b][1];
\r
3921 A->score_seq[a]=B->score_seq[b];
\r
3922 A->len[a]=B->len[b];
\r
3925 A->len_aln=MAX(A->len_aln, B->len_aln);
\r
3926 A->nseq=A->nseq+B->nseq;
\r
3927 A->score_aln=A->score_aln+B->score_aln;
\r
3929 A->finished=A->finished+B->finished;
\r
/* chseqIaln: places a fragment of sequence seq_n of S into row seqIaln of A.
   The fragment is extract_char(S->seq[seq_n], start, len). A is (re)sized
   with realloc_aln2() so that it has at least seqIaln+1 rows and room for
   the fragment; A may be NULL on entry, in which case the size is derived
   from the fragment alone.
   The row records order = {seq_n, start} and the given name; nseq and
   len_aln are then refreshed.
   NOTE(review): the release of the extracted fragment is not shown. */
3933 Alignment *chseqIaln(char *name, int seq_n, int start,int len,Sequence *S, int seqIaln, Alignment *A)
\r
3937 seq=extract_char ( S->seq[seq_n], start, len);
\r
3938 A=realloc_aln2 (A, (A==NULL)?(seqIaln+1):MAX(A->nseq,seqIaln+1), ((A==NULL)?(strlen (seq)):MAX(strlen (seq),A->len_aln))+1);
\r
3941 sprintf ( A->seq_al[seqIaln], "%s",seq);
\r
3944 A->order[seqIaln][0]=seq_n;
\r
3945 A->order[seqIaln][1]=start;
\r
3946 sprintf ( A->name[seqIaln], "%s", name);
\r
3947 A->nseq=MAX(A->nseq, seqIaln+1);
\r
3948 A->len_aln=return_maxlen(A->seq_al, A->nseq);
\r
/* aln_gap2random_aa: replaces every gap character of A with a residue drawn
   with rand() from an alphabet chosen by (A->S)->type: a 20-letter string for
   "PROTEIN", "agct" for "DNA" or "RNA".
   NOTE(review): the protein alphabet contains 'u' and has no 'd', unlike the
   20 standard one-letter amino-acid codes -- confirm this is intended.
   NOTE(review): l is not assigned in the lines shown (presumably the length
   of alp); a sequence type matching neither branch would leave it unset. */
3954 Alignment * aln_gap2random_aa(Alignment *A)
\r
3959 if (strm ( (A->S)->type, "PROTEIN"))
\r
3960 sprintf ( alp, "acefghiklmnpqrstuvwy");
\r
3961 else if ( strm ( (A->S)->type, "DNA") ||strm ( (A->S)->type, "RNA") )
\r
3962 sprintf ( alp, "agct");
\r
3966 for (a=0; a<A->nseq; a++)
\r
3967 for ( b=0; b<A->len_aln; b++)
\r
3968 if ( is_gap (A->seq_al[a][b]))A->seq_al[a][b]=alp[(int)rand()%(l)];
\r
/* make_random_aln: sizes A for nseq rows of len characters with
   realloc_aln2(), labels the file field of the rows A currently has as
   "random alignment", then calls add_random_sequence2aln() nseq times with
   the given alphabet.
   NOTE(review): the assignments to A->nseq and A->len_aln between these
   steps are not shown -- confirm the row count the labelling loop sees. */
3972 Alignment * make_random_aln(Alignment *A,int nseq, int len, char *alphabet)
\r
3977 A=realloc_aln2(A, nseq, len+1);
\r
3981 for ( a=0; a< A->nseq; a++)sprintf ( A->file[a], "random alignment");
\r
3982 for ( a=0; a< nseq; a++)
\r
3983 A=add_random_sequence2aln(A,alphabet);
\r
/* add_random_sequence2aln: fills row A->nseq (one past the current last row)
   with A->len_aln characters drawn with rand() from alphabet, after growing
   A with realloc_alignment2(). If that row has no name yet, a 10-character
   random name is generated from the same alphabet and terminated.
   NOTE(review): an empty alphabet makes n zero and rand()%n a division by
   zero -- confirm callers never pass "".
   NOTE(review): the increment of A->nseq is not shown. */
3986 Alignment * add_random_sequence2aln( Alignment *A, char *alphabet)
\r
3992 n=strlen(alphabet);
\r
3993 A=realloc_alignment2 (A, A->nseq+1, A->len_aln+1);
\r
3995 for ( a=0; a< A->len_aln; a++)A->seq_al[A->nseq][a]=alphabet[rand()%n];
\r
3996 if (! A->name[A->nseq][0])
\r
3998 for ( a=0; a<10; a++)A->name[A->nseq][a]=alphabet[rand()%n];
\r
3999 A->name[A->nseq][a]='\0';
\r
/* get_defined_residues: returns NULL when A or A->S is missing; otherwise
   works on a duplicate S of A->S in which every residue is first overwritten
   with UNDEFINED_RESIDUE. For each row of A, the residues that
   A->seq_cache maps onto the source sequence are restored from A->S, so only
   residues actually present in the alignment stay defined.
   buf is a scratch copy of the current row (len_aln+1 bytes). */
4006 Sequence *get_defined_residues( Alignment *A)
\r
4010 int a, b, s, l, r;
\r
4011 if ( !A || !A->S) return NULL;
\r
4013 S=duplicate_sequence (A->S);
\r
4014 for ( a=0; a< S->nseq; a++)
\r
4015 for ( b=0; b< S->len[a]; b++)S->seq[a][b]=UNDEFINED_RESIDUE;
\r
4016 buf=vcalloc(A->len_aln+1,sizeof (char));
\r
4017 for ( a=0; a< A->nseq; a++)
\r
4019 sprintf ( buf, "%s",A->seq_al[a]);
\r
4024 for ( b=1; b<= l; b++)
\r
4026 r=A->seq_cache[s][b];
\r
/* NOTE(review): r==0 passes this test and then indexes element r-1 == -1;
   confirm whether the test should be r>0. */
4028 if ( r>=0)S->seq[s][r-1]=(A->S)->seq[s][r-1];
\r
/* thread_defined_residues_on_aln: walks every cell of A; where the cell's
   residue maps through A->seq_cache to a position r2 (0-based after the -1)
   that holds UNDEFINED_RESIDUE in S1, the alignment cell is overwritten with
   UNDEFINED_RESIDUE as well.
   NOTE(review): how s and r are derived for each row and cell is not shown. */
4034 Alignment *thread_defined_residues_on_aln ( Alignment *A, Sequence *S1)
\r
4038 for ( a=0; a< A->nseq; a++)
\r
4042 for (b=0;b< A->len_aln; b++)
\r
4044 gap=is_gap(A->seq_al[a][b]);
\r
4049 r2=A->seq_cache[s][r]-1;
\r
4051 if (r2>=0 && S1->seq[s][r2]==UNDEFINED_RESIDUE)
\r
4052 A->seq_al[a][b]=UNDEFINED_RESIDUE;
\r
/* trim_aln_borders: processes nseq pairs of aligned strings (seq1[a],
   seq2[a]). Each pair is copied into two scratch buffers sized by the
   longest string of either list plus one; when str_overlap(buf1, buf2, '*')
   is non-zero, the characters of the scratch buffers are written back, in
   order, over the non-gap positions of the original strings (gap positions
   are left untouched).
   NOTE(review): the transformation applied to the buffers between the copy
   and str_overlap(), and the value returned, are not shown. */
4059 int ** trim_aln_borders (char **seq1, char **seq2, int nseq)
\r
4061 int a, b, c,l1,l2;
\r
4069 max=MAX(get_longest_string (seq1,-1, NULL, NULL),get_longest_string (seq2,-1, NULL, NULL))+1;
\r
4070 buf1=vcalloc ( max, sizeof(char));
\r
4071 buf2=vcalloc ( max, sizeof(char));
\r
4073 for ( a=0; a< nseq; a++)
\r
4075 sprintf ( buf1, "%s", seq1[a]);
\r
4076 sprintf ( buf2, "%s", seq2[a]);
\r
4083 if (str_overlap ( buf1, buf2,'*')!=0)
\r
4085 l1=strlen ( seq1[a]);
\r
4086 l2=strlen ( seq2[a]);
\r
4087 for ( b=0,c=0; c< l1; c++)
\r
4088 if ( !is_gap(seq1[a][c]))seq1[a][c]=buf1[b++];
\r
4090 for ( b=0,c=0; c< l2; c++)
\r
4091 if ( !is_gap(seq2[a][c]))seq2[a][c]=buf2[b++];
\r
/* merge_seq: merges the sequences of IN into OUT.
   OUT==NULL: a duplicate of IN is returned.
   A duplicated name inside IN is fatal (message on stderr, then myexit).
   Otherwise every sequence of IN is passed to add_sequence(); a NULL result
   from it makes this function return NULL.
   NOTE(review): IN is tested for NULL in the duplicate check but dereferenced
   unconditionally in the loop below -- confirm callers never pass NULL. */
4100 Sequence * merge_seq ( Sequence *IN, Sequence *OUT)
\r
4104 if ( OUT==NULL)return duplicate_sequence (IN);
\r
4107 if ( IN && check_list_for_dup( IN->name, IN->nseq))
\r
4109 fprintf ( stderr, "\nERROR: %s is duplicated in file %s[FATAL]\n", check_list_for_dup( IN->name, IN->nseq), IN->file[0]);
\r
4110 myexit (EXIT_FAILURE);
\r
4112 for ( a=0; a< IN->nseq; a++)
\r
4113 if ((OUT=add_sequence ( IN, OUT, a))==NULL)return NULL;
\r
/* seq_name2removed_seq_name: for every sequence of S whose name is absent
   from NA, searches the rows of NA for the one with the smallest diff[a][rb]
   (rb being that row's index in S) and appends " <name>" to the
   seq_comment of the selected row s.
   NOTE(review): rb is -1 when a name of NA is not found in S, and is used as
   an index without a visible check -- confirm.
   NOTE(review): the strcat calls assume seq_comment has room for the added
   names -- confirm the buffer size. */
4118 Alignment *seq_name2removed_seq_name(Sequence *S, Alignment *NA, float **diff)
\r
4122 for (a=0; a< S->nseq; a++)
\r
4124 if (name_is_in_list( S->name[a], NA->name, NA->nseq, 100)!=-1) continue;
\r
4125 for ( min_diff=100, s=0, b=0; b< NA->nseq; b++)
\r
4127 rb=name_is_in_list ( NA->name[b], S->name, S->nseq, 100);
\r
4128 if ( diff[a][rb]<min_diff)
\r
4131 min_diff=diff[a][rb];
\r
4135 strcat ( NA->seq_comment[s], " ");
\r
4136 strcat ( NA->seq_comment[s], S->name[a]);
\r
/* seq_name2index: returns -1 when S is NULL, otherwise the result of
   name_is_in_list() for name among the S->nseq names of S (compared with a
   length limit of MAXNAMES+1). */
4144 int seq_name2index (char *name, Sequence *S)
\r
4146 if ( !S) return -1;
\r
4147 else return name_is_in_list ( name, S->name, S->nseq, MAXNAMES+1);
\r
/* seq_name2coor: parses a name of the form name<sep>start<sep>end.
   start[0] and end[0] are set to 0 first. If s contains no separator it is
   returned untouched. Otherwise the text after the first separator is
   collected into n1 and, if a second separator follows, the text after it
   into n2; non-empty fields are converted with atoi() into start[0] and
   end[0]. The string s is modified in place (separators are overwritten
   with '\0').
   NOTE(review): n1 and n2 hold 100 bytes and are filled without a bound
   check -- confirm that fields are always shorter. */
4149 char * seq_name2coor ( char *s, int *start, int *end, char sep)
\r
4151 /*name|start|end */
\r
4152 char n1[100], n2[100];
\r
4153 int a=0, b=0, c=0;
\r
4156 start[0]=end[0]=0;
\r
4158 while ( s[a]!=sep && s[a]!='\0')a++;
\r
4159 if ( s[a]=='\0')return s;
\r
4165 while ( s[a]!=sep && s[a]!='\0')n1[b++]=s[a++];
\r
4167 if ( s[a]=='\0'){n1[b]='\0';if ( n1[0])start[0]=atoi(n1);return s;}
\r
4168 else s[a++]=n1[b]='\0';
\r
4171 while ( s[a]!=sep && s[a]!='\0')n2[c++]=s[a++];
\r
4175 if ( n1[0])start[0]=atoi(n1);
\r
4176 if ( n2[0])end[0]=atoi(n2);
\r
/* extract_one_seq: extracts one row of the alignment S (optionally a
   sub-range) into a new Sequence, by writing a temporary FASTA file and
   reading it back with get_fasta_sequence_num().
   n: "#..." selects S->nseq; otherwise a row name; otherwise a non-zero
   number. Anything else is fatal.
   start/end: 1-based inclusive range; 0/0 means the whole row, end==0 means
   from start to the end of the row.
   keep_name: when 0 and a range is given, the header becomes
   name_start_end.
   NOTE(review): seq is computed 1-based (index+1, or S->nseq) but is used
   directly as a row index below; the adjustment is not shown -- confirm.
   NOTE(review): start and end are not checked against the row length. */
4182 Sequence *extract_one_seq(char *n,int start, int end, Alignment *S, int keep_name)
\r
4191 if ( n[0]=='#')seq=S->nseq;
\r
4192 else if ( (seq=name_is_in_list (n, S->name, S->nseq, 100)+1)!=0);
\r
4193 else if (is_number (n) && (seq=atoi(n))!=0) seq=atoi(n);
\r
4196 fprintf ( stderr, "\nCould not find Sequence %s [FATAL]", n);
\r
4197 myexit (EXIT_FAILURE);
\r
4201 name=vtmpnam ( NULL);
\r
4202 fp=vfopen ( name, "w");
\r
4203 if ( start && end &&!keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start, end);
\r
4204 else if ( start && end==0 && !keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start,(int)strlen ( S->seq_al[seq]));
\r
4205 else fprintf (fp, ">%s\n", S->name[seq]);
\r
4207 if ( start==0 && end==0){fprintf (fp, "%s\n", S->seq_al[seq]);}
\r
4208 else if (end==0){fprintf (fp, "%s\n", S->seq_al[seq]+start-1);}
\r
4211 for ( a=start-1; a<end; a++){fprintf ( fp, "%c", S->seq_al[seq][a]);}
\r
4212 fprintf ( fp, "\n");
\r
4217 OUT_S=get_fasta_sequence_num (name, NULL);
\r
/* extract_sub_seq: for every sequence of S that is also named in COOR, reads
   "start end" from the matching COOR->seq_comment, shifts the residues
   start..end (1-based, inclusive) to the front of S->seq[a], terminates the
   string and copies the comment across. S is finally reordered to the name
   order of COOR with reorder_seq().
   NOTE(review): the sscanf result is not checked, so start and end may be
   unset when the comment does not hold two integers.
   NOTE(review): no update of S->len[a] is shown -- confirm. */
4224 Sequence * extract_sub_seq( Sequence *COOR, Sequence *S)
\r
4229 for ( a=0; a< S->nseq; a++)
\r
4231 if ( (s=name_is_in_list ( S->name[a], COOR->name, COOR->nseq, 100))!=-1)
\r
4234 sscanf ( COOR->seq_comment[s], "%d %d", &start, &end);
\r
4235 for (c=0,b=start-1; b< end; b++, c++)S->seq[a][c]=S->seq[a][b];
\r
4236 S->seq[a][c]='\0';
\r
4237 sprintf ( S->seq_comment[a], "%s",COOR->seq_comment[s]);
\r
4241 S=reorder_seq ( S, COOR->name, COOR->nseq);
\r
/* aln_column2string: returns column p of A as a newly allocated string of
   A->nseq characters (one per row) plus terminator.
   A column index at or beyond A->len_aln is fatal.
   NOTE(review): a negative p is not rejected in the lines shown. */
4247 char * aln_column2string (Alignment *A, int p)
\r
4251 if (p>=A->len_aln)
\r
4253 HERE ("ERROR: index (p=%d) loger than aln (l=%d) [FATAL]", p, A->len_aln);
\r
4254 exit (EXIT_FAILURE);
\r
4258 s=vcalloc (A->nseq+1, sizeof (char));
\r
4259 for (a=0; a< A->nseq; a++)s[a]=A->seq_al[a][p];
\r
/* fix_aln_seq: builds A->seq_cache, the residue correspondence between each
   sequence of S and the row of A carrying the same name.
   S==NULL: A is returned unchanged.
   The cache is allocated with one row per sequence of S and initialised to -1
   over the first len_aln entries. For a name match, upper-cased copies of
   the aligned row and of the S sequence are compared: when identical the
   mapping is the identity (c+1 -> c+1); otherwise the two are aligned with
   align_two_sequences() and the mapping is read from that pairwise
   alignment. A pair whose identity falls below 20% is treated as two
   different sequences sharing a name, which is fatal. */
4263 Alignment * fix_aln_seq ( Alignment *A, Sequence *S)
\r
4266 char *buf1, *buf2;
\r
4267 int g0, g1, nr0, nr1;
\r
4272 /*This function establishes the correspondance between every (1..N+1) residue of each aligned sequence
\r
4273 and its correspondance in S:
\r
4274 A->seq_cache[a][b]=x means that residue b of aligned sequence a corresponds to residue x of the sequence with tye same index in S
\r
4275 A->seq_cache[a][b]=0 means there is no correspondance.
\r
4276 a is the index of the sequence
\r
4277 Applying this function is needed for turning an alignment into a constraint list
\r
4281 if ( S==NULL)return A;
\r
// NOTE(review): no release of a previously built A->seq_cache is shown before it is replaced here.
4283 A->seq_cache=declare_int ( S->nseq, MAX((A->len_aln+1), S->max_len+1));
\r
4285 for (a=0; a< S->nseq; a++)
\r
4286 for ( b=0; b< A->len_aln; b++)A->seq_cache[a][b]=-1;
\r
4289 for ( a=0; a< S->nseq; a++)
\r
4291 for (b=0; b< A->nseq; b++)
\r
4293 if (strm ( S->name[a], A->name[b]))
\r
4298 buf1=vcalloc ( A->len_aln+1, sizeof (char));
\r
4299 sprintf (buf1, "%s", A->seq_al[b]);
\r
4301 upper_string (buf1);
\r
4304 buf2=vcalloc (strlen(S->seq[a])+1, sizeof (char));
\r
4305 sprintf (buf2, "%s",S->seq[a]);
\r
4307 upper_string (buf2);
\r
4311 if ( strm (buf1,buf2))
\r
4314 for ( c=0; c<S->len[a]; c++)A->seq_cache[a][c+1]=c+1;
\r
4319 B=align_two_sequences (buf2,buf1,"blosum62mt",-4,-1, "myers_miller_pair_wise");
\r
4320 if ( getenv ("DEBUG_RECONCILIATION"))
\r
4322 fprintf (stderr, "\n[DEBUG_RECONCILIATION:fix_aln_seq]\nReconciliation of %s\nA=Ref_sequence\nB=New_seq", S->name[a]);
\r
4326 for (id=0, tot=0,nr0=0,nr1=0,c=0; c<B->len_aln; c++)
\r
4328 g0=is_gap(B->seq_al[0][c]);
\r
4329 g1=is_gap(B->seq_al[1][c]);
\r
4335 id+=(B->seq_al[0][c]==B->seq_al[1][c])?1:0;
\r
4336 A->seq_cache[a][nr1]=nr0;
\r
4338 else if (g0 && !g1)
\r
4340 A->seq_cache[a][nr1]=0;
\r
// NOTE(review): tot is the divisor here; confirm it cannot be 0 when the pairwise alignment has no matched column.
4343 if ( ((id*100)/tot)<20)
\r
4346 fprintf ( stderr, "\nTwo different sequences have the same name: %s", S->name[a]);
\r
4347 fprintf ( stderr, "\nIf %s is a PDBID, Make sure it identifies the right chain (A, B, 1, 2...)", S->name[a]);
\r
4348 fprintf ( stderr, "\nChain number or index must be added to the PDB id (i.e. 1gowA)");
\r
4349 fprintf ( stderr, "\nIf You want to use %s anyway, rename it with a non-PDB identifier such as seq_%s\n",S->name[a],S->name[a]);
\r
4350 myexit (EXIT_FAILURE);
\r
4353 free_sequence ( B->S, -1);
\r
4360 vfree(buf1);vfree(buf2);
\r
/* add_prf2seq: adds a profile stored in `file` to S as one extra sequence.
   A file that is neither an alignment nor a sequence file leaves S unchanged.
   Otherwise an R template is built with fill_R_template(), its alignment is
   flagged expand=1, a consensus of A (matrix "blosum62mt") becomes the
   sequence text, and the resulting one-sequence set NS is appended to S with
   add_sequence(); the R template is then attached to the last sequence of S.
   NS and the temporary string array are released at the end.
   NOTE(review): the string returned by aln2cons_seq_mat() is consumed inline
   by sprintf, so nothing here can release it -- confirm ownership.
   NOTE(review): how A is obtained is not shown. */
4364 Sequence * add_prf2seq ( char *file, Sequence *S)
\r
4369 if ( !is_aln (file)&& !is_seq (file))return S;
\r
4376 R=fill_R_template(file,file, S);
\r
4379 ((R->VR)->A)->expand=1;
\r
4380 new_seq=declare_char (1,A->len_aln+1);
\r
4381 sprintf ( new_seq[0], "%s",aln2cons_seq_mat(A, "blosum62mt"));
\r
4383 NS=fill_sequence_struc(1, new_seq,A->file);
\r
4384 S=add_sequence (NS, S, 0);
\r
4385 (S->T[S->nseq-1])->R=R;
\r
4387 free_sequence (NS, NS->nseq);
\r
4388 free_char( new_seq, -1);
\r
/* prf_in_seq: returns 0 for a NULL S, and 1 as soon as one sequence of S has
   an R-template profile according to seq2R_template_profile().
   NOTE(review): the final return for "none found" is not shown;
   presumably 0. */
4393 int prf_in_seq ( Sequence *S)
\r
4397 if ( !S) return 0;
\r
4400 for ( a=0; a< S->nseq; a++)
\r
4401 if (seq2R_template_profile(S, a)) return 1;
\r
/* add_sequence: adds sequence i of IN to OUT and returns OUT.
   - One branch (condition not shown) starts OUT as a duplicate of IN.
   - If the name already appears inside an R-template profile of any
     sequence of OUT, OUT is returned unchanged.
   - Name unknown in OUT: OUT is grown by one slot and name, file, comments,
     sequence text, length and the template record are copied from IN.
   - Name known but the sequence text differs: a warning is issued and the
     two versions are reconciled with build_consensus() (first
     "cfasta_pair_wise", then "myers_miller_pair_wise"); the consensus
     replaces OUT->seq[s] and max_len/min_len/len are updated. If both
     attempts fail the program prints the pairwise alignment and exits.
   NOTE(review): OUT->T[...][0]=IN->T[i][0] copies the template record by
   value, so any pointers inside it end up shared with IN -- confirm. */
4405 Sequence * add_sequence ( Sequence *IN, Sequence *OUT, int i)
\r
4413 OUT=duplicate_sequence (IN);
\r
4416 for (a=0; a<OUT->nseq; a++)
\r
4419 P=seq2R_template_profile (OUT, a);
\r
4421 else if (name_is_in_list (IN->name[i], P->name, P->nseq, 100)!=-1) return OUT;
\r
4424 /*Adds sequence i of IN at the end of OUT*/
\r
4426 if ((s=name_is_in_list ( IN->name[i], OUT->name, OUT->nseq,STRING))==-1 )
\r
4428 OUT=realloc_sequence (OUT, OUT->nseq+1, IN->len[i]);
\r
4429 sprintf ( OUT->name[OUT->nseq],"%s",IN->name[i]);
\r
4430 sprintf ( OUT->file[OUT->nseq],"%s",IN->file[i]);
\r
4431 sprintf ( OUT->seq_comment[OUT->nseq],"%s",IN->seq_comment[i]);
\r
4432 sprintf ( OUT->aln_comment[OUT->nseq],"%s",IN->aln_comment[i]);
\r
4434 sprintf ( OUT->seq[OUT->nseq],"%s",IN->seq[i]);
\r
4435 OUT->len[OUT->nseq]=IN->len[i];
\r
4436 OUT->T[OUT->nseq][0]=IN->T[i][0];
\r
4440 else if ( s!=-1 && !case_insensitive_strcmp ( IN->seq[i], OUT->seq[s]))
\r
4443 if ( getenv4debug("DEBUG_RECONCILIATION"))fprintf ( stderr,"[DEBUG_RECONCILIATION:add_sequence]\n%s\n%s\n", IN->seq[i], OUT->seq[s]);
\r
4445 add_warning (stderr, "WARNING: DISCREPANCY:%s in [%s] and [%s]\n", IN->name[i], IN->file[i], OUT->file[s]);
\r
4448 if (((buf=build_consensus(IN->seq[i], OUT->seq[s],"cfasta_pair_wise" ))!=NULL)||((buf=build_consensus(IN->seq[i], OUT->seq[s],"myers_miller_pair_wise" ))!=NULL))
\r
4451 OUT->max_len=MAX(OUT->max_len, strlen(buf));
\r
4452 OUT->min_len=MIN(OUT->min_len, strlen(buf));
\r
/* The sequence storage is widened to the new max_len before the consensus
   is copied over the old text. */
4453 OUT->seq =realloc_char ( OUT->seq, -1, -1,OUT->nseq,OUT->max_len+1);
\r
4455 sprintf ( OUT->seq[s],"%s",buf);
\r
4456 OUT->len[s]=strlen (buf);
\r
4462 fprintf ( stderr, "IMPOSSIBLE TO RECONCILIATE SOME SEQUENCES[FATAL:%s]\n", PROGRAM);
\r
4463 print_aln ( align_two_sequences (IN->seq[i], OUT->seq[s], "idmat", 0, 0, "fasta_pair_wise"));
\r
4464 myexit (EXIT_FAILURE);
\r
/* trim_seq: builds a new Sequence R holding the sequences of A whose names
   also occur in B. R is declared with the smaller min_len, the larger
   max_len and the smaller nseq of the two inputs; name, sequence text, file,
   both comments and length are copied for every retained entry.
   NOTE(review): the body of the A->nseq>B->nseq branch and the increment of
   R->nseq are not shown. */
4476 Sequence * trim_seq ( Sequence *A, Sequence *B)
\r
4481 if (A->nseq>B->nseq)
\r
4487 R=declare_sequence (MIN(A->min_len,B->min_len), MAX(A->max_len, B->max_len), MIN(A->nseq, B->nseq));
\r
4490 for (a=0; a< A->nseq; a++)
\r
4492 if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING+1)!=-1)
\r
4494 sprintf ( R->name[R->nseq], "%s", A->name[a]);
\r
4495 sprintf ( R->seq[R->nseq], "%s", A->seq[a]);
\r
4496 sprintf ( R->file[R->nseq], "%s", A->file[a]);
\r
4497 sprintf ( R->aln_comment[R->nseq], "%s", A->aln_comment[a]);
\r
4498 sprintf ( R->seq_comment[R->nseq], "%s", A->seq_comment[a]);
\r
4500 R->len[R->nseq]=A->len[a];
\r
/* trim_aln_seq: restricts alignments A and B to the names they share, puts
   both in the order given by A, rebuilds their seq_cache tables to follow the
   new order, and finally gives A and B one shared, merged Sequence
   (merge_seq of SA and SB).
   name_list is a function-static array that is released and rebuilt on every
   call. */
4507 Sequence * trim_aln_seq ( Alignment *A, Alignment *B)
\r
4510 static char **name_list;
\r
4512 Sequence *SA, *SB;
\r
4513 int **cache_A=NULL;
\r
4514 int **cache_B=NULL;
\r
4517 /*This function inputs two alignments A and B
\r
4518 It removes sequences that are not common to both of them
\r
4519 It rearange the sequences so that they are in the same order
\r
4520 A decides on the order
\r
4521 The Sequences (A->S) and (B->S) are treated the same way
\r
4522 Sequences are also merged in order to detects discrepencies.
\r
4523 A pointer to S is returned
\r
4525 if (name_list)free_char (name_list, -1);
\r
4526 name_list=declare_char (MAX(A->nseq, B->nseq), STRING+1);
\r
4528 for ( a=0; a< A->nseq; a++)
\r
4530 if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1)
\r
4532 sprintf ( name_list[n++], "%s", A->name[a]);
\r
4538 reorder_aln ( A, name_list, n);
\r
4539 if (A->seq_cache)cache_A=duplicate_int (A->seq_cache, -1, -1);
\r
4540 if (B->seq_cache)cache_B=duplicate_int (B->seq_cache, -1, -1);
\r
4541 reorder_aln ( B, name_list, n);
\r
4542 for ( a=0; a< n; a++)
\r
// NOTE(review): cache_A and cache_B stay NULL when the matching seq_cache is NULL; confirm the guards around these accesses.
4546 p=A->seq_cache[A->order[a][0]];
\r
4547 A->seq_cache[A->order[a][0]]=cache_A[a];
\r
4552 p=B->seq_cache[B->order[a][0]];
\r
4553 B->seq_cache[B->order[a][0]]=cache_B[a];
\r
4556 A->order[a][0]=B->order[a][0]=a;
\r
4558 free_int(A->seq_cache, -1);
\r
4559 free_int(B->seq_cache, -1);
\r
4561 A->seq_cache=cache_A;
\r
4562 B->seq_cache=cache_B;
\r
4569 A->S=B->S=merge_seq (SA, SB);
\r
/* trim_aln_seq_name: collects, in the order of A, the names that occur in
   both A and B into a freshly declared Sequence S (sized for
   A->nseq+B->nseq entries); only the name field is filled and S->nseq counts
   the names stored.
   NOTE(review): the return statement is not shown; presumably S. */
4572 Sequence * trim_aln_seq_name ( Alignment *A, Alignment *B)
\r
4577 /*This function inputs two alignments A and B
\r
4578 It removes sequences that are not common to both of them
\r
4579 It rearange the sequences so that they are in the same order
\r
4580 A decides on the order
\r
4582 S=declare_sequence ( 1, 1, A->nseq+B->nseq);
\r
4584 for ( a=0; a< A->nseq; a++)
\r
4586 if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1)
\r
4588 sprintf ( S->name[S->nseq++], "%s", A->name[a]);
\r
/* rm_name_tag: scans nseq names for template tags and returns a newly
   allocated list (nseq rows of 100 bytes) in which a row is filled with
   ">name _X_ target" for a name that contains one of the tags.
   A NULL name list yields NULL. The tags searched are either the single
   caller-supplied tag or a built-in set ("_S_", "_G_", ... as shown).
   NOTE(review): tag_list rows hold 4 bytes, so a caller-supplied tag longer
   than 3 characters overflows its row -- confirm callers.
   NOTE(review): template_list rows hold 100 bytes and are filled with
   sprintf without a bound -- confirm name lengths. */
4596 char ** rm_name_tag (char **name, int nseq, char *tag)
\r
4601 char **template_list;
\r
4602 if ( !name )return NULL;
\r
4604 tag_list=declare_char (10, 4);
\r
4608 ntag=1; sprintf ( tag_list[0], "%s", tag);
\r
4613 sprintf ( tag_list[ntag++], "_S_");
\r
4614 sprintf ( tag_list[ntag++], "_G_");
\r
4616 template_list=declare_char (nseq, 100);
\r
4617 for ( a=0; a<nseq ; a++)
\r
4619 for ( b=0; b<ntag; b++)
\r
4621 s=strstr(name[a], tag_list[b]);
\r
4626 sprintf ( template_list[a], ">%s _%s_ %s", name[a], s+1, s+3);
\r
4632 free_char (tag_list, -1);
\r
4633 return template_list;
\r
/* swap_header: for every sequence of S that is also named in H, rebuilds its
   name and comment from the tokens of H->seq_comment[n] as split by
   string2list(): list[0] is read as a count, list[1] and list[2] are
   appended to the H name to form the new S name, and the tokens from
   index 3 up to that count are concatenated into a fresh seq_comment
   buffer. */
4635 Sequence * swap_header ( Sequence *S, Sequence *H)
\r
4639 for ( a=0; a< S->nseq; a++)
\r
4641 if ( (n=name_is_in_list (S->name[a],H->name, H->nseq, 1000))!=-1)
\r
4646 list=string2list (H->seq_comment[n]);
\r
/* NOTE(review): when the count is 1 the loop continues without releasing
   list; list[1] and list[2] below are also used without checking that the
   count is at least 3 -- confirm. */
4647 if ( list==NULL || atoi(list[0])==1)continue;
\r
/* NOTE(review): this assigns '\0' to the pointer itself, not to its first
   character, so the vfree two lines below receives a null pointer and the
   previous comment buffer is not released -- confirm. */
4648 S->seq_comment[a]='\0';
\r
4649 sprintf (S->name[a], "%s%s%s",H->name[n], list[1], list[2]);
\r
4650 vfree ( S->seq_comment[a]);S->seq_comment[a]=vcalloc ( strlen (H->seq_comment[n])+1, sizeof (char));
\r
4651 for (b=3; b< atoi(list[0]); b++)S->seq_comment[a]=strcat (S->seq_comment[a], list[b]);
\r
4652 free_char (list, -1);
\r
/* profile_seq2template_seq: for every sequence of S that carries an
   R-template profile A, runs seq2template_seq() on the profile's own
   sequences (A->S). Because seq2template_seq rewrites the template file it
   is given, each profile works on a temporary copy made with a shell "cp".
   Returns S unchanged when template_file does not exist, and NULL when a
   profile's template assignment fails.
   NOTE(review): the file names are interpolated into a shell command without
   quoting -- confirm they cannot contain spaces or shell metacharacters. */
4659 Sequence * profile_seq2template_seq ( Sequence *S, char *template_file, Fname *F)
\r
4661 /*This function fetches potential templates associated with sequences within a profile*/
\r
4666 if ( !check_file_exists (template_file)) return S;
\r
4667 tmp=vtmpnam (NULL);
\r
4668 for ( i=0; i< S->nseq; i++)
\r
4670 if ( (A=seq2R_template_profile (S, i)))
\r
4672 printf_system ("cp %s %s", template_file, tmp);//seq2template over-writes the temnplate file with a list of the templates effectively encounter
\r
4674 A->S=seq2template_seq (A->S, tmp, F);
\r
4675 if (!A->S)return NULL;
\r
/* seq2template_type: for every sequence of Seq that has a template record,
   queries the eight template kinds (_P_, _S_, _R_, _G_, _F_, _T_, _E_, _U_)
   with seq_has_template() and writes an 8-character type string into
   (Seq->T[a])->seq_type: one letter per kind found, plus a lower-case 's'
   when a structure template exists without a PDB template; absent kinds
   contribute the filler character e (its value is not shown here).
   Sequences with an R-template profile are processed recursively on the
   profile's own Sequence.
   NOTE(review): the _F_ lookup result is not part of the type string written
   below -- confirm that is intended. */
4682 Sequence * seq2template_type(Sequence *Seq)
\r
4687 struct X_template *S=NULL;
\r
4688 struct X_template *P=NULL;
\r
4689 struct X_template *R=NULL;
\r
4690 struct X_template *G=NULL;
\r
4691 struct X_template *F=NULL;
\r
4692 struct X_template *T=NULL;
\r
4693 struct X_template *E=NULL;
\r
4694 struct X_template *U=NULL;
\r
4699 for (a=0; a< Seq->nseq; a++)
\r
4701 if (!Seq->T[a])continue;
\r
4702 //HERE ADD a Template
\r
4703 P=seq_has_template (Seq, a, "_P_");
\r
4704 S=seq_has_template (Seq, a, "_S_");
\r
4705 R=seq_has_template (Seq, a, "_R_");
\r
4706 G=seq_has_template (Seq, a, "_G_");
\r
4707 F=seq_has_template (Seq, a, "_F_");
\r
4708 T=seq_has_template (Seq, a, "_T_");
\r
4709 E=seq_has_template (Seq, a, "_E_");
\r
4710 U=seq_has_template (Seq, a, "_U_");
\r
4713 sprintf ( (Seq->T[a])->seq_type, "%c%c%c%c%c%c%c%c", (P)?'P':e, (S)?'S':e, (S &&!P)?'s':e,(R)?'R':e, (G)?'G':e,(T)?'T':e,(E)?'E':e,(U)?'U':e);
\r
4715 if (R && (A=seq2R_template_profile (Seq,a)))
\r
4718 A->S=seq2template_type ( A->S);
\r
/* string_contains_template_tag: returns the first template tag found in the
   string, testing in the fixed order _P_, _S_, _R_, _G_, _F_, _T_, _E_, _U_.
   The value returned is a string literal and must not be freed or modified.
   NOTE(review): the tests use a local `string` whose derivation from
   string_in, and the result when no tag is present, are not shown. */
4724 char * string_contains_template_tag (char *string_in)
\r
4728 if ( strstr (string, "_P_"))return "_P_";
\r
4729 if ( strstr (string, "_S_"))return "_S_";
\r
4730 if ( strstr (string, "_R_"))return "_R_";
\r
4731 if ( strstr (string, "_G_"))return "_G_";
\r
4732 if ( strstr (string, "_F_"))return "_F_";
\r
4733 if ( strstr (string, "_T_"))return "_T_";
\r
4734 if ( strstr (string, "_E_"))return "_E_";
\r
4735 if ( strstr (string, "_U_"))return "_U_";
\r
/* check_blast_is_installed: verifies that the BLAST client required by the
   selected server is available.
   "EBI":   nothing is checked (empty statement).
   "NCBI":  the NCBI web-BLAST client is required.
   "LOCAL": the local NCBI BLAST program is required.
   Both checks use INSTALL_OR_DIE.
   NOTE(review): the value returned for "EBI" or an unknown server is not
   shown. */
4739 static int check_blast_is_installed (char *server);
\r
4743 static int check_blast_is_installed (char *server)
\r
4745 if (strm (server, "EBI"));
\r
4746 else if ( strm (server, "NCBI"))
\r
4747 return check_program_is_installed (NCBIWEBBLAST_4_TCOFFEE,NULL, NULL,NCBIWEBBLAST_ADDRESS, INSTALL_OR_DIE);
\r
4748 else if ( strm (server, "LOCAL"))
\r
4749 return check_program_is_installed (NCBIBLAST_4_TCOFFEE,NULL, NULL,NCBIBLAST_ADDRESS, INSTALL_OR_DIE);
\r
/* vremove_seq_template_files: runs handle_seq_template_file() on S in
   "remove" mode and returns its result. */
4754 Sequence * vremove_seq_template_files(Sequence *S)
\r
4756 return handle_seq_template_file (S, "remove");
\r
/* display_seq_template_files: runs handle_seq_template_file() on S in
   "display" mode and returns its result. */
4758 Sequence * display_seq_template_files(Sequence *S)
\r
4760 return handle_seq_template_file (S, "display");
\r
/* handle_seq_template_file: applies handle_X_template_files() in the given
   mode to the P, F, R, T and E templates of every sequence of S.
   NOTE(review): how T is obtained for each sequence (and whether a missing
   template record is skipped) is not shown; the S, G and U templates are not
   handled in the lines visible here. */
4762 Sequence * handle_seq_template_file (Sequence *S, char *mode)
\r
4767 for (a=0; a< S->nseq; a++)
\r
4772 handle_X_template_files (T->P, mode);
\r
4773 handle_X_template_files (T->F, mode);
\r
4774 handle_X_template_files (T->R, mode);
\r
4775 handle_X_template_files (T->T, mode);
\r
4776 handle_X_template_files (T->E, mode);
\r
/* handle_X_template_files: acts on the files of one template T.
   "remove":  removes T->template_file and T->template_name with vremove().
   "display": if the file named T->template_name exists, reports it through
              display_output_filename() under the label
              "Template <type name>".
   Any other mode is fatal (printf_exit).
   NOTE(review): the handling of a NULL T is not shown -- confirm, since the
   caller passes template slots that may be unset. */
4782 int handle_X_template_files ( X_template *T, char *mode)
\r
4786 if ( strm (mode, "remove"))
\r
4788 vremove (T->template_file);
\r
4789 vremove (T->template_name);
\r
4791 else if (strm (mode, "display"))
\r
4794 sprintf ( buf, "Template %s", template_type2type_name (T->template_type));
\r
4795 if (check_file_exists (T->template_name))display_output_filename ( stdout,buf,T->template_format,T->template_name, STORE);
\r
4799 printf_exit (EXIT_FAILURE, stderr, "\nUnkonwn mode %s for template handling [FATAL:%s]", mode, PROGRAM);
\r
4803 Sequence * seq2template_seq ( Sequence *S, char *template_list, Fname *F)
\r
4805 /*Expected format for the template file:
\r
4806 >seq_name _X_ Target_template
\r
4807 X: S for Structures
\r
4808 G for genomes (Exoset)
\r
4809 When alternative templates are given for a sequence, the first one superseeds all the others
\r
4812 /*Fill the sequences*/
\r
4813 /*1: No template*/
\r
4819 char *pdb_db,*prot_db;
\r
4821 int remove_template_file=0;
\r
4823 remove_template_file=get_int_variable ("remove_template_file");
\r
4824 server=get_string_variable ("blast_server");
\r
4825 pdb_db=get_string_variable ("pdb_db");
\r
4826 prot_db=get_string_variable ("prot_db");
\r
4828 PmI=get_int_variable ("pdb_min_sim");
\r
4829 PMI=get_int_variable ("pdb_max_sim");
\r
4830 PmC=get_int_variable ("pdb_min_cov");
\r
4832 BmI=get_int_variable ("prot_min_sim");
\r
4833 BMI=get_int_variable ("prot_max_sim");
\r
4834 BmC=get_int_variable ("prot_min_cov");
\r
4836 if ( (template_list && template_list[0]=='\0') || strm ( template_list, "no_template"))
\r
4840 else if ( strstr (template_list, "MODE_"))//pre_set mode
\r
4842 return seq2template_seq ( S,template_list+strlen ("MODE_"),F);
\r
4844 else if ( strm ( template_list, "SSP")|| strm ( template_list, "GOR"))
\r
4847 /*use GOR to Predict the secondary structure*/
\r
4848 check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE);
\r
4849 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#ssp_template@seq#%s/%s@obs#%s/%s@cache#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir());
\r
4850 S=seq2template_seq (S,buf, F);
\r
4853 else if ( strm ( template_list, "PSISSP") || strm (template_list, "PSIGOR"))
\r
4856 /*Computes a GOR consensus on a psi-blast output*/
\r
4857 check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE);
\r
4858 check_blast_is_installed(server);
\r
4860 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psissp_template@seq#%s/%s@obs#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir(), BmI,BMI,BmC,server);
\r
4861 S=seq2template_seq (S,buf, F);
\r
4864 else if ( strm ( template_list, "TM"))
\r
4867 /*predict transmembrane structure*/
\r
4868 check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE);
\r
4869 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#tm_template@arch#%s/%s@psv#%s/%s@type#_T_",get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv");
\r
4870 S=seq2template_seq (S,buf, F);
\r
4873 else if ( strm ( template_list, "PSITM"))
\r
4876 /*predict transmembrane structure*/
\r
4877 check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE);
\r
4878 check_blast_is_installed(server);
\r
4880 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psitm_template@arch#%s/%s@psv#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_T_",get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv",get_cache_dir(), BmI,BMI,BmC,server);
\r
4881 S=seq2template_seq (S,buf, F);
\r
4885 else if (strm ( template_list, "PSIBLAST"))
\r
4887 check_blast_is_installed(server);
\r
4888 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psiprofile_template@database#%s@method#psiblast@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server);
\r
4889 S=seq2template_seq (S,buf, F);
\r
4893 else if (strm ( template_list, "BLAST") )
\r
4895 check_blast_is_installed(server);
\r
4896 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#profile_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server);
\r
4897 S=seq2template_seq (S,buf, F);
\r
4901 else if ( strm ( template_list, "EXPRESSO") || strm (template_list, "PDB"))
\r
4903 check_blast_is_installed(server);
\r
4907 for (i= 0; i < S->len[0]; ++i)
\r
4909 isRNA = (isRNA || is_rna(S->seq[0][i]));
\r
4914 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastn@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_",pdb_db, get_cache_dir(),PmI,PMI,PmC, server);
\r
4918 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_",pdb_db, get_cache_dir(),PmI,PMI,PmC, server);
\r
4920 return seq2template_seq (S,buf, F);
\r
4923 else if ( strm (template_list, "RCOFFEE") || strm (template_list, "RNA"))
\r
4925 char *file_struc_clac = vtmpnam (NULL);
\r
4926 FILE* struc_calc_f =vfopen(file_struc_clac,"w");
\r
4929 for (i = 0; i< S->nseq; ++i)
\r
4934 fprintf(struc_calc_f,"%s %s\n",S->name[i],S->T[i]->P->template_file);
\r
4938 vfclose(struc_calc_f);
\r
4939 check_program_is_installed (RNAPLFOLD_4_TCOFFEE,NULL, NULL,RNAPLFOLD_ADDRESS, IS_FATAL);
\r
4940 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#RNA_template@type#_F_");
\r
4943 S = seq2template_seq (S,buf,F);
\r
4944 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#calc_rna_template@pdbfile#%s@cache#%s@type#_F_", file_struc_clac,get_cache_dir());
\r
4946 // printf("IN T_\n");
\r
4947 return seq2template_seq (S,buf,F);
\r
4950 /*2: Templates from seqnames (SELF) or named like the sequences (SEQFILE)*/
\r
4951 else if ( strstr (template_list, "SELF_") ||strstr (template_list, "SEQFILE_") )
\r
4957 for (a=0; a< S->nseq; a++)
\r
4960 if ( (p=strstr (template_list,"SELF_")))p=S->name[a];
\r
4961 else if ( strstr (template_list, "SEQFILE_"))p=template_list;
\r
4964 fprintf ( stderr, "\nUnkown mode for Template [FATAL:%s]\n", PROGRAM);
\r
4965 myexit (EXIT_FAILURE);
\r
4968 if ( strstr (template_list, "_P_") && !(S->T[a])->P)
\r
4970 (S->T[a])->P =fill_P_template ( S->name[a], p,S);//PDB
\r
4973 else if ( strstr (template_list, "_S_") && !(S->T[a])->S)(S->T[a])->S =fill_S_template ( S->name[a], p,S);//Sequence
\r
4974 else if ( strstr (template_list, "_R_" )&& !(S->T[a])->R)(S->T[a])->R =fill_R_template ( S->name[a], p,S);//pRofile
\r
4975 else if ( strstr (template_list, "_G_" )&& !(S->T[a])->G)(S->T[a])->G =fill_G_template ( S->name[a], p,S);//Genomic
\r
4976 else if ( strstr (template_list, "_F_" )&& !(S->T[a])->F)(S->T[a])->F =fill_F_template ( S->name[a], p,S);//Fold
\r
4977 else if ( strstr (template_list, "_T_" )&& !(S->T[a])->T)(S->T[a])->T =fill_T_template ( S->name[a], p,S);//Trans Membrane
\r
4978 else if ( strstr (template_list, "_E_" )&& !(S->T[a])->E)(S->T[a])->E =fill_E_template ( S->name[a], p,S);//Secondary Structure
\r
4979 else if ( strstr (template_list, "_U_" )&& !(S->T[a])->U)(S->T[a])->U =fill_U_template ( S->name[a], p,S);//unicode, list template
\r
4985 /*2: Templates comes in a template_file*/
\r
4986 else if ( template_list==NULL || format_is_fasta (template_list))
\r
4991 T=(template_list!=NULL)?get_fasta_sequence (template_list, NULL):S;
\r
4992 for (a=0; a< T->nseq; a++)
\r
4996 if ((i=name_is_in_list(T->name[a], S->name, S->nseq, MAXNAMES))!=-1)
\r
4998 if ( (p=strstr (T->seq_comment[a], " _P_ ")) && !(S->T[i])->P &&( (S->T[i])->P=fill_P_template (S->name[i],p,S)))
\r
5002 else if ( (p=strstr (T->seq_comment[a], " _F_ ")) && !(S->T[i])->F &&( (S->T[i])->F=fill_F_template (S->name[i],p,S)))ntemp++;
\r
5003 else if ( (p=strstr (T->seq_comment[a], " _S_ ")) && !(S->T[i])->S &&( (S->T[i])->S=fill_S_template (S->name[i],p,S)))ntemp++;
\r
5005 else if ( (p=strstr (T->seq_comment[a], " _R_ ")) && !(S->T[i])->R &&( (S->T[i])->R=fill_R_template (S->name[i],p,S)))ntemp++;
\r
5006 else if ( (p=strstr (T->seq_comment[a], " _G_ ")) && !(S->T[i])->G &&( (S->T[i])->G=fill_G_template (S->name[i],p,S)))ntemp++;
\r
5007 else if ( (p=strstr (T->seq_comment[a], " _T_ ")) && !(S->T[i])->T &&( (S->T[i])->T=fill_T_template (S->name[i],p,S)))ntemp++;
\r
5008 else if ( (p=strstr (T->seq_comment[a], " _E_ ")) && !(S->T[i])->E &&( (S->T[i])->E=fill_E_template (S->name[i],p,S)))ntemp++;
\r
5009 else if ( (p=strstr (T->seq_comment[a], " _U_ ")) && !(S->T[i])->U &&( (S->T[i])->E=fill_U_template (S->name[i],p,S)))ntemp++;
\r
5011 if (T!=S)strcat (S->seq_comment[i], T->seq_comment[a]);
\r
5015 if (T!=S)free_sequence (T, -1);
\r
5017 if ( remove_template_file==2 || ntemp==0)
\r
5019 vremove (template_list);
\r
5022 if (template_list)display_output_filename ( stdout, "Template_List","fasta_seq", template_list, STORE);
\r
5026 /*3 Templates are generated with a script*/
\r
5027 else if (strstr (template_list, "SCRIPT_") && get_string_variable ("multi_core") && strstr (get_string_variable ("multi_core"), "templates") && get_nproc()>1)
\r
5029 char *tmp1,*command;
\r
5031 char **temp_file,**seq_file;
\r
5032 int * pid_list, pid, npid, submited;
\r
5033 int nproc, max_nproc;
\r
5035 char outfile[1000];
\r
5036 static char *script;
\r
5040 if (!script)script=vcalloc ( 1000, sizeof(char));
\r
5044 command=vcalloc ( 1000, sizeof (char));
\r
5045 tmp1=vtmpnam (NULL);
\r
5047 A=seq2aln (S,NULL, 0);
\r
5048 string_array_upper(A->seq_al, A->nseq);
\r
5049 output_fasta_seq (tmp1, A);
\r
5050 sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_"));
\r
5052 if ((p=strstr (template_list, "@type#")))
\r
5053 p+=strlen ("@type#");
\r
5056 sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp);
\r
5060 F=parse_fname (S->file[0]);
\r
5061 sprintf (outfile, "%s%s_%s%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp);
\r
5065 nproc=get_nproc();
\r
5066 max_nproc=2*nproc;
\r
5068 script=substitute(script, "@", " -");
\r
5069 script=substitute(script, "#", "=");
\r
5071 temp_file=vcalloc ( A->nseq, sizeof (char*));
\r
5072 seq_file =vcalloc (A->nseq, sizeof (char*));
\r
5073 pid_list =vcalloc (MAX_N_PID, sizeof (int *));
\r
5075 fprintf ( stderr, "\n\t------ Fetch %Templates [Multi Core Mode %d CPUs]\n",get_nproc());
\r
5076 for (npid=0, submited=0,i=0; i<S->nseq; i++)
\r
5079 seq_file[i]=vtmpnam (NULL);
\r
5080 temp_file[i]=vtmpnam (NULL);
\r
5081 fp2=vfopen (seq_file[i], "w");
\r
5082 fprintf ( fp2, ">%s\n%s\n", S->name[i], S->seq[i]);
\r
5088 initiate_vtmpnam (NULL);
\r
5089 if ( strstr (script, "tc_generic_method"))
\r
5091 //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script,seq_file[i],temp_file[i],get_tmp_4_tcoffee());
\r
5092 sprintf ( command, "%s -infile=%s -outfile=%s -tmpdir=%s",script,seq_file[i],temp_file[i],get_tmp_4_tcoffee());
\r
5093 if (strstr (command, "EBI"))get_email ();
\r
5096 //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script,seq_file[i],temp_file[i]);
\r
5097 sprintf ( command, "%s -infile=%s -outfile=%s",script,seq_file[i],temp_file[i]);
\r
5098 command=substitute(command, "@", " ");
\r
5099 my_system ( command);
\r
5100 exit (EXIT_SUCCESS);
\r
5104 pid_list[pid]=npid;
\r
5108 submited=vwait_npid(submited,max_nproc,nproc);
\r
5112 submited=vwait_npid(submited,0,0);
\r
5113 //Concatenate all the files
\r
5114 vremove (outfile);
\r
5115 for (i=0; i<npid; i++) file_cat (temp_file[i],outfile);
\r
5117 //Free the process table
\r
5118 vfree (temp_file);
\r
5123 if ( check_file_exists (outfile) && format_is_fasta(outfile))
\r
5125 S=seq2template_seq (S, outfile, F);
\r
5127 else if (strstr (command, "webblast.pl"))return S;
\r
5131 add_warning (stderr, "\nWARNING: Could not Run %s to find templates[%s]\n",command, PROGRAM);
\r
5139 else if (strstr (template_list, "SCRIPT_"))
\r
5142 char *tmp1,*command;
\r
5144 char outfile[1000];
\r
5145 static char *script;
\r
5149 if (!script)script=vcalloc ( 1000, sizeof(char));
\r
5153 command=vcalloc ( 1000, sizeof (char));
\r
5154 tmp1=vtmpnam (NULL);
\r
5156 A=seq2aln (S,NULL, 0);
\r
5157 string_array_upper(A->seq_al, A->nseq);
\r
5158 output_fasta_seq (tmp1, A);
\r
5159 sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_"));
\r
5160 fprintf ( stderr, "\n");
\r
5161 if ((p=strstr (template_list, "@type#")))
\r
5162 p+=strlen ("@type#");
\r
5165 sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp);
\r
5169 F=parse_fname (S->file[0]);
\r
5170 sprintf (outfile, "%s%s_%s%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp);
\r
5174 script=substitute(script, "@", " -");
\r
5175 script=substitute(script, "#", "=");
\r
5177 if ( strstr (script, "tc_generic_method"))
\r
5179 sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script, tmp1,outfile,get_tmp_4_tcoffee());
\r
5180 if (strstr (command, "EBI"))get_email ();
\r
5182 else sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script, tmp1, outfile);
\r
5184 vremove (outfile);
\r
5185 command=substitute(command, "@", " ");
\r
5187 my_system ( command);
\r
5191 if ( check_file_exists (outfile) && format_is_fasta(outfile))
\r
5193 S=seq2template_seq (S, outfile, F);
\r
5195 else if (strstr (command, "webblast.pl"))return S;
\r
5199 add_warning (stderr, "\nWARNING: Could not Run %s to find templates[%s]\n",command, PROGRAM);
\r
/* seq2template_file: write the templates attached to S into `file`.
 * When `file` is NULL a temporary name obtained from vtmpnam() is used instead.
 * The templates of S itself are written in "w" mode by seq2template_file2(); then,
 * for every sequence that owns an _R_ profile (seq2R_template_profile), the
 * templates of the profile's own sequence set (A->S) are appended in "a" mode.
 * NOTE(review): this listing omits short lines (braces, declarations, the return
 * statement); the returned value is presumably `file` -- confirm. */
5210 char* seq2template_file (Sequence *S, char *file)
\r
5215 if (file==NULL)file=vtmpnam (NULL);
\r
5217 seq2template_file2 (S, file, "w");
\r
5218 for (i=0; i<S->nseq; i++)
\r
5219 if ( (A=seq2R_template_profile (S, i)))
\r
5221 seq2template_file2 (A->S, file, "a");
\r
/* seq2template_file2: open `file` with the fopen-style `mode` and, for each sequence
 * of S, emit one ">name <templates>" line in which every _P_, _R_ and _G_ template
 * appears as a " type file " pair. A sequence whose template list is empty (buf1[0]
 * is 0) produces no line. _S_ templates are deliberately left out (commented code).
 * Returns EXIT_SUCCESS.
 * NOTE(review): buf1/buf2 are filled with unbounded sprintf/strcat; their sizes, the
 * per-sequence reset of buf1 and the closing of fp are on lines elided from this
 * listing -- confirm. */
5226 int seq2template_file2 (Sequence *S, char *file, char *mode)
\r
5232 struct X_template *X;
\r
5234 fp=vfopen ( file, mode);
\r
5235 for ( i=0; i< S-> nseq; i++)
\r
5238 if ( (X=(S->T[i])->P)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
\r
5239 /*if ( (X=(S->T[i])->S)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}*/
\r
5240 if ( (X=(S->T[i])->R)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
\r
5241 if ( (X=(S->T[i])->G)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
/* only sequences that own at least one of the templates above get a header line */
5242 if (buf1[0])fprintf ( fp, ">%s %s\n", S->name[i], buf1);
\r
5245 return EXIT_SUCCESS;
\r
/* seq2n_X_template: count the templates of S selected by `type`.
 * For every sequence, each non-NULL P/F/S/R/G slot adds one to n when
 * strm2(type, tag, "_*_") accepts it -- presumably "type equals the tag or the
 * wildcard"; confirm against strm2. T, E and U slots are not counted here.
 * The return statement is elided from this listing (presumably n). */
5251 int seq2n_X_template ( Sequence *S, char *type)
\r
5255 for (n=0,a=0; a< S->nseq; a++)
\r
5257 if ( strm2 (type, "_P_","_*_") && (S->T[a])->P)n++;
\r
5258 if ( strm2 (type, "_F_","_*_") && (S->T[a])->F)n++;
\r
5259 if ( strm2 (type, "_S_","_*_") && (S->T[a])->S)n++;
\r
5260 if ( strm2 (type, "_R_","_*_") && (S->T[a])->R)n++;
\r
5261 if ( strm2 (type, "_G_","_*_") && (S->T[a])->G)n++;
\r
/* fill_X_template: allocate and initialise a generic template record.
 *   name  - sequence name, copied into X->seq_name
 *   p     - template description; when it contains `token`, the first
 *           whitespace-delimited word after the token becomes X->template_name,
 *           otherwise the whole of p is copied
 *   token - template tag ("_P_", "_F_", ...); copied into X->template_type and used
 *           to pick which type-specific payload (VP, VF, VS, ...) is allocated
 * The record and its payload come from vcalloc (presumably zero-filled like calloc
 * -- confirm). The return statement is elided from this listing (presumably X).
 * NOTE(review): sprintf and sscanf("%s") write into the record without any length
 * bound; the field sizes are declared elsewhere. */
5265 struct X_template *fill_X_template ( char *name, char *p, char *token)
\r
5267 struct X_template *X;
\r
5274 X=vcalloc (1, sizeof (X_template));
\r
5275 sprintf ( X->seq_name, "%s", name);
\r
5276 if ( (k=strstr (p, token)))sscanf (k+strlen(token), "%s",X->template_name);
\r
5277 else sprintf (X->template_name, "%s", p);
\r
5280 /*Add a Structure HERE*/
\r
5281 sprintf ( X->template_type, "%s", token);
\r
5282 if ( strm (token, "_P_"))X->VP=vcalloc (1, sizeof (P_template));
\r
5283 if ( strm (token, "_F_"))X->VF=vcalloc (1, sizeof (F_template));
\r
5285 if ( strm (token, "_S_"))X->VS=vcalloc (1, sizeof (S_template));
\r
5286 if ( strm (token, "_R_"))X->VR=vcalloc (1, sizeof (R_template));
\r
5287 if ( strm (token, "_G_"))X->VG=vcalloc (1, sizeof (G_template));
\r
5288 if ( strm (token, "_T_"))X->VT=vcalloc (1, sizeof (T_template));
\r
5289 if ( strm (token, "_E_"))X->VE=vcalloc (1, sizeof (E_template));
\r
5290 if ( strm (token, "_U_"))X->VU=vcalloc (1, sizeof (U_template));
\r
/* free_X_template: release a template record.
 * The visible calls free the Sequence held by an _S_ payload, the Alignment held by
 * an _R_ payload and the Sequence held by a _G_ payload.
 * NOTE(review): the guards selecting each call, the release of X itself and the
 * return value are on lines elided from this listing; callers in this file discard
 * the result or overwrite their pointer with NULL themselves. */
5295 struct X_template* free_X_template ( struct X_template *X)
\r
5307 free_sequence ((X->VS)->S, -1);
\r
5312 free_aln ((X->VR)->A);
\r
5317 free_sequence ((X->VG)->S, -1);
\r
/* display_sequence_templates: print the P, F, S, R, G, T and E templates of sequence
 * i of S to `io`, one display_X_template() call per slot, threading the stream
 * through each call. The U slot is not displayed by the visible code.
 * The return statement is elided from this listing (presumably io). */
5325 FILE * display_sequence_templates (Sequence *S,int i, FILE *io)
\r
5329 io=display_X_template ( (S->T[i])->P, io);
\r
5331 io=display_X_template ( (S->T[i])->F, io);
\r
5333 io=display_X_template ( (S->T[i])->S, io);
\r
5335 io=display_X_template ( (S->T[i])->R, io);
\r
5336 io=display_X_template ( (S->T[i])->G, io);
\r
5337 io=display_X_template ( (S->T[i])->T, io);
\r
5338 io=display_X_template ( (S->T[i])->E, io);
\r
/* display_X_template: print one template to `io` as
 * "\n\t<type label>: Template=<template_name>, File=<template_file>".
 * A NULL template returns io untouched. Templates whose type is "_S_" are not
 * printed (strm presumably tests string equality -- confirm). */
5343 FILE * display_X_template (struct X_template *X, FILE *io)
\r
5346 if ( !X) return io;
\r
5347 if ( !strm (X->template_type, "_S_"))fprintf (io, "\n\t%s: Template=%s, File=%s",template_type2type_name (X->template_type), X->template_name,X->template_file);
\r
/* template_type2short_type_name: map a template tag to a short lower-case label.
 * A NULL `type` yields ""; otherwise the first tag found inside `type` by strstr, in
 * the order tested below, decides the label (so "x_P_y" also maps to "pdb").
 * NOTE(review): the fallback for a string containing none of the tags is on a line
 * elided from this listing -- confirm what is returned in that case. */
5350 char *template_type2short_type_name (char *type)
\r
5353 if (!type)return "";
\r
5354 else if ( strstr (type, "_P_")) return "pdb";
\r
5355 else if ( strstr (type, "_F_")) return "rfold";
\r
5356 else if ( strstr (type, "_S_")) return "seq";
\r
5357 else if ( strstr (type, "_R_")) return "prf";
\r
5358 else if ( strstr (type, "_G_")) return "genome";
\r
5359 else if ( strstr (type, "_E_")) return "ssp";
\r
5360 else if ( strstr (type, "_T_")) return "tmp";
\r
5361 else if ( strstr (type, "_U_")) return "unicode";
\r
/*
 * template_type2type_name: map a template tag to its human-readable label.
 *
 *   type - string containing a template tag such as "_P_" or "_R_"; may be NULL.
 *
 * Returns "" for a NULL `type` (same convention as template_type2short_type_name,
 * and avoids handing NULL to strstr, which is undefined behaviour). Otherwise the
 * first tag found inside `type`, in the table order below, decides the label.
 * A string containing none of the tags is returned unchanged.
 * NOTE(review): the original fallback line is elided from the listing this block was
 * rebuilt from; returning `type` is the assumed behaviour -- confirm.
 *
 * The "_S_" label was misspelt "Sequeence"; it is corrected here.
 */
char *template_type2type_name (char *type)
{
  /* Order matters: when several tags occur in `type`, the earliest entry wins. */
  static const struct
  {
    const char *tag;
    char *label;
  } labels[] = {
    {"_P_", "PDB struc"},
    {"_F_", "RNA Fold"},
    {"_S_", "Sequence"},
    {"_R_", "Profile"},
    {"_G_", "Genomic"},
    {"_E_", "Protein Secondary Structure"},
    {"_T_", "Protein Trans Membrane Structure "},
    {"_U_", "Unicode and strings"}
  };
  int n_labels = (int) (sizeof (labels) / sizeof (labels[0]));
  int i;

  if (!type) return "";

  for (i = 0; i < n_labels; i++)
    {
      if (strstr (type, labels[i].tag)) return labels[i].label;
    }
  return type;
}
\r
/* fill_F_template: build the _F_ (RNA fold) template of sequence `name` from the
 * description p. The template is kept only when the file named by template_name
 * exists, in which case template_file is set to that same name; otherwise a warning
 * is printed to stderr and the record is released. S is not used by the visible code.
 * NOTE(review): F->template_format is written before the `!F` test below, so that
 * test cannot protect against a NULL result from fill_X_template.
 * NOTE(review): the reset of F after free_X_template and the return statement are on
 * lines elided from this listing -- confirm. */
5378 struct X_template *fill_F_template ( char *name,char *p, Sequence *S)
\r
5380 /*Fold template*/
\r
5381 struct X_template *F;
\r
5383 F=fill_X_template ( name, p, "_F_");
\r
5384 sprintf (F->template_format , "TCOFFEE_LIBRARY");
\r
5385 if (!F || !check_file_exists (F->template_name))
\r
5387 fprintf ( stderr, "\nWARNING: Could Not Fill _F_ (Fold) template for sequence |%s|", name);
\r
5388 free_X_template (F);
\r
/* this second existence test is always true once the branch above was not taken */
5391 else if ( check_file_exists (F->template_name))
\r
5393 sprintf ( F->template_file, "%s", F->template_name);
\r
/* fill_P_template: build the _P_ (PDB structure) template of sequence `name` from
 * the description p, validating it against the sequence stored in S.
 * Steps shown by the visible lines:
 *   1. create the record and tag its format as "pdb";
 *   2. resolve template_name into a usable template_file (existing file, or whatever
 *      is_pdb_struc() returns), then pass it through fix_pdb_file();
 *   3. reject the template when is_pdb_file() fails;
 *   4. record the identifier returned by get_pdb_id() in (P->VP)->pdb_id;
 *   5. read the structure's sequence, align it with the target sequence using
 *      "idmat", and reject the template when similarity or coverage fall below
 *      the "pdb_min_sim" / "pdb_min_cov" variables.
 * Every rejection releases the record with free_X_template and emits a warning
 * (the first one has its warning commented out).
 * NOTE(review): several branch conditions, some `P=NULL` resets and the return
 * statement are on lines elided from this listing; in particular no reset is visible
 * after the free_X_template calls at original lines 5416, 5457 and 5495 -- confirm
 * that P is not used after those frees. */
5401 struct X_template *fill_P_template ( char *name,char *p, Sequence *S)
\r
5403 struct X_template *P;
\r
5410 P=fill_X_template ( name, p, "_P_");
\r
5411 sprintf (P->template_format , "pdb");
\r
5415 //fprintf ( stderr, "\nWARNING: Could Not Fill _P_ template for sequence |%s|", name);
\r
5416 free_X_template (P);
\r
5419 else if ( check_file_exists (P->template_name))
\r
/* template_name names an existing file: keep the path as template_file and reduce
   template_name to what path2filename() returns */
5422 sprintf ( P->template_file, "%s", P->template_name);
\r
5423 buf=path2filename (P->template_name);
\r
5424 if (P->template_name!=buf)
\r
5426 sprintf ( P->template_name, "%s",buf );
\r
5435 st=is_pdb_struc (P->template_name);
\r
5438 if (st!=P->template_file)sprintf ( P->template_file, "%s", st);
\r
5442 /*Make a first run to fix relaxed PDB files*/
\r
5443 buf=fix_pdb_file (P->template_file);
\r
5445 if ( buf!=P->template_file)
\r
5448 sprintf ( P->template_file, "%s",buf);
\r
5452 /*Check the PDB FILE EXISTS*/
\r
5453 if (!is_pdb_file (P->template_file))
\r
5456 add_warning(stderr, "\nWARNING: _P_ Template |%s| Could Not Be Found\n",p);
\r
5457 free_X_template (P);
\r
5462 buf= get_pdb_id (P->template_file);
\r
5463 if (buf!=(P->VP)->pdb_id)
\r
5465 sprintf ((P->VP)->pdb_id, "%s", buf);
\r
5470 /*Check the target sequence is similar enough*/
\r
5472 PS=get_pdb_sequence (P->template_file);
\r
5477 add_warning( stderr, "\nWARNING: _P_ Template |%s| Could Not be Used for Sequence |%s|: Structure Not Found", P->template_name, name);
\r
5478 free_X_template (P);P=NULL;
\r
5482 int minsim=get_int_variable ("pdb_min_sim");
\r
5483 int mincov=get_int_variable ("pdb_min_cov");
\r
/* NOTE(review): i is used as an index below without a visible check for "not found" */
5486 i=name_is_in_list (name, S->name, S->nseq, 100);
\r
5488 A=align_two_sequences (S->seq[i], PS->seq[0],"idmat",-3,0, "fasta_pair_wise");
\r
5489 cov=aln2coverage (A, 0);
\r
5490 sim=aln2sim (A, "idmat");
\r
5494 add_warning( stderr, "\nWARNING: _P_ Template %s Could Not be Used for Sequence %s: Similarity too low [%d, Min=%d]",P->template_name,name, sim, minsim);
\r
5495 free_X_template (P);
\r
5498 else if ( cov<mincov)
\r
5500 add_warning( stderr, "\nWARNING: _P_ Template |%s| Could Not be Used for Sequence |%s|: Coverage too low [%d, Min=%d]", P->template_name,name, cov, mincov);
\r
5501 free_X_template (P);P=NULL;
\r
5504 free_sequence (PS, -1);
\r
/* fill_S_template: build the _S_ (sequence) template of `name`.
 * When p matches name (strm), the sequence is written to a fasta file by
 * output_fasta_seqX() and the name it returns becomes template_file. The payload
 * sequence (S->VS)->S is then read back from template_file.
 * Note that the local `S` is the template record; the sequence set is `Seq`.
 * NOTE(review): when p does not match name, no visible line sets template_file
 * before it is read -- confirm. The return statement is elided (presumably S). */
5510 struct X_template *fill_S_template ( char *name,char *p, Sequence *Seq)
\r
5512 struct X_template *S;
\r
5513 S=fill_X_template ( name, p, "_S_");
\r
5514 if ( strm (name, p))sprintf ( S->template_file, "%s",output_fasta_seqX (NULL,"w",Seq,NULL, seq_name2index (name, Seq)));
\r
5515 (S->VS)->S=get_fasta_sequence (S->template_file, NULL);
\r
/* fill_R_template: build the _R_ (profile) template of sequence `name`.
 * The template file must be readable as an alignment or as sequences (is_aln /
 * is_seq); otherwise a warning is issued and the record released. On success the
 * alignment is read with main_read_aln(); the visible lines then show two outcomes:
 * either the alignment is trimmed against the target sequence
 * (trim_aln_with_seq) and written in clustal format to a temporary file that becomes
 * template_file, or template_file is simply set to template_name. Finally the
 * alignment is converted with aln2profile().
 * NOTE(review): the conditions choosing between those outcomes, and the return
 * statement, are on lines elided from this listing -- confirm. */
5518 struct X_template *fill_R_template ( char *name,char *p, Sequence *S)
\r
5520 /*Profile template*/
\r
5521 struct X_template *R;
\r
5524 R=fill_X_template ( name, p, "_R_");
\r
5525 sprintf (R->template_format , "fasta_aln");
\r
5528 if (!is_aln(R->template_name) && !is_seq (R->template_name))
\r
5531 add_warning ( stderr, "\nWARNING: _R_ Template %s Could Not Be Found\n",R->template_name);
\r
5532 free_X_template (R);
\r
5541 (R->VR)->A=main_read_aln (R->template_name, NULL);
\r
5544 sprintf ( R->template_file, "%s", R->template_name);
\r
5547 s=name_is_in_list(name, S->name, S->nseq, 100);
\r
/* wrap the single target sequence as an ungapped alignment to trim the profile with */
5550 S1=fill_sequence_struc (1, &S->seq[s], &S->name[s]);
\r
5551 A1=seq2aln (S1,NULL, RM_GAP);
\r
5553 (R->VR)->A=trim_aln_with_seq (A1, (R->VR)->A);
\r
5555 sprintf ( R->template_file, "%s", vtmpnam (NULL));
\r
5556 output_clustal_aln (R->template_file, (R->VR)->A);
\r
5559 sprintf ( R->template_file, "%s", R->template_name);
\r
5561 (R->VR)->A=aln2profile ((R->VR)->A);
\r
/* fill_T_template: build the _T_ (trans-membrane) template of sequence `name`.
 * The template file must be readable as an alignment or as sequences; otherwise a
 * warning is issued and the record released. On success the payload sequence
 * (T->VT)->S is loaded with main_read_seq() and template_file is set to
 * template_name. S is not used by the visible code.
 * NOTE(review): the else branch structure and the return statement are on lines
 * elided from this listing. */
5566 struct X_template *fill_T_template ( char *name,char *p, Sequence *S)
\r
5568 /*Trans-membrane template*/
\r
5569 struct X_template *T;
\r
5571 T=fill_X_template ( name, p, "_T_");
\r
5572 sprintf (T->template_format , "fasta_seq");
\r
5574 if (!is_aln(T->template_name) && !is_seq (T->template_name))
\r
5577 add_warning ( stderr, "\nWARNING: _T_ Template %s Could Not Be Found\n",T->template_name);
\r
5578 free_X_template (T);
\r
5584 (T->VT)->S=main_read_seq(T->template_name);
\r
5585 sprintf ( T->template_file, "%s", T->template_name);
\r
/* fill_U_template: build the _U_ ("string list") template of sequence `name`.
 * The file named by template_name must exist; otherwise a warning is issued and the
 * record released. On success only template_file is set: the loading of
 * (U->VU)->list is commented out, so the payload list is left as allocated.
 * S is not used by the visible code.
 * NOTE(review): the else branch structure and the return statement are on lines
 * elided from this listing. */
5590 struct X_template *fill_U_template ( char *name,char *p, Sequence *S)
\r
5592 /*Unicode / string-list template*/
\r
5593 struct X_template *U;
\r
5595 U=fill_X_template ( name, p, "_U_");
\r
5596 sprintf (U->template_format , "string list");
\r
5598 if (!check_file_exists(U->template_name))
\r
5600 add_warning ( stderr, "\nWARNING: _U_ Template %s Could Not Be Found\n",U->template_name);
\r
5601 free_X_template (U);
\r
5606 //(U->VU)->list=file2string(U->template_name);
\r
5607 sprintf ( U->template_file, "%s", U->template_name);
\r
/* fill_E_template: build the _E_ (protein secondary structure) template of sequence
 * `name`. The template file must be readable as an alignment or as sequences;
 * otherwise a warning is issued and the record released. On success the payload
 * sequence (E->VE)->S is loaded with main_read_seq() and template_file is set to
 * template_name. S is not used by the visible code.
 * NOTE(review): the else branch structure and the return statement are on lines
 * elided from this listing. */
5611 struct X_template *fill_E_template ( char *name,char *p, Sequence *S)
\r
5613 /*Secondary-structure template*/
\r
5614 struct X_template *E;
\r
5617 E=fill_X_template ( name, p, "_E_");
\r
5618 sprintf (E->template_format , "fasta_seq");
\r
5620 if (!is_aln(E->template_name) && !is_seq (E->template_name))
\r
5623 add_warning ( stderr, "\nWARNING: _E_ Template %s Could Not Be Found\n",E->template_name);
\r
5624 free_X_template (E);
\r
5629 (E->VE)->S=main_read_seq (E->template_name);
\r
5630 sprintf ( E->template_file, "%s", E->template_name);
\r
/* fill_G_template: build the _G_ (genomic) template of sequence `name`.
 * The template sequence file is obtained in one of three ways:
 *   - p matches name: the sequence itself is dumped from S to a fasta file;
 *   - p contains "SEQFILE_": the file named after "SEQFILE_G_" is read, the entry
 *     called `name` is dumped to a fasta file and template_name is set to `name`;
 *   - otherwise template_name is used directly as template_file.
 * The file must then pass is_seq(); otherwise a warning is issued and the record
 * released. On success the payload (G->VG)->S is read with get_fasta_sequence().
 * NOTE(review): the handling of a name missing from the SEQFILE (i2 test), the
 * else branch structure and the return statement are on lines elided from this
 * listing -- confirm. */
5634 struct X_template *fill_G_template ( char *name,char *p, Sequence *S)
\r
5636 struct X_template *G;
\r
5637 G=fill_X_template ( name, p, "_G_");
\r
5638 sprintf (G->template_format , "fasta_seq");
\r
5640 /*1: Get the sequence from another file if needed*/
\r
5641 if ( strm (name, p))sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",S,NULL, seq_name2index (name, S)));
\r
5642 else if ( strstr (p, "SEQFILE_"))
\r
5648 ST=main_read_seq (after_strstr ( p,"SEQFILE_G_"));
\r
5650 i2=seq_name2index (name, ST);
\r
5653 sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",ST,NULL, i2));
\r
5654 sprintf ( G->template_name, "%s", name);
\r
5656 free_sequence (ST, -1);
\r
5658 else sprintf (G->template_file, "%s", G->template_name);
\r
5661 /*2: Put the template in VG->S*/
\r
5662 if (!is_seq (G->template_file))
\r
5664 add_warning ( stderr, "\nWARNING: _G_ Template %s Could Not Be Found \n",p);
\r
5666 free_X_template (G);
\r
5671 (G->VG)->S=get_fasta_sequence (G->template_file, NULL);
\r
5677 char *seq2T_value ( Sequence *S, int n, char *value, char *type)
\r
5679 static char *rv_buf;
\r
5682 if ( !rv_buf)rv_buf=vcalloc (100, sizeof(char));
\r
5683 if (!(X=seq_has_template (S, n, type)))return NULL;
\r
5686 if (strm (value, "template_file"))return X->template_file;
\r
5687 else if ( strm (value, "template_name"))return X->template_name;
\r
5688 else if ( strm (value, "seq_name"))return X->seq_name;
\r
5689 else if (strm (type, "_P_"))
\r
5691 if ( strm (value, "pdb_id"))return (X->VP)->pdb_id;
\r
5693 else if ( strm (type, "_R_"))
\r
5695 if ( strm (value, "A"))
\r
5697 if ((X->VR)->A){sprintf ( rv_buf, "%d", (int)(X->VR)->A);return rv_buf;}
\r
/* seq2P_pdb_id: return the template_name of the _P_ template of sequence n, or NULL
 * when S has no template table, no entry for n, or no _P_ template.
 * NOTE(review): despite the function name the value returned is template_name, not
 * (VP)->pdb_id. n is not range-checked here. */
5705 char *seq2P_pdb_id (Sequence *S, int n)
\r
5707 if (!S->T || !S->T[n] || !(S->T[n])->P ) return NULL;
\r
5708 else return ((S->T[n])->P)->template_name;
\r
/* seq2P_template_file: convenience wrapper returning the "template_file" attribute
 * of the _P_ template of sequence n, i.e. whatever seq2T_value() returns for it. */
5712 char *seq2P_template_file(Sequence *S, int n)
\r
5715 return seq2T_value (S, n, "template_file", "_P_");
\r
/* profile2P_template_file: look inside the _R_ profile of sequence n and return the
 * _P_ template file of the first profile sequence that has one.
 * Returns NULL when sequence n has no profile. The result when no profile sequence
 * has a _P_ template is on a line elided from this listing (presumably NULL). */
5718 char *profile2P_template_file (Sequence *S, int n)
\r
5724 if ( !(A=seq2R_template_profile (S, n)))return NULL;
\r
5725 for (a=0; a<A->nseq; a++)
\r
5727 if ((p=seq2P_template_file (A->S, a))!=NULL)return p;
\r
/* seq2R_template_profile: return the profile alignment of the _R_ template of
 * sequence n. The address travels as text: seq2T_value(..., "A", "_R_") prints it
 * into a string and atop() turns that string back into a pointer (presumably NULL
 * for a NULL string -- confirm in atop). */
5731 Alignment * seq2R_template_profile (Sequence *S, int n)
\r
5735 return (Alignment *)atop(seq2T_value (S, n, "A", "_R_"));
\r
/* seq2E_template_string: return the first sequence string (seq[0]) stored in the _E_
 * template of sequence n. The result when no _E_ template exists is on a line elided
 * from this listing (presumably NULL). */
5737 char * seq2E_template_string (Sequence *S, int n)
\r
5739 struct X_template *T;
\r
5741 if ( (T=seq_has_template (S, n, "_E_"))!=NULL)
\r
5742 return ((T->VE)->S)->seq[0];
\r
/* seq2U_template: return the `list` field of the _U_ template payload of sequence n.
 * The result when no _U_ template exists is on a line elided from this listing
 * (presumably NULL). */
5747 int* seq2U_template (Sequence *S, int n)
\r
5749 struct X_template *T;
\r
5751 if ( (T=seq_has_template (S, n, "_U_"))!=NULL)
\r
5752 return (T->VU)->list;
\r
/* seq2T_template_string: return the first sequence string (seq[0]) stored in the _T_
 * template of sequence n. The result when no _T_ template exists is on a line elided
 * from this listing (presumably NULL). */
5756 char * seq2T_template_string (Sequence *S, int n)
\r
5758 struct X_template *T;
\r
5760 if ( (T=seq_has_template (S, n, "_T_"))!=NULL)
\r
5761 return ((T->VT)->S)->seq[0];
\r
/* seq_has_template: return the template of kind `mode` attached to sequence n of S.
 * NULL is returned when S or mode is NULL, when n is outside [0, S->nseq), or when
 * the template table or its entry for n is missing. Otherwise `mode` selects one of
 * the slots P, F, S, R, T, E, U, G of the entry; the slot itself may be NULL.
 * NOTE(review): the assignment of the local T (presumably S->T[n]) and the result
 * for an unrecognised mode are on lines elided from this listing -- confirm. */
5766 struct X_template* seq_has_template ( Sequence *S, int n, char *mode)
\r
5770 if ( !S || !mode) return NULL;
\r
5771 else if ( n<0 || n>=S->nseq)return NULL;
\r
5772 else if ( !(S->T)) return NULL;
\r
5773 else if ( !(S->T[n]))return NULL;
\r
5778 if ( strm (mode, "_P_"))return T->P;
\r
5779 else if ( strm (mode, "_F_"))return T->F;
\r
5780 else if ( strm (mode, "_S_"))return T->S;
\r
5781 else if ( strm (mode, "_R_"))return T->R;
\r
5782 else if ( strm (mode, "_T_"))return T->T;
\r
5783 else if ( strm (mode, "_E_"))return T->E;
\r
5784 else if ( strm (mode, "_U_"))return T->U;
\r
5785 else if ( strm (mode, "_G_"))return T->G;
\r
/* name2random_subset: pick n_out names out of the n_in names of in_name.
 * Each input index receives a random key (rand() % max) in column 1 of `list`; the
 * table is sorted on that column and the names referenced by column 0 of the first
 * n_out rows are copied into a freshly declared array of n_out strings of
 * MAXNAMES+1 chars.
 * NOTE(review): the filling of column 0, the value of `max` and the return statement
 * are on lines elided from this listing. Nothing visible prevents n_out > n_in,
 * which would read past the end of `list` -- confirm callers. */
5789 char ** name2random_subset (char **in_name, int n_in, int n_out)
\r
5799 out_name=declare_char (n_out,MAXNAMES+1 );
\r
5800 list=declare_int (n_in, 2);
\r
5802 for (a=0; a<n_in; a++)
\r
5805 list[a][1]=rand ()%max;
\r
5807 sort_int ( list,2, 1, 0, n_in-1);
\r
5809 for ( a=0; a<n_out; a++)
\r
5810 sprintf ( out_name[a], "%s", in_name[list[a][0]]);
\r
5811 free_int (list, -1);
\r
/* aln2random_order: reorder the sequences of A randomly. A random arrangement of all
 * A->nseq names is drawn with name2random_subset() and applied with reorder_aln();
 * the temporary name list is then freed. The return statement is elided from this
 * listing (presumably A). */
5815 Alignment * aln2random_order (Alignment *A)
\r
5820 name_list=name2random_subset (A->name, A->nseq, A->nseq);
\r
5821 A=reorder_aln (A, name_list, A->nseq);
\r
5822 free_char (name_list, -1);
\r
/* aln2jacknife: subsample an alignment in place.
 *   nseq - when non-zero and smaller than A->nseq, keep a random subset of nseq
 *          sequences (name2random_subset + reorder_aln);
 *   len  - when non-zero and smaller than A->len_aln, keep `len` columns: every
 *          column gets a random key, the table is sorted on it and the columns
 *          referenced by the first `len` rows are copied from a copy of A (B) into
 *          columns 0..len-1 of A, whose strings are then terminated at `len`.
 * NOTE(review): the filling of l[a][0], the update of A->len_aln, the release of
 * B and l, and the return statement are on lines elided from this listing. */
5825 Alignment *aln2jacknife (Alignment *A, int nseq, int len)
\r
5829 if (nseq!=0 && nseq<A->nseq)
\r
5833 name=name2random_subset (A->name, A->nseq, nseq);
\r
5834 A=reorder_aln (A, name, nseq);
\r
5835 free_char (name, -1);
\r
5838 if (len!=0 && len<A->len_aln)
\r
5843 l=declare_int (A->len_aln, 2);
\r
5844 for (a=0; a< A->len_aln; a++)
\r
5847 l[a][1]=rand()%(A->len_aln*1000);
\r
5849 sort_int ( l,2, 1, 0, A->len_aln-1);
/* read from the untouched copy B so already overwritten columns of A are not reused */
5850 B=copy_aln (A, NULL);
\r
5851 for ( a=0; a< len; a++)
\r
5853 for ( b=0; b<A->nseq; b++)
\r
5855 A->seq_al[b][a]=B->seq_al[b][l[a][0]];
\r
5858 for (b=0; b<A->nseq; b++)A->seq_al[b][len]='\0';
\r
/* aln2scramble_seq: shuffle which aligned string goes with which sequence slot.
 * Random keys are drawn per sequence and sorted; the seq_al pointers are saved in
 * name_list (which, despite its name, holds sequence strings) and reassigned in the
 * sorted order, leaving the names in place. The result is finally passed through
 * aln2random_order().
 * NOTE(review): the filling of list[a][0] and the value of `max` are on lines elided
 * from this listing. */
5864 Alignment * aln2scramble_seq (Alignment *A)
\r
5873 list=declare_int (A->nseq, 2);
\r
5874 name_list=vcalloc (A->nseq, sizeof (char*));
\r
5877 for (a=0; a<A->nseq; a++)
\r
5880 list[a][1]=rand ()%max;
\r
5882 sort_int ( list,2, 1, 0, A->nseq-1);
\r
5884 for ( a=0; a< A->nseq; a++)
\r
5885 name_list[a]=A->seq_al[a];
\r
5886 for (a=0; a<A->nseq; a++)
\r
5888 A->seq_al[a]=name_list[list[a][0]];
\r
5890 vfree (name_list);
\r
5891 free_int (list, -1);
\r
5892 return aln2random_order (A);
\r
/* reorder_aln: rearrange the sequences of A to follow the `nseq` names in `name`.
 * A NULL name list requests a random order. Otherwise A is copied into BUF and, for
 * each requested name found in BUF, its order entry, name, aligned string and
 * comment are written into the next output slot n of A. Slots from n up to the old
 * A->nseq are blanked (empty name and string). A nested alignment A->A, when
 * present, is reordered recursively with the same list.
 * NOTE(review): the handling of a name that is not found, the increment of n, the
 * update of A->nseq, the release of BUF and the return statement are on lines
 * elided from this listing. */
5897 Alignment * reorder_aln ( Alignment *A, char **name, int nseq)
\r
5904 if ( name==NULL)return aln2random_order(A);
\r
5907 BUF=copy_aln ( A,NULL);
\r
5908 for ( a=0; a<nseq; a++)
\r
5910 sn =name_is_in_list ( name[a],BUF->name, A->nseq,STRING);
\r
5919 SWAPP(A->order[n], BUF->order[sn], tpp_int);
\r
5920 sprintf ( A->name[n], "%s", BUF->name[sn]);
\r
5921 sprintf ( A->seq_al[n], "%s",BUF->seq_al[sn]);
\r
5922 sprintf ( A->seq_comment[n], "%s", BUF->seq_comment[sn]);
\r
5929 for ( a=n; a< A->nseq; a++)A->name[a][0]=A->seq_al[a][0]='\0';
\r
5932 if ( A->A)A->A=reorder_aln(A->A, name, nseq);
\r
/* reorder_seq_2: reorder the sequences of A according to an index table.
 * Row a of `order` gives, in column `field`, the index of the sequence that must
 * come a-th. The indices are turned into a name list which is handed to
 * reorder_seq(). A NULL A or order returns A unchanged.
 * NOTE(review): the name list has A->nseq rows of 100 chars but the loop runs to the
 * caller-supplied nseq; nseq > A->nseq would overrun it -- confirm callers.
 * The return statement is elided from this listing (presumably A). */
5936 Sequence * reorder_seq_2 ( Sequence *A, int **order,int field, int nseq)
\r
5941 if (!A || !order) return A;
\r
5942 name=declare_char (A->nseq, 100);
\r
5943 for (a=0; a<nseq; a++)
\r
5944 sprintf ( name[a], "%s", A->name[order[a][field]]);
\r
5945 A=reorder_seq (A, name,nseq);
\r
5946 free_char (name, -1);
\r
/* reorder_seq: rearrange the sequences of A to follow the `nseq` names in `name`.
 * A is duplicated into nA; for each requested name found in nA, the file name,
 * comments, sequence string, length, name and template entry of that sequence are
 * copied into slot a of A. A name that is not found is skipped, which leaves slot a
 * of A with its previous content. nA is released at the end.
 * NOTE(review): the update of A->nseq and the return statement are on lines elided
 * from this listing. */
5949 Sequence * reorder_seq ( Sequence *A, char **name, int nseq)
\r
5955 nA=duplicate_sequence (A);
\r
5958 for ( a=0; a< nseq; a++)
\r
5960 sn=name_is_in_list (name[a] ,nA->name, nA->nseq, 100);
\r
5961 if (sn==-1)continue;
\r
5963 if ( nA->file) sprintf ( A->file[a], "%s", nA->file[sn]);
\r
5965 if ( nA->seq_comment)sprintf ( A->seq_comment[a], "%s", nA->seq_comment[sn]);
\r
5966 if ( nA->aln_comment)sprintf ( A->aln_comment[a], "%s", nA->aln_comment[sn]);
\r
5967 sprintf ( A->seq[a], "%s", nA->seq[sn]);
\r
5968 A->len[a]=nA->len[sn];
\r
5969 sprintf ( A->name[a], "%s", nA->name[sn]);
/* the template entry is copied by value (struct assignment), not by pointer */
5970 A->T[a][0]=nA->T[sn][0];
\r
5973 free_sequence (nA, nA->nseq);
\r
/* concatenate_seq: build one string made of the sequences of S taken in the order
 * given by `order` (order[a] is the index of the a-th sequence to append).
 * The incoming `conc` pointer is replaced by a fresh zeroed buffer of
 * S->nseq*S->max_len chars.
 * NOTE(review): no byte is reserved for the terminating NUL; if max_len is the
 * longest sequence length without its terminator and every sequence reaches it,
 * strcat writes one byte past the buffer -- confirm and allocate +1.
 * NOTE(review): repeated strcat rescans the buffer on every append.
 * What happens to the incoming `conc` and the return statement are on lines elided
 * from this listing. */
5978 char * concatenate_seq ( Sequence *S, char *conc, int *order)
\r
5983 conc=vcalloc ( S->nseq*S->max_len, sizeof (char));
\r
5985 for ( a=0; a< S->nseq; a++)
\r
5987 conc=strcat ( conc, S->seq[order[a]]);
\r
/* rotate_aln: transpose an alignment. Column b of A becomes row b of the new
 * alignment B, so B has A->len_aln sequences of A->nseq characters each.
 * Rows are named "<name>_NN" (1-based, zero-padded for the first nine rows) when
 * `name` is a non-empty string; the plain 1-based number written at original line
 * 6010 is presumably the else branch -- the `else` itself is elided.
 * The return statement is elided from this listing (presumably B). */
5996 Alignment * rotate_aln ( Alignment *A, char *name)
\r
6001 B=declare_aln2 (A->len_aln, A->nseq+1);
\r
6002 for ( a=0; a< A->nseq; a++)
\r
6003 for ( b=0; b< A->len_aln; b++)
\r
6005 B->seq_al[b][a]=A->seq_al[a][b];
\r
6007 for (a=0; a< A->len_aln; a++)
\r
6008 if (name && name[0])sprintf ( B->name[a], "%s_%s%d", name, (a<9)?"0":"",a+1);
\r
6010 sprintf ( B->name[a], "%d", a+1);
\r
/* terminate every transposed row and swap the dimensions */
6013 for (a=0; a< A->len_aln; a++)B->seq_al[a][A->nseq]='\0';
\r
6014 B->len_aln=A->nseq;
\r
6015 B->nseq=A->len_aln;
\r
/* invert_aln: reverse every aligned string of A in place. Each string is mirrored
 * into a temporary buffer of strlen+1 zeroed chars (buf[l-1-b] receives character b)
 * and copied back over the original.
 * NOTE(review): the release of buf and the return statement are on lines elided
 * from this listing. */
6020 Alignment * invert_aln ( Alignment *A)
\r
6025 for ( a=0; a< A->nseq; a++)
\r
6027 l=strlen ( A->seq_al[a]);
\r
6028 buf=vcalloc ( l+1,sizeof (char) );
\r
6030 for ( c=l-1,b=0; b< l; b++, c--)
\r
6032 buf[c]=A->seq_al[a][b];
\r
6035 sprintf ( A->seq_al[a], "%s", buf);
\r
/* complement_string: complement a nucleotide string and hand it to invert_string().
 * Each character is mapped a<->t and g<->c with its case preserved; any other
 * character (gaps, 'u', 'n', ...) is left as it is by the visible chain. The result
 * of invert_string(s) is returned -- presumably the reversed string, which would
 * make this a reverse complement; confirm in invert_string.
 * NOTE(review): the computation of l, the read of r from s[b] and its write-back are
 * on lines elided from this listing. */
6040 char * complement_string (char *s)
\r
6046 for ( b=0; b< l; b++)
\r
6050 if ( r=='a')r='t';
\r
6051 else if (r=='A')r='T';
\r
6052 else if (r=='t')r='a';
\r
6053 else if (r=='T')r='A';
\r
6054 else if (r=='g')r='c';
\r
6055 else if (r=='G')r='C';
\r
6056 else if (r=='c')r='g';
\r
6057 else if (r=='C')r='G';
\r
6061 return invert_string (s);
\r
/* complement_aln: apply complement_string() to every aligned string of A, storing
 * the pointer it returns back into A->seq_al. The return statement is elided from
 * this listing (presumably A). */
6063 Alignment * complement_aln ( Alignment *A)
\r
6068 for ( a=0; a< A->nseq; a++)
\r
6070 A->seq_al[a]=complement_string (A->seq_al[a]);
\r
/* extract_nol_local_aln: cut A down to the columns selected by extract_aln(A, start,
 * max_end) and then pass the result through trunkate_local_aln(). The return
 * statement is elided from this listing (presumably A). */
6076 Alignment * extract_nol_local_aln(Alignment *A, int start, int max_end)
\r
6078 A=extract_aln ( A, start, max_end);
\r
6079 A=trunkate_local_aln (A);
\r
/* alnpos_list2block: extract from A the columns named by a list of positions.
 * The list holds n entries, each either a single 1-based column ("12") or a range
 * ("3-9"). When in_list[0] names an existing file, the entries are read from that
 * file instead (second field of every line, lines whose field starts with '!'
 * skipped) and n is recounted. Selected columns are flagged in `pos` and the block
 * is built by alnpos2block().
 * NOTE(review): the range branch fills pos with `for (a=start; a<end; a++)`, reusing
 * the index `a` of the enclosing `for (a=0; a<n; a++)` loop; after a range the outer
 * loop resumes from `end`, so later list entries are skipped or the loop ends early.
 * A separate index is needed.
 * NOTE(review): the conversion from 1-based to 0-based positions, the single-position
 * store, the assignment of list/list_declared in the non-file case, the release of
 * pos and the return statements are on lines elided from this listing. */
6083 Alignment * alnpos_list2block (Alignment *A, int n, char **in_list)
\r
6088 int list_declared=0;
\r
6091 if (check_file_exists (in_list[0]))
\r
6096 mn=count_n_line_in_file (in_list[0]);
\r
6097 list=declare_char (mn, 100);
\r
6099 tmp_list=file2list (in_list[0], " ");
\r
6102 while (tmp_list[a])
\r
6104 if (tmp_list[a][1][0]!='!')
\r
6106 sprintf (list[n++], "%s", tmp_list[a][1]);
\r
6110 free_arrayN ((void **)tmp_list, 3);
\r
6118 pos=vcalloc (A->len_aln, sizeof (int));
\r
6119 for (a=0; a<n; a++)
\r
6122 if (strstr (list[a], "-"))
\r
6124 int start, end, x;
\r
6125 x=sscanf (list[a], "%d-%d", &start, &end);
/* NOTE(review): A was already dereferenced above, so the !A test here comes too late */
6126 if (x!=2 || !A || start<=0 || start>=end || end>A->len_aln+1)
\r
6128 add_warning ( stderr, "\nWARNING: Illegal coordinates in extract_pos_list [%s]", list[a]);
\r
6132 for (a=start; a<end; a++)pos[a]=1;
\r
6138 if (p<1 || p>A->len_aln)
\r
6140 add_warning ( stderr, "\nWARNING: Illegal coordinates in extract_pos_list [%s]", list[a]);
\r
6146 B=alnpos2block(A, pos, NULL);
\r
6148 if ( list_declared)free_char (list, -1);
\r
/* aln2block: extract the columns start..end-1 of A into B (1-based, end excluded,
 * as stated by the warning text). Coordinates are rejected with a warning when A is
 * NULL, start<=0, start>=end or end>A->len_aln+1. Otherwise a flag array of
 * A->len_aln ints is filled for the range and handed to alnpos2block().
 * NOTE(review): the flag assignment inside the loop, what is returned after the
 * warning, the release of pos and the final return are on lines elided from this
 * listing. */
6152 Alignment * aln2block (Alignment *A, int start, int end, Alignment *B)
\r
6154 if ( !A || start<=0 || start>=end || end>A->len_aln+1)
\r
6156 add_warning ( stderr, "\nWARNING: Illegal coordinates in extract_block start=%d end=%d len=%d [Note : [start-end[, with [1...n]", start, end, A->len_aln);
\r
6164 pos=vcalloc (A->len_aln, sizeof (int));
\r
6165 for (p=start;p<end;p++)
\r
6169 B=alnpos2block (A, pos, B);
\r
/* alnpos2block: copy A into B and keep only the columns flagged in pos.
 * The loop runs over a = 0..A->len_aln inclusive: a flagged column is copied to
 * position B->len_aln of every row and B->len_aln grows by one; the extra iteration
 * a==A->len_aln copies the string terminator without growing the length.
 * NOTE(review): `pos[a]!=0 || a==A->len_aln` evaluates pos[a] first, so pos is read
 * at index A->len_aln, one past the A->len_aln ints the visible callers allocate;
 * testing the bound first would avoid the out-of-range read.
 * NOTE(review): the reset of B->len_aln before the loop and the return statement
 * are on lines elided from this listing. */
6174 Alignment * alnpos2block (Alignment *A, int *pos, Alignment *B)
\r
6177 //extract a subset of B without over-writing A
\r
6180 B=copy_aln (A, B);
\r
6182 for (a=0; a<=A->len_aln; a++)
\r
6184 if ( pos[a]!=0 || a==A->len_aln)
\r
6186 for ( b=0; b<A->nseq; b++)
\r
6187 B->seq_al[b][B->len_aln]=A->seq_al[b][a];
\r
6188 if ( a!=A->len_aln)B->len_aln++;
\r
/* extract_aln: extract the columns start..end of A, with coordinates expressed on
 * the alignment itself: forwards to extract_aln2() with the pseudo sequence name
 * "cons". */
6194 Alignment * extract_aln ( Alignment *A, int start, int end)
\r
6196 return extract_aln2 ( A, start, end, "cons");
\r
/* extract_aln2: extract a range of A whose coordinates refer to sequence `seq`.
 * The request is written as a single "name start end" line into a temporary file,
 * which is then processed by extract_aln3().
 * NOTE(review): the closing of fp before the file is re-read is on a line elided
 * from this listing -- confirm. */
6199 Alignment * extract_aln2 ( Alignment *A, int in_start, int in_end, char *seq)
\r
6205 tmp=vtmpnam (NULL);
\r
6206 fp=vfopen (tmp, "w");
\r
6207 fprintf ( fp, "%s %d %d\n", seq, in_start, in_end);
\r
6209 return extract_aln3 (A,tmp);
\r
/* extract_aln3: keep only the columns of an alignment selected by a coordinate file.
 * Each useful line of `file` reads "seqname start [end]"; lines starting with '#'
 * or '!' are skipped in the second pass. The name "cons" makes the coordinates refer
 * to alignment columns; any other name makes them count the non-gap residues of
 * that sequence. A missing or zero end means "up to the end of the alignment".
 * The file is read twice: a first pass looks every name up in B (per-sequence
 * `offset` table), a second pass marks the selected columns of a copy A of B in
 * `col`; the marked columns are then packed to the left and the strings terminated.
 * Malformed lines, unknown sequences and start>end are reported with add_warning.
 * NOTE(review): many short lines (how offset is filled, the marking of col, the
 * handling after each warning, the closing of fp, the new A->len_aln and the return
 * statement) are elided from this listing, so the summary above is limited to what
 * the visible lines establish. */
6211 Alignment * extract_aln3 ( Alignment *B, char *file)
\r
6215 int n, i, s, nline=0;
\r
6217 Alignment *A=NULL;
\r
6219 char name[MAXNAMES];
\r
6220 char line[VERY_LONG_STRING];
\r
6228 seqname start end[
\r
6229 modifies the incoming alignment
\r
6232 offset=vcalloc ( B->nseq+1, sizeof (int));
\r
6233 fp=vfopen (file,"r");
\r
6234 while ( (c=fgetc(fp))!=EOF)
\r
6237 fgets ( line, VERY_LONG_STRING,fp);
\r
6240 sscanf (line, "%s %d", name, &start);
\r
6241 s=name_is_in_list (name,B->name,B->nseq,MAXNAMES);
\r
6249 A=copy_aln (B, A);
\r
6250 col=vcalloc ( A->len_aln, sizeof (int));
\r
6252 fp=vfopen ( file, "r");
\r
6253 while ( (c=fgetc(fp))!=EOF)
\r
/* comment lines are consumed and ignored */
6256 if ( c=='#' || c=='!')fgets ( line, VERY_LONG_STRING,fp);
\r
6260 fgets ( line, VERY_LONG_STRING,fp);
\r
6262 if (sscanf (line, "%s %d %d", name, &start, &end)==3);
\r
6263 else if (sscanf (line, "%s %d", name, &start)==2)
\r
6269 add_warning ( stderr, "\nWARNING: wrong format in coordinate file (line=%d)\n", nline);
\r
6272 if ( end==0)end=A->len_aln+1;
\r
6274 s=name_is_in_list (name,A->name,A->nseq,MAXNAMES);
\r
6277 if ( s==-1 && !strm (name, "cons"))
\r
6279 add_warning ( stderr, "\nWARNING: Seq %s does not belong to the alignment (line %d)\n", name,nline);
\r
6282 else if ( start>end)
\r
6284 add_warning ( stderr, "\nWARNING: Illegal coordinates [%s %d %d] (line %d)\n", name,start, end,nline);
\r
6292 start-=offset[s]-1;
\r
6295 for (n=0, a=0; done!=1 && a< A->len_aln; a++)
\r
/* i is 1 for every column in "cons" mode, otherwise 1 only on a residue of sequence s */
6297 i=(strm (name, "cons"))?1:!is_gap(A->seq_al[s][a]);
\r
6300 if (n>=start && n<end)
\r
6304 if (n>=end)done=1;
\r
6305 //if (n>=start && n<end && !(i==0 && n==end-1))
\r
6309 //else if ( n>=end)a=A->len_aln;
\r
6313 HERE ("Warning Missing positions in File %s",file );
\r
6322 /*Extract [start-end[*/
\r
6323 for ( b=0,a=0; a< A->len_aln; a++)
\r
6327 for (c=0; c< A->nseq; c++)A->seq_al[c][b]=A->seq_al[c][a];
\r
6333 for (c=0; c< A->nseq; c++)A->seq_al[c][A->len_aln]='\0';
\r
// trunkate_local_aln: cut each row of A at the first column where it re-uses a
// (sequence, residue position) pair already seen earlier in the column-major scan,
// then shorten the alignment to its shortest row and pad with '-'.
6339 Alignment * trunkate_local_aln ( Alignment *A)
\r
// cache[seq][position] counts how often a residue position of a sequence was met.
// Dimensions come from the maxima of columns 0 and 1 of A->order.
6347 cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),1)+A->len_aln+1);
\r
6348 pos=aln2pos_simple(A,A->nseq);
\r
6350 for ( b=0; b<A->len_aln; b++)
\r
6351 for ( a=0; a< A->nseq; a++)
\r
// Row a belongs to sequence A->order[a][0].
6353 seq=A->order[a][0];
\r
// Non-positive entries of pos are skipped (empty statement).
6354 if ( pos[a][b]<=0);
\r
6355 else if ( pos[a][b]>0)
\r
// First sighting: just record it.
6358 if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++;
\r
6359 else if ( cache[seq][pos[a][b]]>=1)
\r
// Already seen: count it again and truncate row a at this column.
6361 cache[seq][pos[a][b]]++;
\r
6362 A->seq_al[a][b]='\0';
\r
6367 A->len_aln=get_shortest_string ( A->seq_al, A->nseq, NULL, NULL);
\r
6368 pad_string_array ( A->seq_al, A->nseq, A->len_aln, '-');
\r
6370 free_int (pos, -1);
\r
6371 free_int ( cache,-1);
\r
6377 int get_nol_aln_border ( Alignment *A, int start, int direction)
\r
6384 /*This Function Returns the limit position for a non overlaping alignment*/
\r
6386 cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int)),1)+A->len_aln+1);
\r
6387 pos=aln2pos_simple(A,A->nseq);
\r
6388 end=(direction==GO_RIGHT)?A->len_aln:-1;
\r
6391 for ( b=start; b!=end;b+=direction)
\r
6392 for ( a=0; a< A->nseq; a++)
\r
6394 seq=A->order[a][0];
\r
6395 if ( pos[a][b]<=0);
\r
6396 else if ( pos[a][b]>0)
\r
6399 if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++;
\r
6400 else if ( cache[seq][pos[a][b]]>=1)
\r
6402 cache[seq][pos[a][b]]++;
\r
6403 free_int(cache, -1);
\r
6404 return b-direction;
\r
6409 free_int ( cache,-1);
\r
6410 free_int (pos, -1);
\r
6411 return end-direction;
\r
// extract_defined_seq: select the part of in whose residue indices fall in a range
// delimited through aa_def, starting next to in_start and walking in direction dir.
// out_start[0] receives the index of the first selected residue when it was -1 on entry.
6418 char * extract_defined_seq ( char *in, int in_of, int in_start, int *aa_def, int dir, int *out_start, char *out)
\r
6420 int start=0, end,l;
\r
// The range starts one position before or after in_start, depending on dir.
6425 if ( dir==GO_LEFT){start=in_start-1;}
\r
6426 else if ( dir==GO_RIGHT){start=in_start+1;}
\r
// Walk until aa_def marks a position as UNDEFINED.
6429 while (aa_def[end]!=UNDEFINED)
\r
// Normalise so that start <= end.
6435 if (end<start)SWAP(end,start);
\r
// c is the running residue index, seeded with in_of and advanced on every non-gap.
6439 for (b=0,d=0,c=in_of;b<l; b++)
\r
6441 c+=1-is_gap(in[b]);
\r
6442 if ( c>=start && c<=end)
\r
6444 if ( out_start[0]==-1)out_start[0]=c-!is_gap(in[b]);
\r
// concatenate_aln: build a new alignment holding every sequence name of A1 and A2.
//   - names present in both : A1 row + optional spacer + A2 row
//   - names only in A1      : A1 row + A2->len_aln gaps
//   - names only in A2      : A1->len_aln gaps + A2 row
// spacer may be NULL, which is treated as an empty string.
// NOTE(review): len_aln is set to A1->len_aln+A2->len_aln and the gap-padded rows get
// no spacer, so rows differ in length when a non-empty spacer is used -- confirm intent.
6453 Alignment * concatenate_aln ( Alignment *A1, Alignment *A2, char *spacer)
\r
// Rows must also hold the spacer written by the "%s%s%s" sprintf below; sizing the
// buffers without it overflowed them whenever a non-empty spacer was given.
6458 A=declare_aln2( A1->nseq+A2->nseq , A1->len_aln+A2->len_aln+((spacer)?strlen (spacer):0)+1);
\r
6459 for ( a=0; a< A1->nseq; a++)
\r
// Sequence shared by A1 and A2: i is its row in A2.
6461 if ((i=name_is_in_list ( A1->name[a], A2->name, A2->nseq, 100))!=-1)
\r
6463 sprintf ( A->name[A->nseq], "%s", A1->name[a]);
\r
6464 sprintf (A->seq_al[A->nseq], "%s%s%s", A1->seq_al[a],(spacer)?spacer:"", A2->seq_al[i]);
\r
// Sequence found in A1 only: pad the A2 part with gaps.
6470 buf=generate_string (A2->len_aln, '-');
\r
6471 sprintf ( A->name[A->nseq], "%s", A1->name[a]);
\r
6472 sprintf (A->seq_al[A->nseq], "%s%s", A1->seq_al[a], buf);
\r
6477 for ( a=0; a< A2->nseq; a++)
\r
// Sequence found in A2 only: pad the A1 part with gaps.
6479 if ((i=name_is_in_list ( A2->name[a], A1->name, A1->nseq, 100))==-1)
\r
6482 buf=generate_string (A1->len_aln, '-');
\r
6483 sprintf ( A->name[A->nseq], "%s", A2->name[a]);
\r
6484 sprintf (A->seq_al[A->nseq], "%s%s", buf, A2->seq_al[a]);
\r
6489 A->len_aln=A1->len_aln+A2->len_aln;
\r
// aln_cat: append the rows of B to the matching rows of A, in place.
// Both alignments must have the same number of sequences; otherwise the program exits.
6492 Alignment * aln_cat ( Alignment *A, Alignment *B)
\r
6496 if ( A->nseq!=B->nseq)
\r
6498 fprintf ( stderr, "\nERROR IN ALN CAT: DIFFERENT NSEQ\n");
\r
6499 myexit(EXIT_FAILURE);
\r
// Grow A so that every row can hold both halves plus the terminator.
6502 A=realloc_alignment2(A, A->nseq,A->len_aln+B->len_aln+1);
\r
// Rows are paired by index, not by name.
6504 for ( a=0;a< A->nseq; a++)
\r
6506 strcat ( A->seq_al[a], B->seq_al[a]);
\r
6508 A->len_aln+=B->len_aln;
\r
// verify_aln: check that the residues of every row of A match the corresponding
// sequence in S (case-insensitive). On the first mismatch the message, the offending
// residues and the alignment are printed to stderr and the program exits.
6511 int verify_aln ( Alignment *A, Sequence *S, char *message)
\r
6516 for ( a=0;a< A->nseq; a++)
\r
// b walks the columns, c counts residues of the row.
6520 for ( b=0, c=0; b< A->len_aln; b++)
\r
6522 if ( !is_gap(A->seq_al[a][b]))
\r
// The residue is compared with position c+r of sequence s.
6524 if (tolower(A->seq_al[a][b])!=tolower(S->seq[s][c+r]))
\r
6526 fprintf ( stderr, "\n%s\nResidue [%c %d, %c %d] line %d seq %d",message,A->seq_al[a][b], b,S->seq[s][c+r], c+r,a,s);
\r
6527 output_Alignment_with_res_number(A, stderr);
\r
6528 myexit(EXIT_FAILURE);
\r
// adjust_est_aln: re-thread the multiple alignment M onto the pairwise alignment PW.
// Rows 0..s-1 of M follow row 0 of PW, row s of M follows row 1 of PW; the result is
// staged in a static work array and copied back into M, whose length becomes PW->len_aln.
// NOTE(review): the work array is declared with fixed dimensions (500 x 100000);
// larger inputs are not guarded against in the statements shown here -- confirm.
6538 Alignment *adjust_est_aln ( Alignment *PW, Alignment *M, int s)
\r
6540 /*This function reajusts M, threading M onto PW
\r
6541 two seqences in PW
\r
6544 seq 0 PW ----> 0->s-1 in M
\r
6545 seq 1 PW ----> 1->s in M;
\r
6549 static char **array;
\r
6558 array=declare_char (500, 100000);
\r
6561 for ( a=0; a< PW->len_aln; a++)
\r
6563 if ( is_gap(PW->seq_al[0][a]))
\r
6565 for ( b=0; b< s; b++)
\r
6570 for ( b=0; b< s; b++)
\r
6571 array[b][a]=M->seq_al[b][top_M];
\r
6575 if ( is_gap(PW->seq_al[1][a]))
\r
6582 array[s][a]=M->seq_al[s][bottom_M];
\r
6587 M->len_aln=PW->len_aln;
\r
6588 for (a=0; a<s; a++)
\r
6590 for (b=0; b<PW->len_aln; b++)
\r
6591 M->seq_al[a][b]=array[a][b];
\r
6592 M->seq_al[a][b]='\0';
\r
// rename_seq_in_aln: rename sequences of A according to list.
// list[n][0] is an old name and list[n][1] its replacement; the list ends at the
// first entry whose old name is the empty string. A->S is renamed the same way.
6602 Alignment * rename_seq_in_aln (Alignment *A, char ***list)
\r
6610 while ( list[n][0][0])
\r
// Names absent from the alignment are ignored.
6612 if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1)
\r
6614 sprintf ( A->name[i], "%s", list[n][1]);
\r
6619 A->S=rename_seq_in_seq (A->S, list);
\r
// rename_seq_in_seq: rename sequences of A according to list
// (list[n][0] old name, list[n][1] new name, terminated by an empty old name).
// A NULL sequence set or a NULL list is returned untouched.
6622 Sequence * rename_seq_in_seq (Sequence *A, char ***list)
\r
6625 if ( !A || !list)return A;
\r
6628 while ( list[n][0][0])
\r
// Names absent from the set are ignored.
6630 if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1)
\r
6632 sprintf ( A->name[i], "%s", list[n][1]);
\r
6638 /********************************************************************/
\r
6640 /* FLOAT SIMILARITIES */
\r
6644 /********************************************************************/
\r
// get_seq_fsim: floating-point similarity between two aligned strings.
//   ignore         : characters that exclude a column from the comparison.
//   similarity_set : group definition used when no matrix is given.
//   matrix         : optional substitution matrix, indexed with residue-'A'.
//   MODE           : UNGAPED_POSITIONS, ALIGNED_POSITIONS or AVERAGE_POSITIONS,
//                    selecting the denominator of the final ratio.
// Only the first MIN(strlen) columns are compared; 0 is returned when that length is 0.
6645 float get_seq_fsim ( char *string1, char *string2, char *ignore, char *similarity_set,int **matrix, int MODE )
\r
6647 int len, a, r1, r2, nr1=0, nr2=0;
\r
6648 float pos=0, sim=0;
\r
6651 len=MIN((strlen (string1)),(strlen (string2)));
\r
6652 if ( len==0)return 0;
\r
6654 for ( a=0; a< len; a++)
\r
// Columns where either residue is in the ignore set do not contribute.
6662 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
\r
// assumes r1 and r2 are not below 'A' when a matrix is used -- TODO confirm
6665 if ( matrix)sim+=matrix[r1-'A'][r2-'A'];
\r
6666 else if (is_in_same_group_aa(r1,r2,0, NULL,similarity_set))
\r
// Same numerator, three different denominators.
6672 if ( MODE==UNGAPED_POSITIONS)return ( sim*100)/pos;
\r
6673 else if ( MODE==ALIGNED_POSITIONS)return (sim*100)/len;
\r
6674 else if ( MODE==AVERAGE_POSITIONS)return (sim*200)/(nr1+nr2);
\r
// get_seq_fsim2: floating-point similarity between two aligned strings.
//   ignore  : characters treated as "no residue".
//   in_mode : "<mat>_<sim_mode>"; a mode containing "idscore" delegates to
//             idscore_pairseq with the blosum62mt matrix.
// The sim_mode suffix selects the ratio: sim/sim1, sim2, sim3, gap1 or logid.
// For logid the arguments follow the logid_score (sim, len) signature: matches
// first, compared positions second.
6681 float get_seq_fsim2 ( char *string1, char *string2, char *ignore, char *in_mode)
\r
6689 float r=0, pos1, pos2, pos0, gap, sim;
\r
6692 sprintf ( mode, "%s", in_mode);
\r
6694 /*mode: <mat>__<sim_mode>
\r
6695 mat: idscore to get the alignment done
\r
6696 any legal cw matrix
\r
6697 sim_mode: sim1->identities/matches
\r
6698 sim2->identities/min len
\r
6702 if ( (p=strstr (mode, "_"))!=NULL)
\r
6709 if (strstr (mode, "idscore"))
\r
6712 if (!mat) mat=read_matrice ("blosum62mt");
\r
6713 return idscore_pairseq (string1, string2, -12, -1, mat,mode);
\r
6717 len1=strlen (string1);
\r
6718 for ( sim=pos1=pos2=pos0=gap=0,a=0; a< len1; a++)
\r
6722 p1=1-is_in_set (r1, ignore);
\r
6723 p2=1-is_in_set (r2, ignore);
\r
6724 pos1+=p1; pos2+=p2;
\r
6728 if (is_in_same_group_aa(r1,r2,0, NULL, mode))
\r
6733 else if (p1+p2==1)
\r
6739 if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
\r
6741 r=(pos0==0)?0:(sim*MAXID)/pos0;
\r
6743 else if ( strm (p, "sim2"))
\r
6745 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
\r
6747 else if ( strm (p, "sim3"))
\r
6749 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
\r
6751 else if ( strm (p, "gap1"))
\r
6753 r=(len1==0)?MAXID:(gap*MAXID)/len1;
\r
6756 else if ( strm (p, "logid"))
\r
6758 r=logid_score (sim, pos0);
\r
6765 /********************************************************************/
\r
6767 /* ALIGNMENT ANALYSES */
\r
6771 /********************************************************************/
\r
// dist_array2sim_array: turn a distance matrix into a similarity matrix in place,
// replacing every cell with max-cell. The dimensions are read from the allocation
// records of p (rows) and p[0] (columns).
6772 int **dist_array2sim_array ( int **p, int max)
\r
// Rows are pointers, cells are ints: each size query uses the matching element size.
6775 s1=read_array_size ((void *)p, sizeof (void *));
\r
6776 s2=read_array_size ((void*)p[0],sizeof (int));
\r
6777 /* s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 BITS*/
\r
6778 for ( a=0; a< s1; a++)
\r
6779 for ( b=0; b< s2; b++)
\r
6781 p[a][b]=max-p[a][b];
\r
// sim_array2dist_array: turn a similarity matrix into a distance matrix in place,
// replacing every cell with max-cell. The dimensions are read from the allocation
// records of p (rows) and p[0] (columns).
6786 int **sim_array2dist_array ( int **p, int max)
\r
6789 s1=read_array_size ((void *)p, sizeof (void *));
\r
6790 s2=read_array_size ((void*)p[0],sizeof (int));
\r
6792 /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/
\r
6793 for ( a=0; a< s1; a++)
\r
6794 for ( b=0; b< s2; b++)
\r
6796 p[a][b]=max-(int)p[a][b];
\r
// normalize_array: rescale every cell of p in place to (cell*norm)/max, using
// integer arithmetic. The dimensions are read from the allocation records of p and p[0].
// NOTE(review): max==0 is not guarded in the statements shown here -- confirm callers.
6801 int **normalize_array (int **p, int max, int norm)
\r
6804 s1=read_array_size ((void *)p, sizeof (void *));
\r
6805 s2=read_array_size ((void*)p[0],sizeof (int));
\r
6807 /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/
\r
6808 for ( a=0; a< s1; a++)
\r
6809 for ( b=0; b< s2; b++)
\r
6811 p[a][b]=(p[a][b]*norm)/max;
\r
// aln2most_similar_sequence: pick the row of A with the highest coverage-weighted
// sum of similarities to the other rows.
// Returns -1 for a NULL alignment and 0 for a single-sequence alignment.
6816 int aln2most_similar_sequence ( Alignment *A, char *mode)
\r
6820 int avg, best_avg=0, best_seq=0;
\r
6825 if ( !A) return -1;
\r
6826 else if ( A->nseq==1)return 0;
\r
// buf holds a private copy of one row; w is the pairwise similarity matrix.
6829 buf=vcalloc ( A->len_aln+1, sizeof (char));
\r
6830 w=get_sim_aln_array ( A, mode);
\r
6832 for ( a=0; a< A->nseq; a++)
\r
6834 sprintf ( buf, "%s", A->seq_al[a]);
\r
// Coverage: length of buf relative to the alignment length, scaled by MAXID.
6836 coverage=(strlen(buf)*MAXID)/A->len_aln;
\r
6838 for ( avg=0,b=0; b< A->nseq; b++)avg+=w[a][b]*coverage;
\r
// Strictly greater: ties keep the earliest row.
6839 if ( avg>best_avg){best_avg=avg; best_seq=a;}
\r
// aln2coverage: count the columns where ref_seq has a residue and at least one other
// row has a residue too, and return that count scaled by MAXID over the alignment length.
6850 int aln2coverage ( Alignment *A, int ref_seq)
\r
6853 int cov_pos=0, npos=0;
\r
6855 for ( a=0; a< A->len_aln; a++)
\r
6857 if ( !is_gap ( A->seq_al[ref_seq][a]))
\r
6860 for ( b=0; b< A->nseq; b++)
\r
// One covering row is enough: stop at the first one.
6862 if ( b!=ref_seq && !is_gap ( A->seq_al[b][a])){cov_pos++;break;}
\r
// The (int) cast binds to (npos==0) only; the ratio is integer arithmetic anyway.
// NOTE(review): the guard tests npos but the divisor is A->len_aln -- confirm intent.
6866 return (int) (npos==0)?0:(( MAXID*cov_pos)/A->len_aln);
\r
// sub_aln2sim: average pairwise similarity.
//   ns==NULL : over all pairs of rows of A (needs at least two rows).
//   otherwise: over every pair made of one row from ls[0] (ns[0] entries) and one
//              row from ls[1] (ns[1] entries).
// Returns -1 for a NULL alignment or too few rows, 0 when no pair was compared.
6870 int sub_aln2sim ( Alignment *A, int *ns, int **ls, char *mode)
\r
6876 if (!A || (ns==NULL && A->nseq<2))return -1;
\r
6877 else if (ns==NULL)
\r
6879 for (a=0; a< A->nseq-1; a++)
\r
6880 for ( b=a+1; b< A->nseq;b++, n++)
\r
6881 avg+=generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode);
\r
6885 for (a=0; a<ns[0]; a++)
\r
6886 for (b=0; b< ns[1]; b++, n++)
\r
6888 avg+=generic_get_seq_sim (A->seq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode);
\r
// The (int) cast binds to (n==0) only; the float ratio is converted by the int return type.
6891 return (int)(n==0)?0:((float)avg/(float)n);
\r
// sub_aln2max_sim: same pair enumeration as sub_aln2sim, but keeps the maximum
// pairwise similarity instead of accumulating a sum.
// NOTE(review): MAX is a macro here; if it evaluates its arguments twice,
// generic_get_seq_sim runs twice per pair -- confirm the macro definition.
6893 int sub_aln2max_sim ( Alignment *A, int *ns, int **ls, char *mode)
\r
6899 if (!A || (ns==NULL && A->nseq<2))return -1;
\r
6900 else if (ns==NULL)
\r
6902 for (a=0; a< A->nseq-1; a++)
\r
6903 for ( b=a+1; b< A->nseq;b++, n++)
\r
6904 avg=MAX(avg,generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode));
\r
6908 for (a=0; a<ns[0]; a++)
\r
6909 for (b=0; b< ns[1]; b++, n++)
\r
6911 avg=MAX(avg,generic_get_seq_sim (A->seq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode));
\r
// aln2entropy: column entropy of A over a subset of rows.
//   in_ls, in_ns  : optional list of row indices and its length; the alternative
//                   branch uses rows 0..ns-1.
//   gap_threshold : columns with more gaps than this are skipped.
// Residues are counted case-insensitively in 26 letter bins; any other non-gap
// symbol is left out of the counts.
6918 double aln2entropy (Alignment *A, int *in_ls, int in_ns, float gap_threshold)
\r
6920 int ns, a, s, col, r,ncol;
\r
6926 ls=vcalloc ( A->nseq, sizeof (int));
\r
// One bin per letter 'a'..'z'.
6927 count=vcalloc ( 26, sizeof (double));
\r
6933 for ( a=0; a< ns; a++)ls[a]=in_ls[a];
\r
6938 for ( a=0; a< ns; a++)ls[a]=a;
\r
6943 vfree(ls);vfree(count);return 0;
\r
6945 for (ncol=0,col=0; col<A->len_aln; col++)
\r
// Count the gaps of the column over the selected rows.
6947 for (ng=0,a=0; a< ns; a++)
\r
6950 ng+=is_gap(A->seq_al[s][col]);
\r
6953 if ( ng>gap_threshold)continue;
\r
6957 for ( a=0; a<ns; a++)
\r
6960 r=tolower(A->seq_al[s][col]);
\r
// Only letters index the 26 bins: digits or other symbols stored in an alignment
// row would otherwise write outside count[].
6961 if (!is_gap(r) && r>='a' && r<='z')count[r-'a']++;
\r
6963 for (a=0; a<26; a++)
\r
// Empty bins contribute nothing (empty statement).
6965 if ( count[a]==0);
\r
6968 count[a]/=(double)ns;
\r
6970 entropy+=count[a]*log(count[a]);
\r
6976 vfree (ls); vfree(count);
\r
// aln2sim: average pairwise similarity over all rows of A, delegated to sub_aln2sim
// with no sub-group selection.
// NOTE(review): the statements following the unconditional return are not reached
// as shown; they look like an older implementation based on get_sim_aln_array -- confirm.
6980 int aln2sim ( Alignment *A, char *mode)
\r
6982 return sub_aln2sim ( A, NULL, NULL, mode);
\r
6984 if ( !A || A->nseq<2) return -1;
\r
6985 w=get_sim_aln_array ( A, mode);
\r
6987 for (c=0, a=0; a< A->nseq-1; a++)
\r
6988 for ( b=a+1; b< A->nseq; b++, c++)
\r
6990 avg+=(float)w[a][b];
\r
6993 return (int)((float)avg/(float)c);
\r
// aln_is_aligned: return 1 as soon as some row has a residue immediately preceded
// by a gap, i.e. a gap that is not at the end of the row.
6997 int aln_is_aligned ( Alignment *A)
\r
7002 for (a=0; a< A->nseq; a++)
\r
// Right-to-left scan of the row, stopping at column 1 so that b-1 stays valid.
7003 for ( b=A->len_aln-1; b>0; b--)
\r
7005 if (!is_gap(A->seq_al[a][b]) && is_gap(A->seq_al[a][b-1]))return 1;
\r
// seq2aln2sim_old: align two sequences with pam250mt (gap open -10, gap extend -1)
// using mode_aln, then measure the similarity of the result with mode_id.
7011 int seq2aln2sim_old ( char *seq1, char *seq2, char *mode_aln, char *mode_id)
\r
7016 A=align_two_sequences (seq1, seq2, "pam250mt", -10, -1, mode_aln);
\r
7017 sim=aln2sim (A, mode_id);
\r
// seq2aln2sim: align two sequences with blosum62mt and measure their similarity.
// The gap-opening value is derived from the matrix: ten times the value returned by
// get_avg_matrix_mm over AA_ALPHABET; the gap extension is -1.
7021 int seq2aln2sim ( char *seq1, char *seq2, char *mode_aln, char *mode_id)
\r
7030 m=read_matrice ("blosum62mt");
\r
7031 gop=get_avg_matrix_mm(m, AA_ALPHABET)*10;
\r
7035 A=align_two_sequences (seq1, seq2, "blosum62mt",gop,-1, mode_aln);
\r
7036 sim=aln2sim (A, mode_id);
\r
// get_cdna_seq_winsim: windowed similarity for cDNA strings.
// As the warning below states, windowing is not implemented for cDNA: the global
// value from get_cdna_seq_sim is computed once and the positions 0..len1-1 are then walked.
// Strings of different length are a fatal error.
7040 int* get_cdna_seq_winsim ( int *cache, char *string1, char *string2, char *ignore, char *mode,int *w )
\r
7046 len1=strlen (string1);
\r
7047 len2=strlen (string2);
\r
7051 fatal_exit( stderr,EXIT_FAILURE, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM);
\r
// The mode passed down is the empty string, not the caller's mode.
7054 x=get_cdna_seq_sim(cache, string1, string2, ignore, "");
\r
7055 for ( a=0; a< len1; a++)
\r
7058 add_warning (stderr, "\nWARNING: winsim not implemented for cDNA");
\r
// get_cdna_seq_sim: similarity between two aligned cDNA strings, compared codon by
// codon after translation.
//   cache  : per-position flags; 0 skips the position, 1 marks a codon to translate.
//   ignore : translated residues that exclude a codon from the comparison.
//   mode   : similarity group definition, read from its fifth character onwards.
7062 int get_cdna_seq_sim ( int *cache, char *string1, char *string2, char *ignore, char *mode)
\r
7071 len1=strlen (string1);
\r
7072 len2=strlen (string2);
\r
7078 fprintf ( stderr, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM);
\r
// No increment in the loop header: a is advanced inside the body.
7082 for ( a=0; a< len1;)
\r
7085 if ( cache[a]==0){a++;continue;}
\r
7086 else if ( cache[a]==1)
\r
// 'x' is handed to translate_dna_codon as its second argument.
7089 r1=translate_dna_codon (string1+a, 'x');
\r
7090 r2=translate_dna_codon (string2+a, 'x');
\r
7094 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
\r
// mode+4 skips the first four characters of mode.
7097 if (is_in_same_group_aa(r1,r2,0, NULL,mode+4))
\r
// The cast applies to the product; the division is integer arithmetic.
7109 return (int) (sim*MAXID)/pos;
\r
// get_seq_winsim: per-position similarity of two aligned strings over a sliding window.
//   mode : parsed with atoi as the window half-width.
//   w    : output array, one value per position.
// Strings of different length yield a null pointer. With a zero window, or a window
// not shorter than the strings, every position receives the global get_seq_sim value.
7113 int* get_seq_winsim ( char *string1, char *string2, char *ignore, char *mode, int*w)
\r
7115 int len1, len2, len;
\r
7122 len1=strlen (string1);
\r
7123 len2=strlen (string2);
\r
7124 window=atoi(mode);
\r
7127 if ( len1!=len2)return 0;
\r
7128 if (window==0 || (window*2+1)>=len1)
\r
7130 sim=get_seq_sim (string1, string2, ignore, "");
\r
7131 for (a=0; a<len1; a++)w[a]=sim;
\r
7136 for ( a=0; a< len1; a++)
\r
// Window boundaries are clamped to the string.
7139 left =MAX(0, a-window);
\r
7140 right=MIN(len1, left+len);
\r
7141 for (sim=0,b=left; b<right; b++)
\r
7145 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
\r
// The score is normalised by len, not by the clamped width right-left.
7150 w[a]=(sim*MAXID)/len;
\r
// get_seq_sim: integer similarity between two aligned strings.
//   ignore  : characters treated as "no residue".
//   in_mode : "<mat>_<sim_mode>"; leading underscores are skipped, and a mode
//             containing "idscore" delegates to idscore_pairseq with blosum62mt.
// The measure is chosen from the mode: cov, sim/sim1, sim2, sim3, gap1, logid, or any
// mode containing "sim".
// The mode is copied into a 100-byte static buffer; the copy is bounded so that a longer
// mode string is truncated and cannot overrun the buffer.
// For logid the arguments follow the logid_score (sim, len) signature: matches first,
// compared positions second.
7156 int get_seq_sim ( char *string1, char *string2, char *ignore, char *in_mode)
\r
7160 int pos1, pos2, pos0,gap=0, sim;
\r
7162 int r=0,r1=0,r2=0;
\r
7164 static char *mode;
\r
7166 if (!mode)mode=vcalloc (100, sizeof (char));
\r
7167 else mode[0]='\0';
\r
7170 while (in_mode[0]=='_')in_mode++;
\r
7171 snprintf ( mode, 100, "%s", in_mode);
\r
7174 /*mode: <mat>__<sim_mode>
\r
7175 mat: idscore to get the alignment done
\r
7176 any legal cw matrix
\r
7177 sim_mode: sim1->identities/matches
\r
7178 sim2->identities/min len
\r
7182 if ( (p=strstr (mode, "_"))!=NULL)
\r
7189 if (strstr (mode, "idscore"))
\r
7192 if (!mat) mat=read_matrice ("blosum62mt");
\r
7193 return idscore_pairseq (string1, string2, -12, -1, mat,mode);
\r
7196 len1=strlen (string1);
\r
7197 for ( sim=pos1=pos2=pos0=0,a=0; a< len1; a++)
\r
7201 p1=1-is_in_set (r1, ignore);
\r
7202 p2=1-is_in_set (r2, ignore);
\r
7204 pos1+=p1; pos2+=p2;
\r
7208 if (is_in_same_group_aa(r1,r2,0, NULL, mode))
\r
7213 else if (p1+p2==1)
\r
7219 if ( strstr (mode, "cov"))
\r
7221 r=(pos0+gap==0)?0:(pos0*MAXID)/(pos0+gap);
\r
7223 else if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
\r
7225 r=(pos0==0)?0:(sim*MAXID)/pos0;
\r
7227 else if ( strm (p, "sim2"))
\r
7229 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
\r
7231 else if ( strm (p, "sim3"))
\r
7233 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
\r
7235 else if ( strm (p, "gap1"))
\r
7237 r=(len1==0)?MAXID:(gap*MAXID)/len1;
\r
7240 else if ( strm (p, "logid"))
\r
7242 r=logid_score (sim, pos0);
\r
7244 else if ( strstr (mode, "sim"))
\r
7246 r=(pos0==0)?0:(sim*MAXID)/pos0;
\r
// get_seq_sim_2: similarity between two aligned strings using an explicit list of
// residue groups (gr, ng entries). Strings of different length score 0.
7253 int get_seq_sim_2 ( char *string1, char *string2, char *ignore, char **gr, int ng)
\r
7263 len1=strlen (string1);
\r
7264 len2=strlen (string2);
\r
7266 if ( len1!=len2)return 0;
\r
7268 for ( a=0; a< len1; a++)
\r
// Columns where either residue is in the ignore set do not contribute.
7272 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
\r
7275 if (is_in_same_group_aa(r1,r2,ng, gr, NULL))
\r
// The cast applies to the product; the division is integer arithmetic.
7285 return (int) (sim*MAXID)/pos;
\r
// get_seq_sim_3: sum of substitution-matrix scores over the columns of two aligned
// strings. Strings of different length score 0.
7289 int get_seq_sim_3 ( char *string1, char *string2, char *ignore, int **mat)
\r
7299 len1=strlen (string1);
\r
7300 len2=strlen (string2);
\r
7302 if ( len1!=len2)return 0;
\r
7304 for ( a=0; a< len1; a++)
\r
7308 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
\r
// assumes r1 and r2 are not below 'A' -- TODO confirm
7310 sim+=mat[r1-'A'][r2-'A'];
\r
// get_aln_col_weight: one weight per column of A, computed from the residue group
// distribution of the column (find_group_aa_distribution) and scaled by MAXID over
// the number of sequences.
7316 int * get_aln_col_weight ( Alignment *A, char *mode)
\r
// col is a scratch copy of one column; weight is the result, one int per column.
7322 col=vcalloc ( A->nseq, sizeof (int));
\r
7323 weight=vcalloc (A->len_aln, sizeof (int));
\r
7325 for (a=0; a< A->len_aln; a++)
\r
7327 for ( b=0; b< A->nseq; b++)
\r
7328 col[b]=A->seq_al[b][a];
\r
7329 weight[a]=(find_group_aa_distribution (col, A->nseq,0,NULL,NULL, mode )*MAXID)/A->nseq;
\r
// analyse_aln_column: ClustalW-style conservation symbol for column col of B.
// The result character r is set to
//   '*' when one "idmat" group contains every residue of the column,
//   ':' when one "clustalw_col" group does,
//   '.' when one "clustalw_dot" group does,
//   ' ' as soon as a gap is met.
// The ':' and '.' levels are not evaluated for DNA or RNA alignments.
7336 int analyse_aln_column ( Alignment *B, int col)
\r
// The three group tables are static; the per-call counters below are not.
7342 static int ng_cw_star;
\r
7343 static char **cw_star;
\r
7344 int *cw_star_count;
\r
7346 static int ng_cw_col;
\r
7347 static char **cw_col;
\r
7348 int *cw_col_count;
\r
7350 static int ng_cw_dot;
\r
7351 static char **cw_dot;
\r
7352 int *cw_dot_count;
\r
// The alignment type is needed below to tell nucleic acids from proteins.
7359 if ( !B->S || !(B->S)->type)B= get_aln_type (B);
\r
// mat is a scratch buffer that receives the group-set name for make_group_aa.
7361 if ( !mat)mat=vcalloc ( STRING, sizeof (char));
\r
7365 cw_star=make_group_aa ( &ng_cw_star, strcpy ( mat,"idmat"));
\r
7366 cw_col=make_group_aa ( &ng_cw_col, strcpy (mat,"clustalw_col"));
\r
7367 cw_dot=make_group_aa ( &ng_cw_dot, strcpy (mat, "clustalw_dot"));
\r
// One counter per group, zeroed by vcalloc.
7370 cw_star_count=vcalloc (ng_cw_star, sizeof (int));
\r
7371 cw_col_count=vcalloc ( ng_cw_col, sizeof (int));
\r
7372 cw_dot_count=vcalloc (ng_cw_dot, sizeof (int));
\r
7374 for ( a=0; a< B->nseq; a++)
\r
7376 c=tolower (B->seq_al[a][col]);
\r
// A gap anywhere in the column ends the scan; c keeps the gap character.
7377 if (is_gap(c)){r=' ';break;}
\r
// For each group table, count the residues of the column that belong to each group.
7379 for ( b=0; b< ng_cw_star; b++)
\r
7380 cw_star_count[b]+=is_in_set (c, cw_star[b]);
\r
7381 for ( b=0; b< ng_cw_col; b++)
\r
7382 cw_col_count[b]+=is_in_set (c, cw_col[b]);
\r
7383 for ( b=0; b< ng_cw_dot; b++)
\r
7384 cw_dot_count[b]+=is_in_set (c, cw_dot[b]);
\r
// Strongest level first; each level only runs while r is still ' ' and no gap was met.
7391 if ( !is_gap(c) && r==' ')
\r
7392 for ( b=0; b< ng_cw_star; b++)if ( cw_star_count[b]==B->nseq){r='*'; break;}
\r
7393 if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA")))
\r
7394 for ( b=0; b< ng_cw_col ; b++)if ( cw_col_count [b]==B->nseq){r=':'; break;}
\r
7395 if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA")))
\r
7396 for ( b=0; b< ng_cw_dot ; b++)if ( cw_dot_count [b]==B->nseq){r='.'; break;}
\r
7400 vfree(cw_star_count);
\r
7401 vfree(cw_col_count);
\r
7402 vfree(cw_dot_count);
\r
// get_cov_aln_array: pairwise coverage matrix of A.
// w[a][b] (and its mirror w[b][a]) is the percentage of alignment columns in which
// both row a and row b carry a residue. mode is not used by the statements shown here.
7408 int ** get_cov_aln_array ( Alignment *A, char *mode)
\r
7413 w=declare_int ( A->nseq, A->nseq);
\r
7416 for ( a=0; a< A->nseq-1; a++)
\r
7419 for ( b=a+1; b< A->nseq; b++)
\r
// The counter restarts for every pair; resetting it once per a let the counts of
// earlier pairs leak into later ones.
7421 for ( t=0,c=0; c< A->len_aln; c++)
\r
7423 t+=(!is_gap(A->seq_al[a][c]) &&!is_gap(A->seq_al[b][c]));
\r
7425 w[a][b]=w[b][a]=(t*100)/A->len_aln;
\r
// get_cov_master_aln_array: coverage of every row of A against the master row n.
// w[n][b] (and its mirror w[b][n]) is the percentage of alignment columns in which
// both the master row and row b carry a residue; the other cells are left as
// allocated by declare_int. mode is not used by the statements shown here.
7431 int ** get_cov_master_aln_array ( Alignment *A,int n, char *mode)
\r
7436 w=declare_int ( A->nseq, A->nseq);
\r
7439 for (b=0; b< A->nseq; b++)
\r
7442 for (t=0, c=0; c< A->len_aln; c++)
\r
// Master row n against row b: testing row n twice only measured the master itself.
7444 t+=(!is_gap(A->seq_al[n][c]) &&!is_gap(A->seq_al[b][c]));
\r
7446 w[n][b]=w[b][n]=(t*100)/A->len_aln;
\r
// get_sim_master_aln_array: similarity of every row of A against the master row n.
// Only row n and column n of the returned matrix are filled. The "cdna" mode uses
// get_cdna_seq_sim with the first cdna cache, any other mode uses get_seq_sim.
7451 int ** get_sim_master_aln_array ( Alignment *A,int n, char *mode)
\r
7456 w=declare_int ( A->nseq, A->nseq);
\r
7459 for ( a=0; a< A->nseq; a++)
\r
7461 if ( strm (mode, "cdna"))
\r
7462 w[n][a]=w[a][n]=get_cdna_seq_sim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[n],GAP_LIST, mode);
\r
7464 w[n][a]=w[a][n]=get_seq_sim ( A->seq_al[n], A->seq_al[a],GAP_LIST, mode);
\r
// get_dist_aln_array: pairwise distance matrix of A, obtained by converting the
// similarity matrix in place (every cell becomes MAXID-cell).
7468 int ** get_dist_aln_array ( Alignment *A, char *mode)
\r
7473 w=get_sim_aln_array ( A, mode);
\r
7474 return sim_array2dist_array(w,MAXID);
\r
// seq2filter: keep the sequences of Sin that have at least one partner whose pairwise
// identity lies strictly between min and max.
// The input is duplicated and ungapped, all pairs are scored with idscore_pairseq on
// the blosum62mt matrix, and the retained sequences are written to a temporary FASTA
// file that is read back with main_read_seq.
// Returns NULL when nothing is retained; the work data are released on that path too.
7476 Sequence * seq2filter (Sequence *Sin, int min, int max)
\r
7480 Sequence *S, *Sout;
\r
7486 S=duplicate_sequence (Sin);
\r
7487 for (a=0; a<S->nseq; a++)ungap(S->seq[a]);
\r
// keep[a] is set once sequence a takes part in an accepted pair.
7488 keep=vcalloc (S->nseq, sizeof (int));
\r
// Same matrix name as every other read_matrice call in this file.
7489 M=read_matrice ("blosum62mt");
\r
7490 for (a=0; a<S->nseq; a++)
\r
7492 output_completion ( stderr, a, S->nseq, 100, "Distance Matrix Computation: ");
\r
7493 for ( b=a+1; b<S->nseq; b++)
\r
7496 sim=idscore_pairseq(S->seq[a], S->seq[b],-10, -2,M, "sim");
\r
7497 if ( sim>min && sim<max)keep[a]=keep[b]=1;
\r
7498 fprintf ( stderr, "\nSim %d Min %d Max %d", sim, min, max);
\r
7502 tmpfile=vtmpnam (NULL);
\r
7503 fp=vfopen (tmpfile, "w");
\r
7504 for (n=0,a=0; a< S->nseq; a++)
\r
// Each record ends with a newline so that the next '>' header starts its own line.
7507 fprintf ( fp, ">%s %s\n%s\n", S->name[a], S->seq_comment[a], S->seq[a]);
\r
7511 if (n==0) {free_int (M, -1); vfree (keep); free_sequence (S, -1); return NULL;}
\r
7512 Sout=main_read_seq(tmpfile);
\r
7513 free_int (M, -1); vfree (keep); free_sequence (S, -1);
\r
// grep_seq: filter the sequences of S by matching string against one field.
//   field : "NAME", "COMMENT" or "SEQ".
//   mode  : "KEEP" retains the matching sequences, "REMOVE" retains the others.
// The retained sequences are written in FASTA form to a temporary file that is read
// back with main_read_aln. Returns NULL when nothing is retained.
// An unknown field or mode is reported and ends the program.
7517 Alignment * grep_seq (Alignment *S,char *field, char *mode, char *string)
\r
7524 tmp=vtmpnam (NULL);
\r
7525 fp=vfopen (tmp, "w");
\r
7527 if ( !strm(mode, "KEEP") && ! strm (mode, "REMOVE"))
\r
7529 add_warning ( stderr, "\nERROR: +grep <field> <KEEP|REMOVE> <string> [FATAL: %s]", PROGRAM);
\r
7530 myexit (EXIT_FAILURE);
\r
7532 else if ( !strm(field, "SEQ") && ! strm (field, "COMMENT") && ! strm(field, "NAME"))
\r
7534 add_warning ( stderr, "\nERROR: +grep <NAME|COMMENT|SEQ> <mode> <string> [FATAL: %s]", PROGRAM);
\r
7535 myexit (EXIT_FAILURE);
\r
7539 for (n=0, a=0; a< S->nseq; a++)
\r
// An empty comment never matches.
7543 if (strm(field, "NAME") && perl_strstr (S->name[a], string))found=1;
\r
7544 else if (strm(field, "COMMENT") && S->seq_comment[a][0] && perl_strstr (S->seq_comment[a], string) )found=1;
\r
7545 else if (strm(field, "SEQ") && perl_strstr (S->seq_al[a], string))found=1;
\r
7547 if ( (strm (mode, "KEEP") && found) || (strm (mode, "REMOVE") && !found))
\r
// FASTA record: header, optional comment, then the aligned sequence.
7550 fprintf (fp, ">%s", S->name[a]);
\r
7551 if (S->seq_comment[a][0])fprintf (fp, " %s", S->seq_comment[a]);
\r
7552 fprintf (fp, "\n%s\n", S->seq_al[a]);
\r
7559 if ( n==0) return NULL;
\r
7561 return main_read_aln (tmp, NULL);
\r
// modify_seq: replace string1 with string2 in one field ("NAME", "COMMENT" or "SEQ")
// of every sequence of S, write the result in FASTA form to a temporary file and
// read it back with main_read_aln.
// NOTE(review): the COMMENT substitution is applied to seq_comment, while the header
// written below uses aln_comment -- confirm which field is intended.
7564 Alignment * modify_seq (Alignment *S, char *field, char *string1, char *string2)
\r
7570 tmp=vtmpnam (NULL);
\r
7571 fp=vfopen (tmp, "w");
\r
7572 for ( a=0; a< S->nseq; a++)
\r
// substitute returns the buffer to keep, so its result is stored back.
7574 if (strm(field, "NAME"))S->name[a]=substitute ( S->name[a], string1, string2);
\r
7575 else if (strm(field, "COMMENT"))S->seq_comment[a]=substitute ( S->seq_comment[a], string1, string2);
\r
7576 else if (strm(field, "SEQ"))S->seq_al[a]=substitute ( S->seq_al[a], string1, string2);
\r
7577 fprintf (fp, ">%s", S->name[a]);
\r
7578 if (S->aln_comment[a][0])fprintf (fp, " %s", S->aln_comment[a]);
\r
7579 fprintf (fp, "\n%s\n", S->seq_al[a]);
\r
7583 S=main_read_aln (tmp, NULL);
\r
// seq2sim_mat: pairwise matrix of S computed by seq2comp_mat in "sim" comparison mode.
7587 int ** seq2sim_mat (Sequence *S, char *mode)
\r
7589 return seq2comp_mat ( S,mode, "sim");
\r
// seq2cov_mat: pairwise matrix of S computed by seq2comp_mat in "cov" comparison mode.
7591 int ** seq2cov_mat (Sequence *S, char *mode)
\r
7593 return seq2comp_mat ( S,mode, "cov");
\r
// seq2comp_mat: pairwise comparison matrix of the sequences in S.
//   mode      : substitution matrix name handed to read_matrice.
//   comp_mode : measure handed to idscore_pairseq (the wrappers pass "sim" or "cov").
// A matrix stored earlier in the cache directory is reused when it can be read;
// otherwise all pairs are scored and the result is written to that cache file.
7596 int ** seq2comp_mat (Sequence *S, char *mode, char *comp_mode)
\r
7605 /*Use pre_computed value if available in the current dir*/
\r
// Cache file name: <cache dir><sequence file name>.<mode>.<comp_mode>_file
7607 name=path2filename(S->file[0]);
\r
7608 sprintf ( file, "%s%s.%s.%s_file", get_cache_dir(),name, mode, comp_mode);
\r
7609 A=seq2aln(S,NULL, RM_GAP);
\r
7610 if ( check_file_exists (file) && is_distance_matrix_file (file) && (sim=input_similarities(file, A, NULL))!=NULL)
\r
7612 display_input_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK);
\r
7613 fprintf ( stderr, "\n");
\r
7620 M=read_matrice (mode);
\r
7621 sim=declare_int ( S->nseq, S->nseq);
\r
// The sequences of S are ungapped in place before scoring.
7622 for ( a=0; a< S->nseq; a++)
\r
7624 ungap (S->seq[a]);
\r
7628 for ( a=0; a<S->nseq-1; a++)
\r
7631 output_completion4halfmat ( stderr, a, S->nseq, 100, "Similarity Matrix Computation: ");
\r
7632 for ( b=a+1; b< S->nseq; b++)
\r
7634 sim[a][b]=sim[b][a]=idscore_pairseq(S->seq[a], S->seq[b],-12, -1,M, comp_mode);
\r
// The matrix address is passed to output_similarities inside the mode string.
// NOTE(review): the pointer goes through long int, narrower than a pointer on LLP64 -- confirm.
7638 sprintf ( mode2, "_memory_%ld", (long int)sim);
\r
7639 output_similarities( file, A, mode2);
\r
7640 display_output_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK);
\r
7641 fprintf ( stderr, "\n");
\r
// fast_aln2sim_list: identity-based similarity for a list of row pairs of A.
//   ns, ls : sizes and row indices of the two groups to pair (ns[0] x ns[1] pairs);
//            a default covering every row of A in both groups is built below.
//   mode   : selects the measure by substring: sim1, sim2, sim3, gap1, cov1 or logid.
// The result has ns[0]*ns[1]+1 rows of three ints.
7647 int ** fast_aln2sim_list (Alignment *A, char *mode, int *ns, int **ls)
\r
7650 int p1, p2, p3, r1, r2;
\r
7651 int gap,pos0,pos1,pos2,len,sim;
\r
7652 int a, b, c, m, s=0,s1, s2, n;
\r
// Default grouping: both groups hold all rows of A.
7658 ns=vcalloc (2, sizeof (int));
\r
7659 ns[0]=ns[1]=A->nseq;
\r
7660 ls=declare_int (2, A->nseq);
\r
7661 for ( a=0; a< 2; a++)
\r
7662 for (b=0; b<A->nseq; b++)
\r
7667 simm=declare_int (ns[0]*ns[1]+1, 3);
\r
// Map the mode string to a measure index m.
7669 if (strstr (mode, "sim1"))m=0;
\r
7670 else if (strstr (mode, "sim2"))m=1;
\r
7671 else if (strstr (mode, "sim3"))m=2;
\r
7672 else if (strstr (mode, "gap1"))m=3;
\r
7673 else if (strstr (mode, "cov1"))m=4;
\r
7674 else if (strstr (mode, "logid"))m=5;
\r
// n numbers the pairs in enumeration order.
7679 for (n=0,a=0; a<ns[0]; a++)
\r
7682 for ( b=0; b<ns[1]; b++, n++)
\r
7685 gap=pos0=pos1=pos2=len=sim=0;
\r
7687 for ( c=0; c< A->len_aln; c++)
\r
// Comparison is case-insensitive.
7689 r1=tolower (A->seq_al[s1][c]);
\r
7690 r2=tolower (A->seq_al[s2][c]);
\r
// p3==0: column skipped; p3==2: counted in pos0.
7694 if ( p3==0)continue;
\r
7696 if ( r1==r2)sim++;
\r
7699 pos0+=(p3==2)?1:0;
\r
7703 if (m==0)s=(pos0==0)?0:(sim*MAXID)/pos0; //sim1
\r
7704 else if (m==1) s=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2
\r
7705 else if (m==2) s=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3
\r
7706 else if (m==3) s=(len==0) ?0:((len-gap)*MAXID)/len;//gap1
\r
7707 else if (m==4) s=(len==0) ?0:((pos0)*MAXID)/len; //cov
\r
7710 s=logid_score ( sim, len);
\r
// The default grouping built above is released when free_ns is set.
7718 if ( free_ns) {vfree(ns); free_int (ls, -1);}
\r
// fast_aln2sim_mat: identity-based pairwise similarity matrix of A (symmetric, one
// cell per pair of rows). mode selects the measure by substring: sim1, sim2, sim3,
// gap1, cov1 or logid.
7723 int ** fast_aln2sim_mat (Alignment *A, char *mode)
\r
7726 int p1, p2, p3, r1, r2;
\r
7727 int gap,pos0,pos1,pos2,len,sim;
\r
7730 simm=declare_int (A->nseq, A->nseq);
\r
// Map the mode string to a measure index m.
7734 if (strstr (mode, "sim1"))m=0;
\r
7735 else if (strstr (mode, "sim2"))m=1;
\r
7736 else if (strstr (mode, "sim3"))m=2;
\r
7737 else if (strstr (mode, "gap1"))m=3;
\r
7738 else if (strstr (mode, "cov1"))m=4;
\r
7739 else if (strstr (mode, "logid"))m=5;
\r
7744 for ( a=0; a< A->nseq-1; a++)
\r
7747 for ( b=a+1; b< A->nseq; b++)
\r
7749 gap=pos0=pos1=pos2=len=sim=0;
\r
7751 for ( c=0; c< A->len_aln; c++)
\r
// Comparison is case-insensitive.
7753 r1=tolower (A->seq_al[a][c]);
\r
7754 r2=tolower (A->seq_al[b][c]);
\r
// p3==0: column skipped; p3==2: counted in pos0.
7758 if ( p3==0)continue;
\r
7760 if ( r1==r2)sim++;
\r
7763 pos0+=(p3==2)?1:0;
\r
7767 if (m==0)simm[a][b]=simm[b][a]=(pos0==0)?0:(sim*MAXID)/pos0; //sim1
\r
7768 else if (m==1) simm[a][b]=simm[b][a]=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2
\r
7769 else if (m==2) simm[a][b]=simm[b][a]=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3
\r
7770 else if (m==3) simm[a][b]=simm[b][a]=(len==0) ?0:((len-gap)*MAXID)/len;//gap1
\r
7771 else if (m==4) simm[a][b]=simm[b][a]=(len==0) ?0:((pos0)*MAXID)/len; //cov
\r
7775 //Inspired from Muscle +mafft 5
\r
7776 simm[a][b]=simm[b][a]=logid_score ( sim, len);
\r
// logid_score: log-scaled identity score.
//   sim : number of identical positions.
//   len : number of compared positions.
// With len==0 the result is 0.33*MAXID. Otherwise the identity ratio sim/len is
// mapped to -log10(1-ratio), capped at 1.0 for ratios above 0.9, and scaled by MAXID.
7782 int logid_score ( int sim, int len)
\r
7786 if ( len==0)return (int)(0.33*(float)MAXID);
\r
7788 score=(float)sim/(float)len;
\r
// -log10(1-0.9) is 1.0, so the cap is continuous with the formula at 0.9.
7789 if (score>0.9) score=1.0;
\r
7790 else score=-log10 (1.0-score);
\r
7792 score=(score*MAXID);
\r
// aln2sim_mat: pairwise similarity matrix of A. Modes containing "idmat" use the
// fast identity-based implementation, every other mode goes through get_sim_aln_array.
7795 int ** aln2sim_mat (Alignment *A, char*mode)
\r
7799 if ( strstr (mode, "idmat"))return fast_aln2sim_mat(A, mode);
\r
7800 return get_sim_aln_array(A, mode);
\r
// aln2cov: pairwise coverage matrix of A (symmetric).
// For each pair of rows, columns where both rows have a residue are counted in pos0
// and columns where exactly one row has a gap in gap; columns with two gaps are
// ignored. The cell is pos0*100/(gap+pos0), or 0 when no column was counted.
7802 int ** aln2cov (Alignment *A)
\r
7805 int r1, r2, gr1, gr2, pos0, gap;
\r
7807 cov=declare_int (A->nseq, A->nseq);
\r
7809 for (a=0; a< A->nseq-1; a++)
\r
7812 for ( b=a+1; b<A->nseq; b++)
\r
// The counters restart for every pair.
7814 for (gap=0,pos0=0,c=0;c<A->len_aln; c++)
\r
7816 r1=A->seq_al[a][c];
\r
7817 r2=A->seq_al[b][c];
\r
7818 gr1=is_gap(r1); gr2=is_gap(r2);
\r
7819 if ( gr1+gr2==0)pos0++;
\r
7820 else if ( gr1+gr2<2)gap++;
\r
7822 cov[a][b]=cov[b][a]=((gap+pos0)==0)?0:((pos0*100)/(gap+pos0));
\r
7827 int ** get_raw_sim_aln_array (Alignment *A, char *mode)
\r
7831 int a, b, c, r1, r2, set, max, min;
\r
7833 w=declare_int (A->nseq, A->nseq);
\r
7834 if (strstr(mode, "sar"))M=NULL;
\r
7835 else M=read_matrice (mode);
\r
7837 HERE ("RAW STUFF");
\r
7839 for ( set=0,a=0; a< A->nseq; a++)
\r
7840 for (b=a; b<A->nseq; b++)
\r
7844 for (c=0; c<A->len_aln; c++)
\r
7846 r1=A->seq_al[a][c];
\r
7847 r2=A->seq_al[b][c];
\r
7849 if ( !is_gap(r1) && !is_gap(r2))
\r
7850 w[a][b]+=M[r1-'A'][r2-'A'];
\r
7853 else if ( strm (mode, "sarmat2"))
\r
7855 w[a][b]=get_sar_sim2 (A->seq_al[a], A->seq_al[b]);
\r
7859 HERE ("ERROR: %s is an unknown mode of raw_sim\n", mode); exit (0);
\r
7863 if (!set){min=max=w[a][b];set=1;}
\r
7864 min=MIN(min,w[a][b]);
\r
7865 max=MAX(max,w[a][b]);
\r
7867 for (a=0; a<A->nseq; a++)
\r
7868 for (b=a; b<A->nseq; b++)
\r
7870 w[b][a]=((max-min)==0)?0:((w[b][a]-min)*100)/(max-min);
\r
// get_sim_aln_array: symmetric pairwise similarity matrix of A, one call to
// generic_get_seq_sim per pair of rows. The first cdna cache is passed along when
// the alignment has one, NULL otherwise.
7876 int ** get_sim_aln_array ( Alignment *A, char *mode)
\r
7882 w=declare_int ( A->nseq, A->nseq);
\r
7884 for ( a=0; a< A->nseq-1; a++)
\r
7886 for ( b=a+1; b< A->nseq; b++)
\r
7889 w[a][b]=w[b][a]=generic_get_seq_sim ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode);
\r
// generic_get_seq_sim: dispatch a pairwise similarity request on the mode string.
//   "cdna"             : codon-based comparison, using cache.
//   "ktup<k>"          : k-tuple comparison, k parsed from the text after "ktup".
//   contains "sarmat2" : get_sar_sim2.
//   contains "sarmat"  : get_sar_sim.
//   anything else      : get_seq_sim with GAP_LIST as the ignore set.
// "sarmat2" is tested before "sarmat" because the latter is a substring of the former.
7894 int generic_get_seq_sim ( char *seq1, char *seq2, int*cache, char *mode)
\r
7898 if ( strm (mode, "cdna"))
\r
7899 return get_cdna_seq_sim ( cache, seq1, seq2,GAP_LIST, mode);
\r
7900 else if ( strnm (mode, "ktup",4))
\r
7901 return ktup_comparison (seq1, seq2,atoi(mode+4));
\r
7902 else if ( strstr (mode, "sarmat2"))
\r
7905 return get_sar_sim2 (seq1, seq2);
\r
7907 else if ( strstr (mode, "sarmat"))
\r
7908 return (int) get_sar_sim (seq1,seq2);
\r
7911 return get_seq_sim ( seq1,seq2,GAP_LIST, mode);
\r
// get_winsim_aln_array: fill w[a][b] with the windowed similarity profile of every
// ordered pair of rows of A, including a==b. The "cdna" mode uses the cDNA variant
// with the first cdna cache. Each w[a][b] is passed in as the output buffer and
// replaced by the returned pointer.
7914 int *** get_winsim_aln_array ( Alignment *A,char *mode, int ***w)
\r
7917 for ( a=0; a< A->nseq; a++)
\r
7918 for ( b=0; b< A->nseq; b++)
\r
7920 if ( strm (mode, "cdna"))
\r
7921 w[a][b]=get_cdna_seq_winsim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]);
\r
7923 w[a][b]=get_seq_winsim ( A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]);
\r
// seq2profile: return the R-template profile of sequence i of S.
// When no profile is available yet, the sequence is written alone to a temporary
// FASTA file, an R template is built from that file and attached to S->T[i], and the
// profile is requested again.
7928 Alignment * seq2profile (Sequence *S, int i)
\r
7932 if ((A=seq2R_template_profile (S, i)))
\r
7940 tmp=vtmpnam (NULL);
\r
7941 fp=vfopen ( tmp, "w");
\r
7942 fprintf (fp, ">%s\n%s\n", S->name[i], S->seq[i]);
\r
7945 (S->T[i])->R=fill_R_template (S->name[i], tmp, S);
\r
7947 return seq2R_template_profile (S, i);
\r
7951 Alignment* aln2sub_aln_file (Alignment *A, int n, char **string)
\r
7956 list=vcalloc (A->nseq, sizeof (char***));
\r
7957 if ( n==0)return A;
\r
7963 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
\r
7964 buf=vcalloc ( 2*n+l+1, sizeof (char));
\r
7965 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
\r
7966 list[0]=string2list (buf);
\r
7969 else if ( file_exists (NULL,string[0]))
\r
7971 list=read_group (string[0]);
\r
7976 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
\r
7977 myexit (EXIT_FAILURE);
\r
7986 n=atoi (list[a][0]);
\r
7987 fp=vfopen (list[a][1], "w");
\r
7988 for (b=2; b<n; b++)
\r
7990 i=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
\r
7991 if (n==3)ungap (A->seq_al[i]);
\r
7992 fprintf (fp, ">%s\n%s\n", A->name[i], A->seq_al[i]);
\r
7995 free_char (list[a], -1);
\r
// remove_empty_sequence: drop from S the sequences that contain no residue, with a
// warning for each, and return a duplicate of the cleaned set; the input S is freed.
8001 Sequence *remove_empty_sequence (Sequence *S)
\r
// Scratch buffer large enough for the longest sequence of S.
8007 c=vcalloc ( S->max_len+1, sizeof (char));
\r
// a reads the sequences; b is presumably the write index of the kept ones -- TODO confirm.
8009 for (a=0, b=0; a< S->nseq; a++)
\r
8011 sprintf ( c, "%s",S->seq[a]);
\r
8013 if ( strlen (c)==0)
\r
8015 //vfree (S->seq[a]);
\r
8017 add_warning ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]",S->name[a], PROGRAM);
\r
8020 NS=duplicate_sequence (S);
\r
8021 free_sequence (S, S->nseq);
\r
// aln2sub_seq: reduces each sequence group to one representative sequence and returns the
// representatives as an alignment built by seq2aln with RM_GAP.
// The group list is obtained exactly as in aln2sub_aln_file: joined command-line strings parsed
// by string2list, or a group file read by read_group, or a fatal error when the file is absent.
// For each group, the file named list[a][1] is read as an alignment B, the row index returned by
// aln2most_similar_sequence (B, "idmat") is extracted with extract_one_seq and appended to S,
// then B, the extracted Sequence and the file itself are released or removed.
// NOTE(review): how the files named list[a][1] get written is not visible in this listing.
// NOTE(review): list is allocated before the n==0 early return, and the read_group result
// overwrites that allocation; no release is visible for either case.
8025 Alignment* aln2sub_seq (Alignment *A, int n, char **string)
\r
8031 list=vcalloc (A->nseq, sizeof (char***));
\r
8032 if ( n==0)return A;
\r
8038 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
\r
8039 buf=vcalloc ( 2*n+l+1, sizeof (char));
\r
8040 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
\r
8041 list[0]=string2list (buf);
\r
8044 else if ( file_exists (NULL,string[0]))
\r
8046 list=read_group (string[0]);
\r
8051 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
\r
8052 myexit (EXIT_FAILURE);
\r
8065 B=main_read_aln (list[a][1], NULL);
\r
8066 t=aln2most_similar_sequence(B, "idmat");
\r
8067 subS=extract_one_seq(B->name[t],0,0,B,KEEP_NAME);
\r
8068 S=add_sequence (subS,S,0);
\r
8069 free_aln (B);free_sequence (subS, -1);
\r
8070 vremove (list[a][1]);
\r
8074 return seq2aln (S, NULL, RM_GAP);
\r
// aln2collapsed_aln: collapses groups of sequences of A into one consensus row per group.
// Visible stages:
//   1. Build the group list: either the n strings joined with spaces form a single group
//      (list[0], ns=1), or string[0] names a file whose lines starting with the character >
//      each describe one group; a missing file is fatal.
//   2. Mark every sequence named by a group in collapsed[]; each unmarked sequence gets its own
//      singleton group (count "3", group name and single member both equal to the sequence name).
//   3. Extract one row per group (the first member, list[a][2]) into B with extract_sub_aln2,
//      rename each row to its group name (list[a][1]), then overwrite each row with the consensus
//      of the group members computed by sub_aln2cons_seq_mat2 with "blosum62mt".
// Group layout: entry 0 = number of entries as text, entry 1 = group name, entries 2.. = members.
// NOTE(review): the increment of ns for singleton groups and the return of B are on lines missing
// from this listing.
8077 Alignment * aln2collapsed_aln (Alignment * A, int n, char **string)
\r
8084 int a, b,c, ns, m, l;
\r
8087 list=vcalloc (A->nseq, sizeof (char***));
\r
8089 if ( n==0)return A;
\r
8092 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
\r
8093 buf=vcalloc ( 2*n+l+1, sizeof (char));
\r
8094 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
\r
8096 list[0]=string2list (buf);ns=1;
\r
8099 else if ( file_exists (NULL,string[0]))
\r
8101 /*Format: Fasta like, the name fo the group followed with the name of the sequences
\r
8102 ><Group name> <First Seq> <second seq> ....
\r
8103 Groups must NOT be overlaping
\r
8105 l=measure_longest_line_in_file (string[0])+1;
\r
8106 buf=vcalloc (l, sizeof (char));
\r
8108 fp=vfopen (string[0], "r");
\r
// The first character of each line is consumed by fgetc and the rest of the line by fgets.
// NOTE(review): fgets returns NULL at end of file or on error; assigning that to buf would drop
// the buffer and hand NULL to string2list - confirm this cannot happen for accepted inputs.
8109 while ((c=fgetc(fp))!=EOF)
\r
8111 buf=fgets (buf,l-1, fp);
\r
8112 if ( c=='>')list[ns++]=string2list (buf);
\r
8118 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
\r
8119 myexit (EXIT_FAILURE);
\r
8122 vfree (buf); buf=NULL;
\r
8124 /*Identify lost sequences*/
\r
8125 collapsed=vcalloc (A->nseq, sizeof (int));
\r
8126 for ( a=0; a< ns; a++)
\r
8128 m=atoi (list[a][0]);
\r
8129 for (b=2; b<m ; b++)
\r
8131 c=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
\r
8132 if ( c>=0)collapsed[c]=1;
\r
8135 for ( a=0; a< A->nseq; a++)
\r
8137 if ( collapsed[a]==0)
\r
8139 list[ns]=declare_char (3, MAXNAMES);
\r
8140 sprintf ( list[ns][0], "3");
\r
8141 sprintf ( list[ns][1], "%s", A->name[a]);
\r
8142 sprintf ( list[ns][2], "%s", A->name[a]);
\r
8146 vfree (collapsed);
\r
// NOTE(review): list2 rows are 100 bytes wide while group entries above are sized with MAXNAMES;
// the unbounded sprintf copies below rely on names being shorter than 100 characters.
8152 list2=declare_char (A->nseq, 100);
\r
8153 /*1 Collapse the alignment*/
\r
8154 for ( a=0; a< ns; a++)
\r
8156 sprintf ( list2[a], "%s", list[a][2]);
\r
8158 B=extract_sub_aln2 ( A, ns, list2);
\r
8159 /*2 Rename the sequences*/
\r
8160 for ( a=0; a< ns; a++)
\r
8162 sprintf ( B->name[a], "%s", list[a][1]);
\r
8164 /*replace sequence with consensus*/
\r
8166 for ( a=0; a< ns; a++)
\r
8168 m=atoi (list[a][0]);
\r
8169 for (c=0, b=2; b<m;c++, b++)
\r
8171 sprintf ( list2[c], "%s", list[a][b]);
\r
8173 buf=sub_aln2cons_seq_mat2 ( A,m-2,list2, "blosum62mt");
\r
8174 sprintf (B->seq_al[a], "%s", buf);
\r
// aln2profile: (re)builds the profile attached to A.
// A->P is set from declare_profile (AA_ALPHABET, A->len_aln+1), the three count tables it holds are
// freed and recomputed: count from aln2count_mat (A), count2 from aln2count_mat2 (A), and count3
// from aln2count_mat2 applied to a copy B of A whose first row has been overwritten with the
// "blosum62mt" consensus returned by aln2cons_seq_mat.
// NOTE(review): lines are missing from this listing around the declare_profile call and between the
// consensus copy and the count3 computation; whether declare_profile is conditional, whether B is
// reduced to its first row, and what is returned cannot be confirmed from here.
8182 Alignment * aln2profile (Alignment * A)
\r
8184 Alignment *B=NULL;
\r
8189 A->P=declare_profile (AA_ALPHABET,A->len_aln+1);
\r
8191 B=copy_aln (A, B);
\r
8192 free_int ((A->P)->count, -1);
\r
8193 free_int ((A->P)->count2, -1);
\r
8194 free_int ((A->P)->count3, -1);
\r
8195 (A->P)->count=aln2count_mat (A);
\r
8196 (A->P)->count2=aln2count_mat2 (A);
\r
8198 cons=aln2cons_seq_mat (A, "blosum62mt");
\r
8200 sprintf (B->seq_al[0], "%s", cons);
\r
8202 (A->P)->count3=aln2count_mat2 (B);
\r
// aln2count_mat2: convenience wrapper that forwards to sub_aln2count_mat2 with ns=0 and ls=NULL.
// Presumably that combination selects every sequence of A - see sub_aln2count_mat2 to confirm.
8212 int** aln2count_mat2 ( Alignment *A)
\r
8214 return sub_aln2count_mat2 (A, 0, NULL);
\r
// sub_aln2nseq_prf: walks the ns sequences of A selected by ls and, for each, asks
// seq2R_template_profile whether an R-template profile is attached (only when A->S is set).
// Presumably the result is the number of rows the selection expands to once profiles are
// counted - the accumulation and the return are on lines missing from this listing, TODO confirm.
// When no selection is supplied the function builds its own identity list (ls[a]=a for every
// sequence of A) and releases it before returning, as tracked by free_ls.
// NOTE(review): the list is allocated with n entries but filled for A->nseq entries; the
// assignment of n is not visible here, so confirm n >= A->nseq.
8217 int sub_aln2nseq_prf ( Alignment *A, int ns, int *ls)
\r
8230 ls=vcalloc (n, sizeof (int));
\r
8231 for ( a=0; a<A->nseq; a++)ls[a]=a;
\r
8239 for (c=0,a=0; a<ns; a++)
\r
8242 if ( A->S && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL)
\r
8252 if ( free_ls) vfree (ls);
\r
// sub_aln2count_mat2: gathers the rows that represent the ns selected sequences of A into the
// pointer array p and returns (presumably - the return line is not visible) the column count
// table computed on those rows by sub_aln2count_mat3.
// For each selected sequence s:
//   - if A->S is set and an R-template profile exists for it, every row of that profile is added;
//   - otherwise the aligned row A->seq_al[s] is added w times, with w=A->order[s][4]+1.
// When no selection is supplied an identity list is built and later released (free_ls).
// NOTE(review): p is sized with n pointers and only the profile branch visibly reallocates it;
// the updates of n are on lines missing from this listing, so confirm that c can never exceed
// the capacity of p, in particular in the branch that repeats a row w times.
8256 int** sub_aln2count_mat2 ( Alignment *A, int ns, int *ls)
\r
8268 p=vcalloc ( n, sizeof (char*));
\r
8269 ls=vcalloc (n, sizeof (int));
\r
8270 for ( a=0; a<A->nseq; a++)ls[a]=a;
\r
8276 p=vcalloc (n, sizeof (char*));
\r
8279 for (c=0,a=0; a<ns; a++)
\r
8282 if ( A->S && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL)
\r
8285 p=vrealloc (p, n*sizeof (char*));
\r
8286 for (b=0; b<R->nseq; b++)
\r
8288 p[c++]=R->seq_al[b];
\r
8294 w=A->order[s][4]+1;
\r
8296 for (b=0; b<w; b++)
\r
8297 p[c++]=A->seq_al[s];
\r
8300 count=sub_aln2count_mat3 (p,c);
\r
8302 if ( free_ls) vfree (ls);
\r
// sub_aln2count_mat3: builds a per-column residue count table for the ns rows in al.
// All rows are read up to strlen (al[0]) columns, so they are expected to share that length.
// The table has len+2 rows of 100+ns+2 ints; row len is a terminator (END_ARRAY in slot 0,
// len in slot 2). The layout of a column row is described by the original comment below.
// Residues are lower-cased before counting; used[] maps a residue code to its slot in the
// current column and is reset slot by slot after each column.
// NOTE(review): the lines following "Option for multi channeling" refer to A, which is not a
// parameter of this function; presumably that section is disabled in the full file (its
// surroundings are missing from this listing) - TODO confirm.
// NOTE(review): the HERE ("Display ") section prints the first five columns to stderr; whether it
// is compiled in unconditionally is not visible here.
8305 int** sub_aln2count_mat3 (char **al, int ns)
\r
8316 /*count[x][0]=n symbols in column
\r
8317 count[x][1]=total_size of line
\r
8318 count[x][2]=Gap frequency
\r
8320 count[x][n]=symbol n
\r
8321 count[x][n+1]=N occurence symbol n;
\r
8322 count[x][n+2]=N frequence symbol n*100;
\r
8324 special multi-channeling
\r
8325 count[x][count[x][1]]=Nseq
\r
8326 count[x][count[x][1]+s]=residue col x, sequence s
\r
8330 for (a=0; a< 1000; a++)used[a]=0;
\r
8331 len=strlen (al[0]);
\r
8333 count=declare_int (len+2,100+ns+2);
\r
8334 count[len][0]=END_ARRAY;
\r
8336 count[len][2]=len;
\r
8340 for (a=0; a<len; a++)
\r
8342 for (us=ns, b=0; b<ns; b++)
\r
8344 r=tolower (al[b][a]);
\r
8346 if (is_gap(r))us--;
\r
8349 count[a][used[r]*3+1]++;
\r
8353 used[r]=++count[a][0];
\r
8354 count[a][used[r]*3]=r;
\r
8355 count[a][used[r]*3+1]++;
\r
8358 count[a][1]=count[a][0]*3+2;
\r
8359 /*count[a][2]=(A->nseq-us)*100/A->nseq;*/
\r
8360 count[a][2]=ns-us;
\r
8362 for (b=3; b<count[a][1]; b+=3)
\r
8364 count[a][b+2]=(count[a][b+1]*100)/us;
\r
8365 used[count[a][b]]=0;
\r
8369 /*Option for multi channeling*/
\r
8372 count[a][count[a][1]]=A->nseq;
\r
8373 for (b=1; b<=A->nseq; b++)
\r
8374 count [a][count[a][1]+b]=(is_gap(A->seq_al[b-1][a]))?0:A->seq_al[b-1][a];
\r
8378 HERE ("Display ");
\r
8379 for (a=0; a< 5; a++)
\r
8381 fprintf ( stderr, "\n");
\r
8382 for ( b=3; b< count[a][1]; b+=3)
\r
8384 fprintf ( stderr, "[%c %d]", count[a][b], count[a][b+1]);
\r
8386 fprintf ( stderr, "\n");
\r
8387 for ( b=0; b<ns; b++)
\r
8389 fprintf ( stderr, "%c", al[b][a]);
\r
8392 HERE ("End of Display");
\r
// aln2count_mat: counts, for every column of A, how often each letter occurs.
// Table layout as written by the loops below (rows are symbols, columns are alignment columns):
//   freq_mat[x-'a'][col]      occurrences of lower-cased letter x
//   freq_mat[alp_size][col]   number of gaps
//   freq_mat[alp_size+1][col] number of non-gap characters
// NOTE(review): alp_size is sizeof (AA_ALPHABET), which depends on how that macro is defined
// (a string literal would include its terminating NUL, a pointer would give the pointer size).
// aln2random_seq reads row 26 as the gap count, so alp_size is presumably expected to be 26 -
// TODO confirm against the definition of AA_ALPHABET.
// NOTE(review): x-'a' is used as a row index without a range check; a non-gap character that is
// not a letter would index outside rows 0..25.
8397 int** aln2count_mat ( Alignment *A)
\r
8399 function documentation: start
\r
8401 int output_freq_mat ( char *outfile, Aligmnent *A)
\r
8403 This function counts the number of residues in each column of an alignment (Prot/NA)
\r
8404 It outputs these values in the following format
\r
8406 This format can be piped into:
\r
8407 The routine used for computing the p-value gmat-inf-gc-v2c
\r
8409 function documentation: end
\r
8416 alp_size=sizeof (AA_ALPHABET);
\r
8417 freq_mat=declare_int (alp_size+2, A->len_aln);
\r
8420 for ( a=0; a<A->len_aln; a++)
\r
8422 for ( b=0; b< A->nseq; b++)
\r
8424 if ( is_gap ( A->seq_al[b][a]))freq_mat[alp_size][a]++;
\r
8427 x=tolower(A->seq_al[b][a]);
\r
8428 freq_mat[x-'a'][a]++;
\r
8429 freq_mat[alp_size+1][a]++;
\r
// aln2random_seq: generates one random sequence of length A->len_aln from the column
// frequencies of A, in two stages that are both visible below:
//   1. per column, the letter counts from aln2count_mat are rescaled, mixed with three noise
//      terms (weights pn1, pn2, pn3 given in percent) and a residue is drawn with rand ();
//   2. gaps are placed by a two-state dynamic program (states IGAP and IAA) whose switching
//      cost is derived from gn (GAP_TRANSITION=AL-gn), followed by a traceback.
// Two count tables are requested from aln2count_mat: count keeps the raw counts, freq is
// modified in place. Row 26 of both is read as the per-column gap count.
// NOTE(review): many lines of the original function (declarations, assignments of T, tot_t2,
// tot_t3, gf, nstate, the residue draw and the return) are missing from this listing; the notes
// below only concern lines that are visible.
8437 char *aln2random_seq (Alignment *A, int pn1, int pn2, int pn3, int gn)
\r
8443 Given the frequencies in A ( read as total counts of each Residue in
\r
8444 freq[A->nseq][A->len_aln], and pn1, pn2 and pn3:
\r
8446 1-Generate a new amino-acid at each position
\r
8447 2-Insert Gaps, using a HMM.
\r
8450 pn3=Weight of the noise induced with sub mat.
\r
8452 pn1=% noise type 1 ( Varies with entropi)
\r
8453 n1=Ratio noise type 1
\r
8456 t1=Noise 1 expressed in Nseq
\r
8458 ncat=number of non 0 cat for a given position
\r
8459 ICi initial count for residue i
\r
8462 t1=T*n1*(1-1/ncat);
\r
8465 Ci= ICi*(T-(t1+t2))/T +(t1)/al+(t2)/al
\r
8471 float T, tot_t1, tot_t2,tot_t3, n1, n2, n3;
\r
8475 double *init_freq;
\r
8476 double *blur_freq;
\r
8477 double *t1, *t2,*t3;
\r
8481 /*Viterbi Parameters */
\r
8484 int AL=0; /*Allowed Transition*/
\r
8485 int F=-100000; /*Forbiden Transition*/
\r
8487 int GAP_TRANSITION;
\r
8488 int IGAP=0, IAA=1;
\r
8490 int state,best_state=0, score, best_score=0;
\r
8496 int **transitions;
\r
8500 seq=vcalloc ( A->len_aln+1, sizeof (char));
\r
8501 count=aln2count_mat(A);
\r
8502 freq=aln2count_mat(A);
\r
8506 n1=(float)pn1/100;
\r
8507 n2=(float)pn2/100;
\r
8508 n3=(float)pn3/100;
\r
8510 for ( a=0; a< A->len_aln; a++)
\r
// Rescale letter counts to a total of T over the non-gap rows, and the gap count to T over all rows.
// NOTE(review): the divisor A->nseq-freq[26][a] is zero for a column made only of gaps.
8512 for ( b=0; b<26; b++)
\r
8513 freq[b][a]=freq[b][a]*((T)/(A->nseq-freq[26][a]));
\r
8514 freq[26][a]= (freq[26][a]*T)/A->nseq;
\r
8518 init_freq=vcalloc ( 26, sizeof (double));
\r
8519 blur_freq=vcalloc ( 26, sizeof (double));
\r
8521 tot_t1=tot_t2=tot_t3=0;
\r
8523 t1=vcalloc ( 27, sizeof (double));
\r
8524 t2=vcalloc ( 27, sizeof (double));
\r
8525 t3=vcalloc ( 27, sizeof (double));
\r
8526 for (a=0; a< A->len_aln; a++)
\r
8529 /*Compute Frequencies*/
\r
8530 for (tot=0, b=0; b<26; b++)
\r
8532 if ( is_aa(b+'A'))
\r
8534 init_freq[b]=freq[b][a];
\r
8538 /*Count the number of different amino acids*/
\r
8539 for ( ncat=0, b=0; b<=26; b++)
\r
8541 ncat+=(freq[b][a]!=0)?1:0;
\r
8543 /*Blurr the distribution using */
\r
// NOTE(review): blur_freq was allocated above and is overwritten here with the pointer returned
// by compute_matrix_p; unless that helper reuses the same buffer the earlier allocation is lost.
8544 blur_freq=compute_matrix_p (init_freq,tot);
\r
8547 /*compute noise 1: biased with blurred content * enthropy--> keeps prosite motifs*/
\r
// NOTE(review): if ncat is an integer (its declaration is not visible here) then 1/ncat is an
// integer division: it is 0 for ncat>1, 1 for ncat==1 and undefined for ncat==0, so the factor
// (1-1/ncat) would not follow the fractional formula quoted in the header text - TODO confirm.
8548 tot_t1=T*n1*(1-1/ncat);
\r
8549 for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t1[b]=blur_freq[b]*(1-1/ncat)*n1;}
\r
8551 /*Compute noise 2: completely random*/
\r
8553 for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t2[b]=tot_t2/21;}
\r
8555 /*compute noise 3: biased with the sole content(pam250mt)*/
\r
8557 for ( b=0; b<26; b++)if ( is_aa(b+'A')){t3[b]=blur_freq[b]*n3;}
\r
8559 for ( b=0; b<26; b++)
\r
8561 if ( is_aa('A'+b))
\r
8562 freq[b][a]=freq[b][a]*(T-(tot_t1+tot_t2+(tot_t3)))/T+t1[b]+t2[b]+t3[b];
\r
8565 /*end of the loop that mutates position a*/
\r
8568 vfree (blur_freq);
\r
8569 vfree (init_freq);
\r
8572 /*1-Generate the amino acids of the new sequence new*/
\r
8577 for ( a=0; a< A->len_aln; a++)
\r
// T is reused here as the sum of the mixed frequencies of the column.
// NOTE(review): rand ()%((int)T) divides by zero when that sum truncates to 0.
8580 for (T=0,b=0; b<26; b++)T+=freq[b][a];
\r
8581 x=rand ()%((int)T);
\r
8582 for (c=0,b=0; b<26; b++)
\r
8592 if ( c!=-1)seq[a]='-';
\r
8597 /*2 Generate the gaps in the new sequence*/
\r
8605 transitions=declare_int ( nstate, nstate);
\r
8606 score_tab=declare_int ( A->len_aln+2, nstate );
\r
8607 state_tab=declare_int ( A->len_aln+2, nstate );
\r
8611 for (a=0; a<nstate;a++)
\r
8612 for (b=0; b<nstate;b++)
\r
8613 {transitions[a][b]=F;}
\r
8615 GAP_TRANSITION=AL-gn;
\r
8617 transitions[IGAP ][IGAP ]=AL;
\r
8618 transitions[IAA][IAA]=AL;
\r
8619 transitions[IAA ][IGAP]=GAP_TRANSITION;
\r
8620 transitions[IGAP][IAA ]=GAP_TRANSITION;
\r
8623 for ( p=1; p<=A->len_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} }
\r
8625 for (p=1; p<= A->len_aln; p++)
\r
8627 for (max=0,a=0; a<26; a++)max=MAX(max, freq[a][p-1]);
\r
8628 max=(max*(A->nseq-count[26][p-1]))/A->nseq;
\r
8630 for (state=0; state< nstate; state++)
\r
8635 if ( state==IGAP) e=gf-50;
\r
8636 else if ( state==IAA ) e=max-50;
\r
8637 for (p_state=0; p_state<nstate; p_state++)
\r
// The test p_state==0 restarts the running best for every target state.
8639 score=(score_tab[p-1][p_state]==F)?F:(e+transitions[p_state][state]+score_tab[p-1][p_state]);
\r
8640 if(p_state==0 || score>best_score){ best_score=score;best_state=p_state;}
\r
8642 score_tab[p][state]=best_score;
\r
8643 state_tab[p][state]=best_state;
\r
8647 for (state=0; state<nstate; state++)
\r
8649 if (state==0 || score_tab[p-1][state]>best_score){best_score=score_tab[p-1][state]; best_state=state;}
\r
8652 for (p=A->len_aln; p>0;)
\r
8654 if ( best_state==IGAP)
\r
8658 else if ( best_state==IAA)
\r
// The assignment below stores a value onto itself: the drawn residue is simply kept.
8660 seq[p-1]=seq[p-1];
\r
8662 best_state=state_tab[p][best_state];
\r
8667 free_int (freq, -1);
\r
8671 /********************************************************************/
\r
8673 /* Weighting functions */
\r
8677 /********************************************************************/
\r
// master_trimseq: parses the trimming options in `mode`, delegates the selection of sequences
// to seq2subseq3 and optionally prints a similarity table and a statistics summary on stderr.
// Option tokens are separated by the character _ and recognised as follows (see the chain below):
//   seq / aln   use pairwise sequences or the multiple alignment
//   s, t        print statistics / print the table of pairwise similarities
//   U<n>, L<n>  upper and lower bound for pairwise similarity
//   n<n>, N<n>  number of sequences to keep, absolute or as a percentage (stored negated)
//   B, T        trim direction BOTTOM or TOP
//   W<s>, M<s>  weight mode and clustering method names
//   K...        list of master sequences separated by the character : (marked 2 in seq_list)
// When no bound and no count is given, upper_sim defaults to 50.
// Bounds above 100 are interpreted as hundredths (value/100) when reported.
// FIX: the lower bound shown in the statistics was scaled on the test upper_sim>100 instead of
// lower_sim>100, so a lower bound above 100 was printed unscaled (and a lower bound below 100
// was divided by 100 whenever the upper bound exceeded 100). The test now uses lower_sim, in
// line with trimseq and with the conversion done inside seq2subseq3.
// NOTE(review): this listing omits many original lines (braces, declarations, the reset of the
// strtok argument, returns); only the visible lines are reproduced here.
// NOTE(review): the nested strtok (NULL, ":") in the K branch shares the tokenizer state of the
// outer loop and therefore consumes the rest of mode.
8678 Alignment * master_trimseq( Alignment *A, Sequence *S,char *mode)
\r
8683 int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0;
\r
8684 float f_upper_sim, f_lower_sim;
\r
8685 char weight_mode[1000];
\r
8686 char method[1000];
\r
8688 int trim_direction=TOP;
\r
8689 float **sim_weight;
\r
8698 (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
\r
8703 seq_list=vcalloc ( S->nseq, sizeof (int));
\r
8704 for ( a=0; a< A->nseq; a++)
\r
8710 use_aln=aln_is_aligned(A);
\r
8712 if ( mode[0]=='\0')
\r
8718 sprintf (weight_mode, "pwsim");
\r
8719 sprintf ( method, "clustering2");
\r
8724 upper_sim=lower_sim=min_nseq;
\r
8725 sprintf (weight_mode, "pwsim");
\r
8726 sprintf ( method, "clustering2");
\r
8730 U or % (deprecated) Upper bound for pairwise similarity
\r
8731 L or m (depercated) Lower bound for pairwise similarity
\r
8732 n max number of sequences
\r
8733 N max number of sequences as a fraction of thet total
\r
8734 S print Statistics
\r
8735 T print Table of distances
\r
8740 while ( (p=strtok(mode, "_")))
\r
8743 if (strm (p, "seq"))use_aln=0;
\r
8744 else if ( strm(p,"aln"))use_aln=1;
\r
8745 else if (p[0]=='s')statistics=1;
\r
8746 else if (p[0]=='t')table=1;
\r
8747 else if (p[0]=='U')upper_sim=atoi(p+1);
\r
8748 else if (p[0]=='L')lower_sim=atoi(p+1);
\r
8749 else if (p[0]=='n')min_nseq=atoi(p+1);
\r
8750 else if (p[0]=='N')min_nseq=atoi(p+1)*-1;
\r
8751 else if (p[0]=='B')trim_direction=BOTTOM;
\r
8752 else if (p[0]=='T')trim_direction=TOP;
\r
8753 else if (p[0]=='W')sprintf (weight_mode, "%s", p+1);
\r
8754 else if (p[0]=='M')sprintf (method, "%s", p+1);
\r
8755 else if (p[0]=='K')
\r
8758 while ((p=strtok(NULL, ":")))
\r
8763 seq_list[atoi(p+1)-1]=2;
\r
8765 else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
\r
8774 if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50;
\r
8780 fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM);
\r
8784 else if ( min_nseq> S->nseq)
\r
8788 else if ( min_nseq<0)
\r
8790 if ( min_nseq<-100)
\r
8792 add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n");
\r
8796 min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1);
\r
8800 NA=seq2subseq3 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list );
\r
8804 fprintf ( stderr, "\nSIMILARITY MATRIX\n");
\r
8805 for ( a=0; a< A->nseq-1; a++)
\r
8806 for ( b=a+1; b< A->nseq; b++)
\r
8808 fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]);
\r
8813 f_upper_sim=(upper_sim>100)?((float)upper_sim/(float)100):upper_sim;
\r
8814 f_lower_sim=(lower_sim>100)?((float)lower_sim/(float)100):lower_sim;
\r
8816 fprintf ( stderr, "\nTRIM Informations:\n");
\r
8817 fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
\r
8818 fprintf ( stderr, "\tcluster_mode..: %s\n" ,method);
\r
8819 fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode);
\r
8820 fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim);
\r
8821 fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim);
\r
8822 fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq);
\r
8823 fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq);
\r
8824 fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top");
\r
// sim_filter: keeps the sequences of A whose similarity and coverage relative to a reference
// sequence s fall inside the bounds given in in_mode, and returns the sub-alignment built by
// extract_sub_aln (the return line itself is not visible in this listing).
// Bounds are read from the option string, wrapped as "_<in_mode>_", through strget_param:
//   _I max similarity (default 100)   _i min similarity (default 0)
//   _C max coverage   (default 100)   _c min coverage   (default 0)
// The reference is looked up by name with name_is_in_list; an unknown name is fatal.
// With "_seq_" in the mode, similarity and coverage against the reference are computed pairwise
// by idscore_pairseq with the "blosum62mt" matrix on private copies of the rows; otherwise the
// similarity matrix comes from aln2sim_mat (A, "idmat").
// A sequence outside any bound gets keep[a]=-1; the reference itself is skipped.
// NOTE(review): cov is only visibly assigned in the "_seq_" branch but is read and freed in all
// cases; confirm it is set (or NULL-initialised and guarded) on the lines missing here.
// NOTE(review): the name seq is the char * parameter, yet it is assigned from declare_char and
// indexed as an array of strings; presumably a local of the same name shadows it - TODO confirm.
8830 Alignment *sim_filter (Alignment *A, char *in_mode, char *seq)
\r
8835 int maxnseq, nseq_ratio, nc;
\r
8841 int direction=1;//remove the higher than
\r
8842 int coverage=0; //remove based on coverage
\r
8843 static char *field;
\r
8844 int maxsim, minsim, maxcov, mincov;
\r
8846 if ( !field) field=vcalloc (1000, sizeof (char));
\r
8848 mode=vcalloc ( strlen (in_mode)+10, sizeof (char));
\r
8849 sprintf ( mode, "_%s_", in_mode);
\r
8851 strget_param ( mode, "_I", "100", "%d", &maxsim);
\r
8852 strget_param ( mode, "_i", "0", "%d", &minsim);
\r
8853 strget_param ( mode, "_C", "100", "%d", &maxcov);
\r
8854 strget_param ( mode, "_c", "0", "%d", &mincov);
\r
8860 keep=vcalloc ( A->nseq, sizeof (int));
\r
8861 list=vcalloc ( A->nseq, sizeof (int));
\r
8869 else s=name_is_in_list (seq, A->name, A->nseq, 100);
\r
8873 if ( s==-1)printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is not a valid sequence", seq);
\r
8878 //get the distances
\r
8879 if ( strstr (mode, "_seq_"))
\r
8884 M=read_matrice ("blosum62mt");
\r
8885 seq=declare_char (A->nseq, A->len_aln+1);
\r
8886 for (a=0; a<A->nseq; a++)
\r
8888 sprintf ( seq[a], "%s", A->seq_al[a]);
\r
8892 sim=declare_int (A->nseq, A->nseq);
\r
8893 cov=declare_int (A->nseq, A->nseq);
\r
8895 for (a=0; a<A->nseq; a++)
\r
8899 sim[s][a]=sim[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"sim");
\r
8900 cov[s][a]=cov[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"cov");
\r
8904 free_char (seq, -1);
\r
8909 sim=aln2sim_mat (A, "idmat");
\r
8913 for (a=0; a< A->nseq; a++)
\r
8915 if (a==s)continue;
\r
8918 if ( sim[s][a]>maxsim || sim[s][a]<minsim|| cov[s][a]<mincov||cov[s][a]>maxcov)keep[a]=-1;
\r
8923 for ( n=0, a=0; a< A->nseq; a++)
\r
8931 R=extract_sub_aln (A, n, list);
\r
8932 free_int (sim, -1); free_int (cov, -1);vfree (list);
\r
// Forward declaration of the helper that picks the next sequence to drop. It is declared static
// here; the later definition omits the keyword and inherits internal linkage from this line.
8938 static int find_worst_seq ( int **sim, int n, int *keep, int max, int direction);
\r
// simple_trimseq: removes sequences from A until the constraints encoded in in_mode are met and
// returns the remaining rows through extract_sub_aln (the return line is not visible here).
// Options are read from "_<in_mode>_" with strget_param and strstr:
//   _%<n>  similarity threshold (maxsim)      _n<n>  maximum number of sequences
//   _N<n>  maximum number as a percentage     _F<n>  require a residue in the first and last n columns
//   _O<n>  outlayer threshold on average similarity
//   _f<s>  field matched against seq_list: NAME (default), COMMENT or SEQ
//   _min   reverse the direction (direction=-1)   _cov  work on coverage instead of similarity
//   _T     reorder A along a neighbour-joining tree before trimming
// Sequences are protected (keep[a]=1) when they also appear in K or when the selected field
// matches seq_list through perl_strstr. The main loop repeatedly asks find_worst_seq for a
// candidate and blanks its row and column in sim with -1.
// NOTE(review): many original lines are missing from this listing, including the bookkeeping of
// new_nseq and keep inside the loops; only visible lines are described.
8939 Alignment *simple_trimseq (Alignment *A, Alignment *K, char *in_mode, char *seq_list)
\r
8944 int maxnseq, maxsim, nseq_ratio, nc;
\r
8950 int direction=1;//remove the higher than
\r
8951 int coverage=0; //remove based on coverage
\r
8952 static char *field;
\r
8955 if ( !field) field=vcalloc (1000, sizeof (char));
\r
8957 mode=vcalloc ( strlen (in_mode)+10, sizeof (char));
\r
8958 sprintf ( mode, "_%s_", in_mode);
\r
// NOTE(review): the key below is written "_%%"; whether strget_param treats it as a format (one
// percent sign) or as a literal two-character sequence is not visible here - TODO confirm.
8960 strget_param ( mode, "_%%", "0", "%d", &maxsim);
\r
8961 strget_param ( mode, "_n", "0", "%d", &maxnseq);
\r
8962 strget_param ( mode, "_N", "0", "%d", &nseq_ratio);
\r
8963 strget_param ( mode, "_F", "0", "%d", &nc);
\r
8964 strget_param ( mode, "_O", "0", "%d", &outlayers);
\r
8965 strget_param ( mode, "_f", "NAME", "%s", field);
\r
8967 if ( strstr (mode, "_min"))direction=-1;
\r
8970 if ( strstr (mode, "_cov"))coverage=1;
\r
8976 maxnseq=(A->nseq*nseq_ratio)/100;
\r
8979 else if ( maxnseq)
\r
8983 else if ( !maxsim)
\r
8989 keep=vcalloc ( A->nseq, sizeof (int));
\r
8990 list=vcalloc ( A->nseq, sizeof (int));
\r
8995 /*Remove Sequences that do not have at least one residue in the first and last nc columns*/
\r
// NOTE(review): full_n is declared without an initialiser and used as a running index below;
// confirm it is set to 0 on one of the lines missing from this listing.
8998 int left, right, full_n,x, y;
\r
9003 full_list=vcalloc ( A->nseq, sizeof (int));
\r
9005 for (x=0; x< A->nseq; x++)
\r
9007 for ( left=0,y=0; y<MIN(A->len_aln,nc); y++)
\r
9008 if (!is_gap(A->seq_al[x][y]))left=1;
\r
9010 for ( right=0,y=MAX(0,(A->len_aln-nc)); y<A->len_aln; y++)
\r
9011 if (!is_gap(A->seq_al[x][y]))right=1;
\r
9013 if ( left && right)full_list[full_n++]=x;
\r
9015 F=extract_sub_aln (A, full_n, full_list);
\r
9017 vfree (full_list);
\r
9021 /*Reorder the sequences according to the tree order: hopefully better phylogenetic coverage after trim*/
\r
9022 if (strstr (mode, "_T"))
\r
9027 sim=sim_array2dist_array ( sim, MAXID);
\r
9028 T=int_dist2nj_tree (sim, A->name, A->nseq, NULL);
\r
9029 O=tree2seq (T[3][0], NULL);
\r
9030 A=reorder_aln (A, O->name, O->nseq);
\r
9032 free_int (sim, -1);
\r
9033 free_sequence (O, -1);
\r
// NOTE(review): the value returned by aln2seq is passed straight into seq2comp_mat and not kept,
// so it cannot be released afterwards if aln2seq allocates - TODO confirm ownership.
9038 if ( strstr (mode, "seq_"))sim=seq2comp_mat (aln2seq(A), "blosum62mt", "sim");
\r
9039 else sim=aln2sim_mat (A, "idmat");
\r
9044 if ( strstr (mode, "seq_"))sim=seq2comp_mat (aln2seq(A), "blosum62mt", "cov");
\r
9045 else sim=aln2cov (A);
\r
9050 if ( K && K->nseq>0)
\r
9052 for ( a=0; a< K->nseq; a++)
\r
9053 if ( (k=name_is_in_list (K->name[a], A->name, A->nseq, MAXNAMES+1))!=-1)
\r
9061 for ( a=0; a< A->nseq; a++)
\r
// NOTE(review): the COMMENT branch checks A->seq_comment before use, the SEQ branch dereferences
// A->S without an equivalent check.
9063 if (strstr (field, "NAME") && perl_strstr (A->name[a], seq_list)){keep[a]=1;}
\r
9064 else if (strstr (field, "COMMENT") && A->seq_comment && perl_strstr(A->seq_comment[a], seq_list)){keep[a]=1;}
\r
9065 else if (strstr (field, "SEQ") && perl_strstr((A->S)->seq[a], seq_list)){keep[a]=1;}
\r
9070 for ( a=0; a< A->nseq; a++)
\r
9071 if ( keep[a]) fprintf ( stderr, "\nFORCED KEEP %s", A->name[a]);
\r
9076 while ( (s=find_worst_seq (sim, A->nseq, keep, maxsim, direction))!=-1 && new_nseq>maxnseq)
\r
9078 for ( a=0; a< A->nseq; a++)sim[a][s]=sim[s][a]=-1;
\r
9083 /*Trim Outlayers*/
\r
// tot_avg[a] becomes the integer average similarity of a to the other remaining sequences,
// or -1 when a is already removed or has no partner.
9087 tot_avg=vcalloc ( A->nseq, sizeof (int));
\r
9089 for (a=0; a<A->nseq; a++)
\r
9091 if ( keep[a]==-1)tot_avg[a]=-1;
\r
9094 for (nn=0, b=0; b< A->nseq; b++)
\r
9096 if (a==b || keep[b]==-1)continue;
\r
9099 tot_avg[a]+=sim[a][b];
\r
9103 tot_avg[a]=(nn==0)?-1:(tot_avg[a])/nn;
\r
9106 for ( a=0; a<A->nseq; a++)
\r
9108 if (tot_avg[a]!=-1 && tot_avg[a]<outlayers)
\r
9110 fprintf ( stderr, "\nREMOVED OUTLAYER: %3d %% avg similarity with remaining sequences [Seq %s]", tot_avg[a],A->name[a]);
\r
9117 for ( n=0, a=0; a< A->nseq; a++)
\r
9125 R=extract_sub_aln (A, n, list);
\r
9126 free_int (sim, -1); vfree (list);
\r
// find_worst_seq: picks the sequence that should be removed next, or -1 when there is none.
// sc is an n x 2 table: column 0 holds the sequence index, column 1 accumulates a penalty.
// For every pair (a, b) the value si is sim[a][b], mirrored to 100-sim[a][b] when direction is -1;
// negative entries (rows already blanked by the caller) are taken as they are. si is added to the
// penalty of each member of the pair that is not protected (keep != 1).
// After sort_int_inv on column 1 the first row is the candidate; it is accepted only when its
// penalty is positive, and rejected (-1) when the candidate is flagged in keep.
// NOTE(review): max is mirrored for direction -1 but the comparison that uses it (original lines
// 9148-9149) is missing from this listing.
// NOTE(review): keep is dereferenced unconditionally in the accumulation but tested against NULL
// in the final check; one of the two is presumably unnecessary.
9131 int find_worst_seq ( int **sim, int n, int *keep,int max,int direction)
\r
9137 sc=declare_int (n, 2);
\r
9138 if (direction==-1)max=100-max;
\r
9140 for ( a=0; a< n; a++) sc[a][0]=a;
\r
9141 for ( a=0; a< n-1; a++)
\r
9143 for ( b=a+1; b<n; b++)
\r
9146 if (sim[a][b]>=0)si=(direction==-1)?100-sim[a][b]:sim[a][b];
\r
9147 else si=sim[a][b];
\r
9150 if ( keep[a]!=1)sc[a][1]+=si;
\r
9151 if ( keep[b]!=1)sc[b][1]+=si;
\r
9156 sort_int_inv ( sc, 2, 1, 0, n-1);
\r
9157 if ( sc[0][1]>0)r=sc[0][0];
\r
9160 free_int (sc, -1);
\r
9161 if (r!=-1 && keep && keep[r])return -1;
\r
// find_worst_seq_old: earlier variant of find_worst_seq, kept alongside it.
// Penalties are accumulated in sc[.][1] only for pairs beyond the threshold max:
//   direction == 1  : pairs with sim[a][b] > max
//   direction == -1 : pairs with 0 <= sim[a][b] < max
// and only for members that are not protected (keep != 1). The table is then sorted with
// sort_int_inv on the penalty column and the first row becomes the candidate, accepted when
// its penalty is > 0 (direction 1) or >= 0 (direction -1). A candidate flagged in keep yields -1.
// NOTE(review): both directions call sort_int_inv with identical arguments, so the -1 direction
// also selects the largest penalty even though its comment says "remove min" - TODO confirm intent.
// NOTE(review): the HERE call prints a debug trace each time the -1 branch is reached.
9165 int find_worst_seq_old ( int **sim, int n, int *keep,int max,int direction)
\r
9170 sc=declare_int (n, 2);
\r
9172 for ( a=0; a< n; a++) sc[a][0]=a;
\r
9173 for ( a=0; a< n-1; a++)
\r
9175 for ( b=a+1; b<n; b++)
\r
9177 if ( direction==1)
\r
9179 if ( sim[a][b]>max)
\r
9181 if ( keep[a]!=1)sc[a][1]+=sim[a][b];
\r
9182 if ( keep[b]!=1)sc[b][1]+=sim[a][b];
\r
9185 else if ( direction == -1)
\r
9187 if ( sim[a][b]<max && sim[a][b]>=0)
\r
9189 if ( keep[a]!=1)sc[a][1]+=sim[a][b];
\r
9190 if ( keep[b]!=1)sc[b][1]+=sim[a][b];
\r
9196 if ( direction ==1) //remove max
\r
9198 sort_int_inv ( sc, 2, 1, 0, n-1);
\r
9199 if ( sc[0][1]>0)r=sc[0][0];
\r
9203 else if ( direction ==-1)//remove min
\r
9205 sort_int_inv ( sc, 2, 1, 0, n-1);
\r
9206 if ( sc[0][1]>=0)r=sc[0][0];
\r
9208 HERE ("** %d %d\n", r,sc[0][1]);
\r
9210 free_int (sc, -1);
\r
9211 if (r!=-1 && keep && keep[r])return -1;
\r
// trimseq: parses the trimming options in `mode`, delegates the selection to seq2subseq2,
// annotates the kept sequences through seq_name2removed_seq_name and optionally prints a
// similarity table, the list of closest removed neighbours and a statistics summary on stderr.
// Option tokens are separated by the character _ and recognised as follows (see the chain below):
//   seq / aln   use pairwise sequences or the multiple alignment
//   s, t, p     print statistics / the similarity table / names with their removed neighbours
//   U<n>, L<n>  upper and lower bound for pairwise similarity
//   n<n>, N<n>  number of sequences to keep, absolute or as a percentage (stored negated)
//   B, T        trim direction BOTTOM or TOP
//   W<s>, M<s>  weight mode (default "pwsim_fragment") and method (default "clustering2")
//   K...        names of sequences to protect, separated by the character :
// When no bound and no count is given, upper_sim defaults to 50. Bounds above 100 are reported
// divided by 100.
// NOTE(review): this listing omits many original lines (braces, declarations, the reset of the
// strtok argument, returns); only the visible lines are described.
// NOTE(review): the nested strtok (NULL, ":") in the K branch shares the tokenizer state of the
// outer loop and therefore consumes the rest of mode.
// NOTE(review): seq_list is sized with S->nseq but the loop that follows runs over A->nseq.
9216 Alignment * trimseq( Alignment *A, Sequence *S,char *mode)
\r
9221 int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0;
\r
9222 char weight_mode[1000];
\r
9223 char method[1000];
\r
9225 int trim_direction=TOP;
\r
9226 float **sim_weight;
\r
9230 float f_lower_sim, f_upper_sim;
\r
9236 (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
\r
9241 seq_list=vcalloc ( S->nseq, sizeof (int));
\r
9242 for ( a=0; a< A->nseq; a++)
\r
9248 use_aln=aln_is_aligned(A);
\r
9251 if ( mode[0]=='\0')
\r
9257 sprintf (weight_mode, "pwsim_fragment");
\r
9258 sprintf ( method, "clustering2");
\r
9263 upper_sim=lower_sim=min_nseq;
\r
9264 sprintf (weight_mode, "pwsim_fragment");
\r
9265 sprintf ( method, "clustering2");
\r
9269 U or % (deprecated) Upper bound for pairwise similarity
\r
9270 L or m (depercated) Lower bound for pairwise similarity
\r
9271 n max number of sequences
\r
9272 N max number of sequences as a fraction of thet total
\r
9273 S print Statistics
\r
9274 T print Table of distances
\r
9279 while ( (p=strtok(mode, "_")))
\r
9282 if (strm (p, "seq"))use_aln=0;
\r
9283 else if ( strm(p,"aln"))use_aln=1;
\r
9284 else if (p[0]=='s')statistics=1;
\r
9285 else if (p[0]=='t')table=1;
\r
9286 else if (p[0]=='p')print_name=1;
\r
9287 else if (p[0]=='U')upper_sim=atoi(p+1);
\r
9288 else if (p[0]=='L')lower_sim=atoi(p+1);
\r
9289 else if (p[0]=='n')min_nseq=atoi(p+1);
\r
9290 else if (p[0]=='N')min_nseq=atoi(p+1)*-1;
\r
9291 else if (p[0]=='B')trim_direction=BOTTOM;
\r
9292 else if (p[0]=='T')trim_direction=TOP;
\r
9293 else if (p[0]=='W')sprintf (weight_mode, "%s", p+1);
\r
9294 else if (p[0]=='M')sprintf (method, "%s", p+1);
\r
9295 else if (p[0]=='K')
\r
9298 while ((p=strtok(NULL, ":")))
\r
9301 if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
\r
9309 if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50;
\r
9315 fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM);
\r
9319 else if ( min_nseq> S->nseq)
\r
9323 else if ( min_nseq<0)
\r
9325 if ( min_nseq<-100)
\r
9327 add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n");
\r
// A negative min_nseq encodes a percentage of S->nseq; it is converted to an absolute count here.
9331 min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1);
\r
9335 NA=seq2subseq2 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list );
\r
9339 fprintf ( stderr, "\nSIMILARITY MATRIX\n");
\r
9340 for ( a=0; a< A->nseq-1; a++)
\r
9341 for ( b=a+1; b< A->nseq; b++)
\r
9343 fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]);
\r
9347 NA=seq_name2removed_seq_name(S, NA,sim_weight);
\r
9351 fprintf ( stderr, "\nList of sequences with their closest removed neighbors\n");
\r
9352 for ( a=0; a< NA->nseq; a++)fprintf ( stderr, "\n%s: %s\n", NA->name[a], NA->seq_comment[a]);
\r
9357 f_lower_sim=(lower_sim>100)?(float)lower_sim/100:lower_sim;
\r
9358 f_upper_sim=(upper_sim>100)?(float)upper_sim/100:upper_sim;
\r
9360 fprintf ( stderr, "\nTRIM seq Informations:\n");
\r
9361 fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
\r
9362 fprintf ( stderr, "\tcluster_mode..: %s\n" ,method);
\r
9363 fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode);
\r
9364 fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim);
\r
9365 fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim);
\r
9366 fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq);
\r
9367 fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq);
\r
9368 fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top");
\r
// tc_trimseq: trims a sequence set by running the external t_coffee program (path taken from
// get_string_variable ("t_coffee")) on a Clustal dump of A and reading the resulting trim file.
// If A is not aligned, an approximate alignment is first computed with compute_tcoffee_aln_quick.
// Options in `mode` are separated by the character # :
//   %<n> or S<n>  similarity threshold passed as -split_score_thres
//   n<n> or N<n>  target number of sequences
//   K<name>       sequence to protect (forwarded through -seq_to_keep)
// At least one of the threshold and the target count is required, otherwise the call is fatal.
// With a target count and more sequences than wanted, the threshold is searched by bisection
// between bottom and top, starting at 50, until the trimmed set has exactly nseq sequences or
// the midpoint stops moving. The result is returned as an alignment built by seq2aln.
// NOTE(review): keep_list (10000 bytes) and command (100000 bytes) are filled with sprintf and
// strcat without length checks; long or numerous protected names can overflow them.
// NOTE(review): file names and sequence names are inserted unquoted into a command line executed
// by my_system; names containing shell metacharacters would be interpreted by the shell.
// NOTE(review): the initial values of top and bottom, and the condition selecting the first
// branch, are on lines missing from this listing.
9374 Alignment * tc_trimseq( Alignment *A, Sequence *S,char *mode)
\r
9378 char *trimfile, *alnfile;
\r
9380 int a, nseq=0, sim=0;
\r
9382 char command[100000];
\r
9383 char keep_list[10000];
\r
9385 int top, bottom, middle, pmiddle;
\r
9387 keep_list[0]='\0';
\r
9389 seq_list=vcalloc ( S->nseq, sizeof (int));
\r
9390 for ( a=0; a< A->nseq; a++)
\r
9395 trimfile=vtmpnam (NULL);
\r
9396 alnfile=vtmpnam (NULL);
\r
9397 if ( !aln_is_aligned (A))
\r
9399 fprintf ( stderr, "\ntrimTC: computation of an Approximate MSA [");
\r
9400 A=compute_tcoffee_aln_quick ( A, NULL);
\r
9401 fprintf ( stderr, "DONE]\n");
\r
9403 output_clustal_aln (alnfile, A);
\r
9406 while ( (p=strtok(mode, "#")))
\r
9411 if (p[0]=='%' || p[0]=='S')sim=(p[1]=='%')?atoi(p+2):atoi(p+1);
\r
9412 else if (p[0]=='n' || p[0]=='N')nseq=atoi(p+1);
\r
9413 else if (p[0]=='K')
\r
9415 if ( (a=name_is_in_list (p+1, A->name, A->nseq, 100))!=-1)
\r
9422 if ( nseq ==0 && sim ==0)
\r
9424 fprintf ( stderr, "\nERROR: trimTC\nIndicate the maximum number of sequences Nnseq\nOR the maximum average similarity of the chosen sequencesSx\nEX: +trimTC S20 OR +trimTC N5");
\r
9425 fprintf ( stderr, "\n[FATAL:%s]", PROGRAM);
\r
9426 myexit (EXIT_FAILURE);
\r
9429 for ( a=0; a<A->nseq; a++)if (seq_list[a]==2){strcat ( keep_list, A->name[a]);strcat ( keep_list," ");}
\r
9433 sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,sim);
\r
9434 if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
\r
9435 my_system ( command);
\r
9436 TS=read_sequences (trimfile);
\r
9438 else if ( nseq && A->nseq>nseq)
\r
9442 pmiddle=0;middle=50;
\r
9444 sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0",get_string_variable("t_coffee"), alnfile, trimfile,middle);
\r
9445 if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
\r
9446 my_system ( command);
\r
9448 TS=read_sequences (trimfile);
\r
9449 fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t",middle, TS->nseq);
\r
// Too many sequences left: lower the upper end of the interval; too few: raise the lower end.
9451 if ( TS->nseq>nseq)top=middle;
\r
9452 else if ( TS->nseq<nseq)bottom=middle;
\r
9454 middle=(top-bottom)/2+bottom;
\r
9456 while (TS->nseq!=nseq && pmiddle!=middle)
\r
9459 sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,middle);
\r
9460 if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
\r
9461 my_system ( command);
\r
9462 free_sequence (TS, -1);
\r
9463 TS=read_sequences (trimfile);
\r
9464 fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t", middle, TS->nseq);
\r
9466 if ( TS->nseq>nseq)top=middle;
\r
9467 else if ( TS->nseq<nseq)bottom=middle;
\r
9469 middle=(top-bottom)/2+bottom;
\r
9476 NA=seq2aln (TS, NULL, 1);
\r
9477 vremove ( alnfile);
\r
9478 fprintf ( stderr, "\n");
\r
// seq2subseq3: master-sequence trimming. Every sequence flagged 2 in seq_list is a master;
// any ordinary sequence (flag 1) whose similarity to a master lies above upper_sim or below
// lower_sim is a removal candidate (the action taken is on lines missing from this listing).
// Similarity is 100 minus the weight returned by get_weight, which is also handed back to the
// caller through sim_weight[0]. Integer bounds above 100 are read as hundredths (value/100).
// The survivors are copied into fresh name and seq tables, turned into a Sequence with
// fill_sequence_struc and into an alignment with seq2aln; with use_aln the aligned rows of A
// are copied over after NA has been resized to A->len_aln+1 columns.
// NOTE(review): min_nseq and trim_direction are accepted but not referenced on any visible line.
// NOTE(review): rows of seq are S->max_len+1 bytes wide, yet with use_aln they receive
// A->seq_al[a], an aligned row that can be longer than the longest ungapped sequence; confirm
// that S->max_len covers A->len_aln in that mode.
9483 Alignment* seq2subseq3( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list)
\r
9489 char **seq, **name;
\r
9492 float sim, lower_sim, upper_sim;
\r
9494 lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim;
\r
9495 upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim;
\r
9497 sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode);
\r
9499 name=declare_char (S->nseq, (MAXNAMES+1));
\r
9500 seq= declare_char (S->nseq, S->max_len+1);
\r
9503 Remove every sequence that is more than upper_sim and less than lower_sim similar to the master sequences
\r
9504 the master sequence(s) are those for which seq_list[x]==2
\r
9513 for (a=0; a< A->nseq; a++)
\r
9515 if ( seq_list[a]==2)
\r
9518 for ( b=0; b< A->nseq;b++)
\r
9520 sim=100-sim_weight[0][a][b];
\r
9521 if (seq_list[b]==1 && (sim>upper_sim || sim<lower_sim))
\r
9531 /*Prepare the new sequence List*/
\r
9533 for (b=0, a=0; a<S->nseq; a++)
\r
9537 sprintf ( name[b], "%s", S->name[a]);
\r
9538 sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] );
\r
9544 NS=fill_sequence_struc (new_nseq,seq,name);
\r
9545 NA=seq2aln(NS,NULL,1);
\r
9547 if ( use_aln && A)
\r
9549 NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1);
\r
9551 for (b=0, a=0; a<S->nseq; a++)
\r
9555 sprintf ( NA->seq_al[b] , "%s",A->seq_al[a]);
\r
9560 NA->len_aln=A->len_aln;
\r
// seq2subseq2: iterative trimming driven by extreme_seq. Three passes are visible:
//   1. while lower_sim is set and the BOTTOM extreme is below lower_sim, drop that sequence;
//   2. while upper_sim is set and the TOP extreme is above upper_sim, drop that sequence;
//   3. while more than min_nseq sequences remain (and min_nseq > 0), drop the extreme in
//      trim_direction.
// A sequence is dropped by clearing its flag in seq_list, and only when the flag is 1; every pass
// also stops when extreme_seq reports no candidate (seq_index == -1) or when min_nseq is reached.
// Similarity is 100 minus the weight returned by get_weight, which is handed back to the caller
// through sim_weight[0]. Integer bounds above 100 are read as hundredths (value/100).
// The survivors are copied into fresh tables and converted with fill_sequence_struc and seq2aln;
// with use_aln the aligned rows of A are copied over after NA has been resized.
// NOTE(review): in passes 1 and 2 seq_index is tested after extreme_seq has been called in the
// same condition; the update of new_nseq is on lines missing from this listing.
// NOTE(review): rows of seq are S->max_len+1 bytes wide, yet with use_aln they receive
// A->seq_al[a], an aligned row that can be longer than the longest ungapped sequence; confirm
// that S->max_len covers A->len_aln in that mode.
9567 Alignment* seq2subseq2( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list)
\r
9573 char **seq, **name;
\r
9576 float lower_sim, upper_sim;
\r
9578 lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim;
\r
9579 upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim;
\r
9582 sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode);
\r
9584 name=declare_char (S->nseq, (MAXNAMES+1));
\r
9585 seq= declare_char (S->nseq, S->max_len+1);
\r
9588 1 REMOVE OUTLAYERS
\r
9589 2 REMOVE CLOSELY RELATED SEQUENCES
\r
9590 3 IF STILL TOO MANY SEQUENCES:
\r
9591 REMOVE THE MOST CLOSELY RELATED ONES
\r
9595 /*1 Remove outlayers*/
\r
9600 /*1 Remove outlayers*/
\r
9601 while ( lower_sim && (extreme_seq(BOTTOM,A,sim_weight[0],seq_list, &seq_index) <lower_sim) && ((new_nseq)>min_nseq) && seq_index!=-1)
\r
9604 if ( seq_list[seq_index]==1)
\r
9606 seq_list[seq_index]=0;
\r
9610 /*2 Remove close relative*/
\r
9613 while ( upper_sim && (extreme_seq(TOP, A,sim_weight[0],seq_list, &seq_index)>upper_sim) && ((new_nseq)>min_nseq)&& seq_index!=-1)
\r
9616 if ( seq_list[seq_index]==1)
\r
9618 seq_list[seq_index]=0;
\r
9624 /*Remove extra sequences*/
\r
9626 while ( min_nseq>0 && new_nseq>min_nseq && seq_index!=-1)
\r
9629 extreme_seq(trim_direction, A,sim_weight[0],seq_list, &seq_index);
\r
9631 if ( seq_index==-1)break;
\r
9632 if ( seq_list[seq_index]==1)
\r
9634 seq_list[seq_index]=0;
\r
9640 /*Prepare the new sequence List*/
\r
9642 for (b=0, a=0; a<S->nseq; a++)
\r
9646 sprintf ( name[b], "%s", S->name[a]);
\r
9647 sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] );
\r
9653 NS=fill_sequence_struc (new_nseq,seq,name);
\r
9654 NA=seq2aln(NS,NULL,1);
\r
9656 if ( use_aln && A)
\r
9658 NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1);
\r
9660 for (b=0, a=0; a<S->nseq; a++)
\r
9664 sprintf ( NA->seq_al[b],"%s",A->seq_al[a]);
\r
9669 NA->len_aln=A->len_aln;
\r
// extreme_seq: scans the sequences still flagged in seq_list and picks one
// according to direction (see the original comment below).
//   sim_weight  pairwise matrix; 100 minus its entry is used as the similarity
//   seq_list    only entries equal to 1 are candidates; entries equal to 0 are
//               also skipped as partners
//   seq_index   output: seq_index[0] receives the chosen index, or stays -1
// Returns bottom_sim for BOTTOM and top_sim for TOP.
// NOTE(review): the initial values of top_sim and bottom_sim, the update of
// best_sim, and any return for other direction values are not visible here.
9677 float extreme_seq (int direction, Alignment *A,float **sim_weight,int *seq_list, int *seq_index)
\r
9680 /*find the closest relative of each sequence
\r
9682 Direction= BOTTOM: the sequence whose closest relative is the most distant
\r
9683 Direction= TOP: the sequence whose closest relative is the closest
\r
9684 weight: different sequences=100
\r
9685 similar sequences =0
\r
9689 float top_sim,bottom_sim, best_sim, sim;
\r
9690 int top_seq, bottom_seq;
\r
// Every index starts at -1, which callers read as no candidate found.
9692 bottom_seq=top_seq=seq_index[0]=-1;
\r
9696 for (a=0; a< A->nseq; a++)
\r
9698 if (seq_list[a]!=1)continue;
\r
// best_sim restarts at 0 for each a; presumably it tracks the highest sim over b.
9700 for ( best_sim=0, b=0; b< A->nseq; b++)
\r
9702 if ( a==b || !seq_list[b])continue;
\r
9704 sim=100-sim_weight[a][b];
\r
9711 if ( best_sim>top_sim)
\r
9717 if ( best_sim<bottom_sim)
\r
9720 bottom_sim=best_sim;
\r
9724 if ( direction==BOTTOM ){seq_index[0]= bottom_seq; return bottom_sim;}
\r
9725 else if ( direction==TOP){seq_index[0]= top_seq; return top_sim;}
\r
// seq2subseq1: greedy selection of a subset of S. One sequence is chosen first,
// then sequences are added one at a time while the percent and max_nseq limits
// hold; the chosen set is turned into a Sequence (NS) and an Alignment (NA).
//   percent   upper bound on the set score named chosen; 0 disables the test
//   max_nseq  upper bound on the number of chosen sequences; 0 disables the test
//   ms        not used in the visible lines
// Progress and a summary are written to stderr.
// NOTE(review): this listing lacks brace-only and several declaration or update
// lines (a, b, c, d, best_seq, NS, NA and the increment of nchosen are not
// visible). Block structure is inferred.
9736 Alignment* seq2subseq1( Alignment *A, Sequence *S,int use_aln, int percent,int max_nseq, int ms,char *weight_mode)
\r
9738 float **pw_weight,**sim_weight, **seq_weight;
\r
9740 float sum, chosen,last_chosen, last_nchosen,nchosen;
\r
9741 int condition1, condition2;
\r
9744 char **name, **seq;
\r
9745 float score, best_score;
\r
9747 int *seq_list, *used_seq_list;
\r
9751 (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
\r
9754 sim_weight=get_weight ((use_aln)?A:NULL, S, weight_mode);
\r
9755 pw_weight=declare_float (S->nseq, S->nseq);
\r
9756 seq_weight=declare_float ( S->nseq, 2);
\r
// seq_weight[a][0] becomes 100 minus the mean of sim_weight[a][b] over b != a.
// NOTE(review): the mean divides by nseq-1, which is 0 for a single sequence.
9759 for (best_score=0,a=0; a<S->nseq; a++)
\r
9761 for ( b=0; b<S->nseq; b++)
\r
9763 if ( a==b)continue;
\r
9764 seq_weight[a][0]+=sim_weight[a][b];
\r
9766 seq_weight[a][0]=seq_weight[a][0]/(S->nseq-1);
\r
9767 score=seq_weight[a][0]=100-seq_weight[a][0];
\r
9769 if ( score>best_score)
\r
// pw_weight rescales each pairwise weight by the two per-sequence scores (both on a 0..100 scale).
9776 for (a=0; a<S->nseq; a++)
\r
9778 for ( b=0; b<S->nseq; b++)
\r
9780 if ( a==b)continue;
\r
9781 pw_weight[a][b]=sim_weight[a][b]*seq_weight[a][0]*seq_weight[b][0]/(100*100);
\r
9787 seq_list=vcalloc ( S->nseq, sizeof (int));
\r
9788 used_seq_list=vcalloc ( S->nseq, sizeof (int));
\r
9792 name=declare_char (S->nseq, (MAXNAMES+1));
\r
9793 seq= declare_char (S->nseq, S->max_len+1);
\r
9795 /*compute the normalization factor*/
\r
// For each d, score is the maximum of 100 - sim_weight[c][d] over c; presumably
// these maxima are accumulated into sum on a line not visible here.
9796 for (sum=0,d=0; d< S->nseq; d++)
\r
9798 for (score=0,c=0; c<S->nseq; c++)
\r
9801 score=MAX(score, 100-sim_weight[c][d]);
\r
9806 /*chose the first sequence */
\r
// The first pick maximises the row sum of 100 - sim_weight[a][b].
9807 for ( best_score=0,a=0; a< S->nseq; a++)
\r
9809 for (score=0, b=0; b< S->nseq; b++)
\r
9811 score+=100-sim_weight[a][b];
\r
9813 if ( score>best_score)
\r
9822 last_chosen=chosen=((best_score/S->nseq)*100)/sum;
\r
9823 nchosen=last_nchosen=1;
\r
9824 seq_list[0]=best_seq;
\r
9825 used_seq_list[best_seq]=1;
\r
// Slot 0 always receives the raw sequence from S, whatever use_aln is.
9827 sprintf ( name[0],"%s", S->name[seq_list[0]]);
\r
9828 sprintf ( seq[0],"%s", S->seq[seq_list[0]]);
\r
// Repeats the assignment already made above.
9829 nchosen=last_nchosen=1;
\r
9832 fprintf ( stderr, "\nTRIM:\n");
\r
9833 fprintf ( stderr, "\n1-Chosen Sequences\n");
\r
9834 /*Assemble the list of sequences*/
\r
9835 for (a=1; a< S->nseq; a++)
\r
9837 for (best_score=0,b=0; b< S->nseq; b++)
\r
// NOTE(review): the trailing semicolon makes this if an empty statement;
// presumably an else branch (not visible here) holds the scoring below.
9839 if (used_seq_list[b]);
\r
// score starts above pw_weight[seq_list[0]][b] and is lowered to the minimum
// pw_weight between b and every sequence already chosen.
9842 score=pw_weight[seq_list[0]][b]+1;
\r
9843 for (c=0; c<a; c++)
\r
9844 score=MIN(score,pw_weight[seq_list[c]][b]);
\r
9846 if ( score>=best_score)
\r
9854 seq_list[a]=best_seq;
\r
9855 used_seq_list[best_seq]=1;
\r
// Recomputes the set score for the a+1 chosen sequences; the self pair is skipped.
9859 for ( chosen=0,d=0; d< S->nseq; d++)
\r
9861 for (score=0, c=0; c<=a; c++)
\r
9863 if ( seq_list[c]!=d)
\r
9864 score=MAX(score, 100-sim_weight[seq_list[c]][d]);
\r
9870 chosen=((chosen/S->nseq)*100)/sum;
\r
// Both limits compare after truncation to int; a limit of 0 always passes.
9873 condition1= (int)chosen<=(int)percent || !percent;
\r
9874 condition2=(nchosen)<=max_nseq || !max_nseq;
\r
9876 if (condition1 && condition2)
\r
9878 fprintf ( stderr, "\tADD %s (set score: %.2f %%)\n", S->name[seq_list[a]], chosen);
\r
9879 sprintf ( name[a],"%s", S->name[seq_list[a]]);
\r
9880 sprintf ( seq[a],"%s", S->seq[seq_list[a]]);
\r
9887 last_chosen=chosen;
\r
9888 last_nchosen=nchosen;
\r
// last_nchosen is declared float and is passed here as the sequence count.
9891 NS=fill_sequence_struc (last_nchosen,seq,name);
\r
9892 NA=seq2aln(NS,NULL,1);
\r
9893 fprintf ( stderr, "\n2-Informations:\n");
\r
9894 fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
\r
9895 fprintf ( stderr, "\tweight_mode...: %s\n" ,weight_mode);
\r
9896 fprintf ( stderr, "\tpercent_weight: %.2f%% (max=%d%%)\n",last_chosen,percent);
\r
9897 fprintf ( stderr, "\tn_seq.........: %d\n" ,NS->nseq);
\r
9898 fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NS->nseq*100)/S->nseq);
\r
// get_weight: obtains sequence weights, either from seq2pwsim() or by running
// an external program and parsing the file it writes.
//   mode  NULL, empty or msa: program taken from the SEQ2MSA_WEIGHT environment
//         variable, falling back to the SEQ2MSA_WEIGHT macro;
//         pwsim or pwsim_fragment: the result of seq2pwsim() is returned directly;
//         anything else: mode is read as the name of an environment variable
//         holding the program; if unset, an error is printed to stderr.
// NOTE(review): this listing lacks brace-only and several declaration lines
// (seq_name, aln_name, weight, fp, c and the final return are not visible).
9902 float ** get_weight ( Alignment *A, Sequence *S, char *mode)
\r
9905 char *weight_name;
\r
9907 char command[LONG_STRING];
\r
9908 char program[LONG_STRING];
\r
9913 if ( !mode || !mode[0] || strm (mode, "msa"))
\r
// NOTE(review): sprintf copies an environment value into a fixed LONG_STRING
// buffer with no length check.
9915 if ( getenv ( "SEQ2MSA_WEIGHT")==NULL)sprintf (program, "%s",SEQ2MSA_WEIGHT);
\r
9916 else sprintf ( program, "%s", (getenv ( "SEQ2MSA_WEIGHT")));
\r
9918 else if ( strm(mode, "pwsim") ||strm(mode, "pwsim_fragment") )
\r
9920 return seq2pwsim (A, S, mode);
\r
9924 if (getenv (mode))sprintf ( program, "%s", (getenv (mode)));
\r
// NOTE(review): the message says FATAL but no exit is visible after it here.
9925 else fprintf ( stderr, "\nERROR: %s is not a valid mode for weight computation [FATAL:%s]", mode, PROGRAM);
\r
9929 seq_name=vtmpnam(NULL);
\r
9930 aln_name=vtmpnam(NULL);
\r
9931 weight_name=vtmpnam(NULL);
\r
9932 weight=declare_float (S->nseq+1, 2);
\r
// First variant: both an alignment file and a sequence file are written and
// passed to the program. Note that the clustal output goes to seq_name and the
// fasta output to aln_name. The condition choosing between the two variants is
// not visible here.
9938 output_clustal_aln (seq_name,A);
\r
9939 output_fasta_seq (aln_name,A);
\r
9940 sprintf ( command, "%s %s -i %s -w %s", program, seq_name, aln_name, weight_name);
\r
// Second variant: sequences only.
9945 output_fasta_seq (seq_name,A);
\r
9946 sprintf ( command, "%s %s -w %s", program, seq_name, weight_name);
\r
9950 my_system ( command);
\r
9952 fp=vfopen( weight_name, "r");
\r
// Skips everything up to the first dollar character.
// NOTE(review): if the file holds no dollar character, fgetc keeps returning
// EOF and this loop never ends.
9953 while ( (c=fgetc(fp))!='$');
\r
// Each record is a skipped token followed by one float stored in column 1;
// column 0 receives the record index.
// NOTE(review): the loop is not bounded by the S->nseq+1 rows allocated above.
9956 while ( (fscanf (fp, "%*s %f\n",&(weight[c][1])))==1)
\r
9957 {weight[c][0]=c;c++;}
\r
// seq2pwsim: fills a symmetric nseq x nseq matrix W with (1-d)*100 for every
// pair a <= b, where d is built from the count of identical non-gap columns.
// Two variants are visible: one aligns the two raw sequences with
// align_two_sequences(), the other reads rows a and b of A. Presumably the
// choice depends on whether A is NULL; that test is not visible here.
// NOTE(review): declarations, the normalisation of d by t, cleanup of B and
// the return of W are not visible in this listing.
9964 float **seq2pwsim ( Alignment *A, Sequence *S, char *mode)
\r
9970 W=declare_float (S->nseq, S->nseq);
\r
9974 for (a=0; a< S->nseq; a++)
\r
9975 for ( b=a; b<S->nseq; b++)
\r
9981 B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise");
\r
// d counts columns with identical non-gap residues; t counts columns where
// neither residue is a gap.
9982 for (t=0,d=0,c=0; c<B->len_aln; c++)
\r
9984 d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c]));
\r
9985 t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c]));
\r
// In pwsim_fragment mode the denominator becomes the full pairwise alignment length.
9987 t=(strm ( mode, "pwsim_fragment"))?B->len_aln:t;
\r
// Same counting on rows a and b of the supplied alignment.
9994 for (t=0,d=0,c=0; c<A->len_aln; c++)
\r
9996 d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c]));
\r
9997 t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c]));
\r
10003 W[a][b]=W[b][a]=(1-d)*100;
\r
// seq2pwsim_fragment: same pairwise counting as seq2pwsim. d is the number of
// identical non-gap columns divided by t, the number of columns where neither
// residue is a gap (divided by 1 when t is 0); W[a][b]=W[b][a]=(1-d)*100.
// No use of mode is visible in this listing.
// NOTE(review): declarations, the test choosing between the pairwise and the
// alignment variant, cleanup of B and the return of W are not visible here.
10011 float **seq2pwsim_fragment ( Alignment *A, Sequence *S, char *mode)
\r
10019 W=declare_float (S->nseq, S->nseq);
\r
10024 for (a=0; a< S->nseq; a++)
\r
10025 for ( b=a; b<S->nseq; b++)
\r
10031 B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise");
\r
10032 for (t=0,d=0,c=0; c<B->len_aln; c++)
\r
10034 d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c]));
\r
10035 t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c]));
\r
// Guarded division: a pair with no comparable column keeps d unchanged.
10038 d=d/((t==0)?1:t);
\r
10043 for (t=0,d=0,c=0; c<A->len_aln; c++)
\r
10045 d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c]));
\r
10046 t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c]));
\r
10048 d=d/((t==0)?1:t);
\r
10052 W[a][b]=W[b][a]=(1-d)*100;
\r
10060 /********************************************************************/
\r
10062 /* AMINO ACID FUNCTIONS */
\r
10066 /********************************************************************/
\r
10067 //Builds an extended alphabet from a string
\r
// string2alphabet: collects the distinct characters of string as one-character
// symbols, then for each length from 2 to depth appends the concatenations
// produced by generate_array_string_list(), and ends the list with a star.
//   falp_size  output: falp_size[0] is advanced for every symbol stored in falp
// NOTE(review): this listing lacks brace-only and several declaration lines
// (array, alp, falp, alp2, alp2_size, alp_size, buf, max_s and the return are
// not visible). Block structure is inferred.
10068 char** string2alphabet (char *string, int depth, int *falp_size)
\r
10071 int a, b,c, l, n;
\r
10083 l=strlen (string);
\r
// NOTE(review): array is allocated here and again below; unless a line not
// shown frees it in between, the first allocation leaks.
10084 array=vcalloc ( 256, sizeof (int));
\r
10089 falp=declare_char (l+1, 2);
\r
10091 alp=declare_char(l,2);
\r
10094 array=vcalloc ( 256, sizeof (int));
\r
// array marks the characters already seen so that each one is stored once.
// NOTE(review): the index is a plain char cast to int, which is negative for
// bytes above 127 where char is signed.
10095 for (a=0;a<l; a++)
\r
10097 if (!array[(int)string[a]])
\r
10099 array[(int)string[a]]=1;
\r
10100 sprintf (alp[alp_size++], "%c", string[a]);
\r
10101 sprintf (falp[falp_size[0]++], "%c", string[a]);
\r
10104 sprintf ( falp[falp_size[0]++], "*");
\r
// alp is freed here and once more at the end; presumably the two calls sit on
// different paths (for example an early return), which is not visible here.
10109 free_char (alp, -1);
\r
10112 alp2=vcalloc ( depth, sizeof (char**));
\r
10113 alp2_size=vcalloc (depth, sizeof (int));
\r
10115 for (a=0; a<depth; a++)
\r
10118 alp2_size[a]=alp_size;
\r
// One pass per word length a: every combination is concatenated into buf and
// stored as a new symbol.
10122 for (a=2; a<=depth; a++)
\r
10124 char ***result_array;
\r
10126 result_array=generate_array_string_list (a, alp2, alp2_size, &n, NULL, NO_OVERLAP);
\r
10128 falp=vrealloc (falp, sizeof (char**)*max_s);
\r
10129 for (b=0; b<n; b++)
\r
10132 for (c=0; c<a; c++)
\r
10134 strcat (buf, result_array[b][c]);
\r
10136 falp[falp_size[0]]=vcalloc (strlen (buf)+1, sizeof (char));
\r
10137 sprintf ( falp[falp_size[0]++], "%s", buf);
\r
10138 vfree ( result_array[b]);
\r
10140 vfree (result_array);
\r
10144 falp[falp_size[0]]=vcalloc (2, sizeof (char));
\r
10145 sprintf ( falp[falp_size[0]++], "*");
\r
10146 free_char (alp, -1);
\r
// make_group_aa: returns an array of strings, each listing the residues (lower
// and upper case) that belong to one group, and advances ngroup[0] once per
// group stored. Named modes use hard-coded tables; any other mode is read as a
// substitution matrix name and groups are derived from its positive entries.
// A leading underscore in mode is skipped. Entering with ngroup[0]==-1 sets the
// extend flag.
// NOTE(review): this listing lacks brace-only and several declaration lines
// (buf, matrix, extend and the code resetting ngroup[0] are not visible).
// Block structure is inferred.
10150 char** make_group_aa (int *ngroup, char *mode)
\r
10152 /*mode: indicates which matrix will be used for the grouping*/
\r
10153 /*n_group: pointer to the number of groups */
\r
10154 /*return value: an array of strings containing the AA of each group */
\r
10158 int a, b,c,is_in;
\r
10160 char **group_list;
\r
10161 char *matrix_name;
\r
10163 matrix_name=vcalloc ( 100, sizeof (char));
\r
10165 if (ngroup[0]==-1)extend=1;
\r
// 100 rows of 27 columns: every group string written below must fit in 27 bytes.
10168 group_list=declare_char ( 100, 27);
\r
// Single-residue groups plus a star group; presumably guarded by the extend
// flag, the guard is not visible here.
10172 sprintf ( group_list[ngroup[0]++], "gG");
\r
10173 sprintf ( group_list[ngroup[0]++], "pP");
\r
10174 sprintf ( group_list[ngroup[0]++], "aA");
\r
10175 sprintf ( group_list[ngroup[0]++], "cC");
\r
10176 sprintf ( group_list[ngroup[0]++], "dD");
\r
10177 sprintf ( group_list[ngroup[0]++], "eE");
\r
10179 sprintf ( group_list[ngroup[0]++], "fF");
\r
10180 sprintf ( group_list[ngroup[0]++], "hH");
\r
10181 sprintf ( group_list[ngroup[0]++], "iI");
\r
10182 sprintf ( group_list[ngroup[0]++], "kK");
\r
10183 sprintf ( group_list[ngroup[0]++], "lL");
\r
10184 sprintf ( group_list[ngroup[0]++], "mM");
\r
10185 sprintf ( group_list[ngroup[0]++], "nN");
\r
10186 sprintf ( group_list[ngroup[0]++], "qQ");
\r
10187 sprintf ( group_list[ngroup[0]++], "rR");
\r
10189 sprintf ( group_list[ngroup[0]++], "sS");
\r
10190 sprintf ( group_list[ngroup[0]++], "tT");
\r
10191 sprintf ( group_list[ngroup[0]++], "vV");
\r
10192 sprintf ( group_list[ngroup[0]++], "wW");
\r
10193 sprintf ( group_list[ngroup[0]++], "*");
\r
10196 if ( mode && mode[0]=='_'){mode++;sprintf ( matrix_name, "%s", mode);}
\r
// NULL or empty mode selects the idmat matrix and falls through to the matrix
// code at the end.
10198 if (mode==NULL || mode[0]=='\0')sprintf ( matrix_name, "idmat");
\r
// Identity-style grouping: one group per letter a..z.
// NOTE(review): the strstr test on sim also matches the mode named simple, so
// the simple branch further down cannot be reached through this chain. The
// mode==NULL test at the end of this condition is dead, since NULL is handled
// by the branch above.
10199 else if ( strstr (mode, "sim") || strm (mode, "idmat") || mode==NULL)
\r
10201 sprintf ( group_list[ngroup[0]++], "aA");
\r
10202 sprintf ( group_list[ngroup[0]++], "bB");
\r
10203 sprintf ( group_list[ngroup[0]++], "cC");
\r
10204 sprintf ( group_list[ngroup[0]++], "dD");
\r
10205 sprintf ( group_list[ngroup[0]++], "eE");
\r
10206 sprintf ( group_list[ngroup[0]++], "fF");
\r
10207 sprintf ( group_list[ngroup[0]++], "gG");
\r
10208 sprintf ( group_list[ngroup[0]++], "hH");
\r
10209 sprintf ( group_list[ngroup[0]++], "iI");
\r
10210 sprintf ( group_list[ngroup[0]++], "jJ");
\r
10211 sprintf ( group_list[ngroup[0]++], "kK");
\r
10212 sprintf ( group_list[ngroup[0]++], "lL");
\r
10213 sprintf ( group_list[ngroup[0]++], "mM");
\r
10214 sprintf ( group_list[ngroup[0]++], "nN");
\r
10215 sprintf ( group_list[ngroup[0]++], "oO");
\r
10216 sprintf ( group_list[ngroup[0]++], "pP");
\r
10217 sprintf ( group_list[ngroup[0]++], "qQ");
\r
10218 sprintf ( group_list[ngroup[0]++], "rR");
\r
10219 sprintf ( group_list[ngroup[0]++], "sS");
\r
10220 sprintf ( group_list[ngroup[0]++], "tT");
\r
10221 sprintf ( group_list[ngroup[0]++], "uU");
\r
10222 sprintf ( group_list[ngroup[0]++], "vV");
\r
10223 sprintf ( group_list[ngroup[0]++], "wW");
\r
10224 sprintf ( group_list[ngroup[0]++], "xX");
\r
10225 sprintf ( group_list[ngroup[0]++], "yY");
\r
10226 sprintf ( group_list[ngroup[0]++], "zZ");
\r
10227 vfree (matrix_name);
\r
10228 return group_list;
\r
// Six-group table for the mode named simple (see the reachability note above).
10230 else if ( strm (mode, "simple"))
\r
10232 sprintf ( group_list[ngroup[0]++], "avilmAVILM");
\r
10233 sprintf ( group_list[ngroup[0]++], "dekrDEKR");
\r
10234 sprintf ( group_list[ngroup[0]++], "stcnqhSTCNQH");
\r
10235 sprintf ( group_list[ngroup[0]++], "wfyWFY");
\r
10236 sprintf ( group_list[ngroup[0]++], "gG");
\r
10237 sprintf ( group_list[ngroup[0]++], "pP");
\r
10238 vfree (matrix_name);
\r
10239 return group_list;
\r
// Six-group table for the mode named mafft.
10242 else if ( strm (mode, "mafft"))
\r
10246 sprintf ( group_list[ngroup[0]++],"agjopstAGJOPST");
\r
10247 sprintf ( group_list[ngroup[0]++],"ilmvILMV");
\r
10248 sprintf ( group_list[ngroup[0]++],"bdenqzBDENQZ");
\r
10249 sprintf ( group_list[ngroup[0]++],"hkrHKR");
\r
10250 sprintf ( group_list[ngroup[0]++],"fwyFWY");
\r
10251 sprintf ( group_list[ngroup[0]++],"cC");
\r
10252 vfree (matrix_name);
\r
10253 return group_list;
\r
// Table for the mode named clustalw; groups overlap (a residue may appear in several).
10255 else if ( strm (mode, "clustalw"))
\r
10258 sprintf ( group_list[ngroup[0]++],"astaASTA");
\r
10259 sprintf ( group_list[ngroup[0]++],"bneqkBNEQK");
\r
10260 sprintf ( group_list[ngroup[0]++],"cnhqkCNHQK");
\r
10261 sprintf ( group_list[ngroup[0]++],"dndeqDNDEQ");
\r
10262 sprintf ( group_list[ngroup[0]++],"eqhrkEQHRK");
\r
10263 sprintf ( group_list[ngroup[0]++],"fmilvFMILV");
\r
10264 sprintf ( group_list[ngroup[0]++],"gmilfGMILF");
\r
10265 sprintf ( group_list[ngroup[0]++],"hhyHHY");
\r
10266 sprintf ( group_list[ngroup[0]++],"ifywIFYW");
\r
10267 sprintf ( group_list[ngroup[0]++],"jcJC");
\r
10268 sprintf ( group_list[ngroup[0]++],"kpKP");
\r
10269 vfree (matrix_name);
\r
10270 return group_list;
\r
// Six-group table for the mode named polarity.
10272 else if ( strm (mode, "polarity"))
\r
10275 sprintf ( group_list[ngroup[0]++],"eqrsdnkhtEQRSDNKHT");
\r
10276 sprintf ( group_list[ngroup[0]++],"pP");
\r
10277 sprintf ( group_list[ngroup[0]++],"gG");
\r
10278 sprintf ( group_list[ngroup[0]++],"cC");
\r
10279 sprintf ( group_list[ngroup[0]++],"fywFYW");
\r
10280 sprintf ( group_list[ngroup[0]++],"iavlmIAVLM");
\r
10281 vfree (matrix_name);
\r
10282 return group_list;
\r
// Thirteen-group table for the mode named vasiliky.
10284 else if ( strm (mode, "vasiliky"))
\r
10287 sprintf ( group_list[ngroup[0]++], "rkRK");
\r
10288 sprintf ( group_list[ngroup[0]++], "deDE");
\r
10289 sprintf ( group_list[ngroup[0]++], "qhQH");
\r
10290 sprintf ( group_list[ngroup[0]++], "vilmVILM");
\r
10291 sprintf ( group_list[ngroup[0]++], "fyFY");
\r
10292 sprintf ( group_list[ngroup[0]++], "sS");
\r
10293 sprintf ( group_list[ngroup[0]++], "wW");
\r
10294 sprintf ( group_list[ngroup[0]++], "aA");
\r
10295 sprintf ( group_list[ngroup[0]++], "cC");
\r
10296 sprintf ( group_list[ngroup[0]++], "gG");
\r
10297 sprintf ( group_list[ngroup[0]++], "nN");
\r
10298 sprintf ( group_list[ngroup[0]++], "pP");
\r
10299 sprintf ( group_list[ngroup[0]++], "tT");
\r
10300 vfree (matrix_name);
\r
10301 return group_list;
\r
// Table for the mode named clustalw_col.
10303 else if ( strm (mode, "clustalw_col"))
\r
10305 sprintf ( group_list[ngroup[0]++], "staSTA");
\r
10306 sprintf ( group_list[ngroup[0]++], "neqkNEQK");
\r
10307 sprintf ( group_list[ngroup[0]++], "nhqkNHQK");
\r
10308 sprintf ( group_list[ngroup[0]++], "ndeqNDEQ");
\r
10309 sprintf ( group_list[ngroup[0]++], "qhrkQHRK");
\r
10310 sprintf ( group_list[ngroup[0]++], "milvMILV");
\r
10311 sprintf ( group_list[ngroup[0]++], "milfMILF");
\r
10312 sprintf ( group_list[ngroup[0]++], "hyHY");
\r
10313 sprintf ( group_list[ngroup[0]++], "fywFYW");
\r
10314 sprintf ( group_list[ngroup[0]++], "gG");
\r
10315 sprintf ( group_list[ngroup[0]++], "pP");
\r
10316 sprintf ( group_list[ngroup[0]++], "cC");
\r
10317 vfree (matrix_name);
\r
10319 return group_list;
\r
// Table for the mode named clustalw_dot.
10321 else if ( strm (mode, "clustalw_dot"))
\r
10323 sprintf ( group_list[ngroup[0]++], "csaCSA");
\r
10324 sprintf ( group_list[ngroup[0]++], "atvATV");
\r
10325 sprintf ( group_list[ngroup[0]++], "sagSAG");
\r
10326 sprintf ( group_list[ngroup[0]++], "stnkSTNK");
\r
10327 sprintf ( group_list[ngroup[0]++], "stpaSTPA");
\r
10328 sprintf ( group_list[ngroup[0]++], "sgndSGND");
\r
10329 sprintf ( group_list[ngroup[0]++], "sndeqkSNDEQK");
\r
10330 sprintf ( group_list[ngroup[0]++], "ndeqhkNDEQHK");
\r
10331 sprintf ( group_list[ngroup[0]++], "neqhrkNEQHRK");
\r
10332 sprintf ( group_list[ngroup[0]++], "fvlimFVLIM");
\r
10333 sprintf ( group_list[ngroup[0]++], "hfyHFY");
\r
10334 vfree (matrix_name);
\r
10335 return group_list;
\r
// One group holding every letter, written to slot 0.
// NOTE(review): the string is 52 characters long while rows were declared with
// 27 columns; assuming declare_char sizes rows from its second argument, this
// write overruns the row. No update of ngroup[0] is visible in this branch.
10337 else if ( strm (mode, "make_all"))
\r
10340 sprintf ( group_list[0], "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
\r
10341 vfree (matrix_name);
\r
10342 return group_list;
\r
// Any other mode is taken as a matrix name.
10344 else sprintf ( matrix_name, "%s", mode);
\r
10346 matrix=read_matrice ( matrix_name);
\r
// For each letter a with a positive diagonal entry, the letters b with
// matrix[a][b]>0 and a positive diagonal are collected (presumably into buf,
// on lines not visible here); the group is stored unless an identical string
// is already in the list.
10348 for ( a=0;a< 26; a++)
\r
10350 if ( matrix[a][a]>0)
\r
10352 for ( c=0,b=0;b< 26; b++)
\r
10355 if ( matrix[a][b]>0 && matrix[b][b]>0)
\r
10362 for ( is_in=0,b=0; b< ngroup[0]; b++)if ( strcmp (buf, group_list[b])==0)is_in=1;
\r
10363 if (is_in==0)sprintf ( group_list[ngroup[0]++], "%s", buf);
\r
10367 free_int (matrix, -1);
\r
10368 vfree (matrix_name);
\r
10370 return group_list;
\r
// make_group_aa_upgma: starts from one group per lower-case letter and merges
// groups pairwise, picking the unused pair with the highest matrix score and
// averaging the two matrix rows, then compacts the surviving groups to the
// front of the returned array.
//   matrix  name passed to read_matrice()
//   max_n   presumably the number of groups at which merging stops; its use is
//          not visible here
// NOTE(review): this listing lacks brace-only and several declaration lines
// (mat, used, the assignment of l, the outer merge loop and the update of
// ba, bb, best, set and used are not visible). l is used in declare_char on the
// first visible statement, so it must be set on a line not shown.
10372 char** make_group_aa_upgma (char*matrix, int max_n)
\r
10374 char **group_list;
\r
10377 int a, b, ba, bb, best, set, l, n;
\r
10380 group_list=declare_char (l+1, l+1);
\r
10381 for (a=0; a<l; a++)group_list[a][0]='a'+a;
\r
10382 mat=read_matrice(matrix);
\r
10383 used=vcalloc ( l, sizeof (int));
\r
// Search for the best-scoring pair among groups not yet marked in used.
10388 for (set=0,a=0; a<l-1; a++)
\r
10389 for (b=a+1; b<l; b++)
\r
10391 if (used[a]||used[b])continue;
\r
10393 if (set==0 || mat[a][b]>best)
\r
// Row and column ba become the integer average of the ba and bb rows.
10402 for (a=0; a<l; a++)
\r
10404 mat[ba][a]=mat[a][ba]=(mat [ba][a]+mat[bb][a])/2;
\r
// Group bb is appended to group ba and its row is released; this assumes the
// rows returned by declare_char can be freed one by one. TODO confirm.
10407 strcat (group_list[ba], group_list[bb]);
\r
10408 vfree (group_list[bb]);
\r
10409 group_list[bb]=NULL;
\r
// Compaction: surviving groups are moved to the first n slots. n is not
// returned to the caller in the visible lines.
10414 for (n=0,a=0; a<l; a++)
\r
10416 if ( group_list[a])
\r
10417 group_list[n++]=group_list[a];
\r
10419 vfree (used); free_int (mat, -1);
\r
10420 return group_list;
\r
// find_group_aa_distribution: for one alignment column, counts how many of the
// nseq residues fall in each group and tracks the largest count in c.
//   gl, n_group  caller-supplied groups; the visible lines show an alternative
//                where groups come from make_group_aa(mode) and are kept in
//                statics between calls
//   distrib      optional output array; when NULL a static buffer is used
// NOTE(review): this listing lacks brace-only and several declaration lines
// (a, b, c, d, gl2, n_group2, the tests selecting the group source and the
// return are not visible).
10423 int find_group_aa_distribution (char *col, int nseq,int n_group, char **gl, int *distrib, char *mode )
\r
10425 static int *distribution;
\r
10426 static char **lgl;
\r
10427 static int ln_group;
\r
10436 lgl=make_group_aa ( &ln_group, mode);
\r
10441 n_group2=ln_group;
\r
10446 n_group2=n_group;
\r
// NOTE(review): when the buffer is reallocated because ln_group<n_group, the
// previous buffer is not freed on this line.
10449 if ( distribution==NULL || ln_group<n_group)distribution=vcalloc ( n_group2, sizeof (int));
\r
10450 if ( distrib==NULL)d=distribution;
\r
10454 for ( a=0; a< n_group2; a++)d[a]=0;
\r
// A residue may add to several groups when groups overlap.
10456 for ( a=0; a< nseq; a++)
\r
10458 for ( b=0; b< n_group2; b++)
\r
10459 d[b]+=is_in_set (col[a], gl2[b]);
\r
10462 for ( a=0; a< n_group2; a++)
\r
10463 c=(d[a]>c)?d[a]:c;
\r
// is_in_same_group_aa: returns 1 when residues r1 and r2 both belong to at
// least one common group.
//   mode  NULL: plain character equality is returned;
//         clean: the cached group list is freed;
//         a mode containing cov: handled by a branch whose body is not visible;
//         otherwise the groups come from make_group_aa(mode) and are kept in
//         statics, or from the caller-supplied gl and n_group.
// NOTE(review): this listing lacks brace-only and several declaration lines
// (a, gl2, n_group2, the tests selecting the group source and the final return
// are not visible).
10469 int is_in_same_group_aa ( char r1, char r2, int n_group, char **gl, char *mode)
\r
10472 static char **lgl;
\r
10473 static int ln_group;
\r
10478 /*use mode=idmat for similarity based on id*/
\r
10482 if (mode==NULL)return (r1==r2)?1:0;
\r
10484 if ( strm (mode, "clean"))
\r
10486 free_char (lgl, -1);
\r
10491 else if ( strstr (mode, "cov"))
\r
10498 lgl=make_group_aa ( &ln_group, mode);
\r
10504 n_group2=ln_group;
\r
10509 n_group2=n_group;
\r
10512 for ( a=0; a< n_group2; a++)
\r
10513 if ( is_in_set ( r1, gl2[a]) && is_in_set ( r2, gl2[a]))return 1;
\r
10518 Alignment * gene2prot (Alignment *A){return A; }
\r
// test_gene2prot: experimental decoder that labels each position of sequence
// s1 with a state (reading frame positions, intron positions, non coding) by
// dynamic programming over a transition table, then prints the labelled
// sequence and a translation to stderr and calls myexit(EXIT_SUCCESS).
//   CL  constraint list; its entries involving s1 give a per-residue potential
//   s1  index of the sequence in CL->S
// NOTE(review): this listing lacks brace-only and several declaration lines
// (a, b, l, q, v, nal, nstate, tot, potential, is_coding, is_t4, codon, v_tab,
// v_tab_p, last_t4 and any return are not visible). Block structure is inferred.
10519 char * test_gene2prot (Constraint_list *CL, int s1)
\r
10522 int F=-10000000; /*FORBIDEN STATE*/
\r
10523 int AL=0; /*ALLOWED STATE*/
\r
10524 int SPLICE_PENALTY=1000;
\r
10525 int FRAME_PENALTY=1000;
\r
10528 int START, ORF1, ORF2, ORF3, s5NC;
\r
10529 int s3NC,ORF3_G1, ORF3_T2, ORF3_NC, ORF3_A3, ORF3_T4;
\r
10530 int U1_G1, U1_T2, U1_NC, U1_A3, U1_T4;
\r
10531 int U2_G1, U2_T2, U2_NC, U2_A3, U2_T4;
\r
10532 int U1, U2, U3, U4, U5, END;
\r
10535 int **transitions;
\r
10538 int **last_coding;
\r
10543 int orf1, orf2, orf3, ncp, p, state, pstate, e, best_state_p=0, best_state_v=0, best_pstate_p=0, best_pstate_v;
\r
10544 char *seq, *seq2, *seq3;
\r
10550 static int *entry;
\r
10553 seq=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
\r
10554 seq2=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
\r
10555 seq3=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
\r
10556 sprintf ( seq, "%s", (CL->S)->seq[s1]);
\r
// The working copy is lower-cased and every t is rewritten as u.
10560 for ( a=0; a< l; a++) seq[a]=tolower ( seq[a]);
\r
10561 for ( a=0; a< l; a++) seq[a]=(seq[a]=='t')?'u': seq[a];
\r
// potential[i] accumulates the weight of every constraint touching residue i+1 of s1.
10564 potential=vcalloc (l+1, sizeof (int));
\r
10565 CL=index_constraint_list ( CL);
\r
10566 for (nal=0, a=0; a<(CL->S)->nseq; a++)
\r
10567 for ( b=CL->start_index[s1][a]; b< CL->end_index[s1][a];b++)
\r
10569 entry=extract_entry(entry, b, CL);
\r
10570 if ( entry[SEQ1]==s1)potential[entry[R1]-1]+=entry[WE];
\r
10571 else if ( entry[SEQ2]==s1)potential[entry[R2]-1]+=entry[WE];
\r
// Overrides the initial value of 1000 given at declaration.
10577 SPLICE_PENALTY=10000;
\r
10578 FRAME_PENALTY=1000;
\r
// States are numbered in declaration order through nstate++; the range tests
// further down (for example state>=U1 && state<=U3) rely on this ordering.
// NOTE(review): s3NC and END are used as indices below but no assignment to
// them is visible in this listing.
10582 START=nstate++; ORF1=nstate++; ORF2=nstate++; ORF3=nstate++; s5NC=nstate++;
\r
10584 ORF3_G1=nstate++;U1_G1=nstate++;U2_G1=nstate++;
\r
10585 ORF3_T2=nstate++;U1_T2=nstate++;U2_T2=nstate++;
\r
10586 ORF3_NC=nstate++;U1_NC=nstate++;U2_NC=nstate++;
\r
10587 ORF3_A3=nstate++;U1_A3=nstate++;U2_A3=nstate++;
\r
10588 ORF3_T4=nstate++;U1_T4=nstate++;U2_T4=nstate++;
\r
10591 U1=nstate++; U2=nstate++; U3=nstate++; U4=nstate++; U5=nstate++;
\r
10594 is_coding=vcalloc ( nstate, sizeof (int));
\r
10595 is_coding[ORF1]=is_coding[ORF2]=is_coding[ORF3]=is_coding[U1]=is_coding[U2]=1;
\r
10596 is_coding[U3]=is_coding[U4]=is_coding[U5]=1;
\r
10598 is_t4=vcalloc ( nstate, sizeof (int));
\r
10599 is_t4[ORF3_T4]=is_t4[U1_T4]=is_t4[U2_T4]=1;
\r
// Every transition starts forbidden; the assignments below open the allowed
// ones, some with a frame or splice penalty subtracted.
10600 transitions=declare_int ( nstate, nstate);
\r
10601 for (a=0; a< nstate; a++)
\r
10602 for ( b=0; b< nstate; b++)transitions[a][b]=F;
\r
10604 transitions[START][ORF1]=AL;
\r
10605 transitions[START][s5NC]=AL-FRAME_PENALTY;
\r
10606 transitions[s5NC][s5NC]=AL;
\r
10608 transitions[s5NC][ORF1]=AL-FRAME_PENALTY;
\r
10610 transitions[ORF1][ORF2]=AL;
\r
10611 transitions[ORF2][ORF3]=AL;
\r
10612 transitions[ORF3][U1]=AL;
\r
10613 transitions[ORF3][ORF1]=AL;
\r
10614 transitions[ORF3][ORF3_G1]=AL-SPLICE_PENALTY;
\r
// Chain entered after a complete codon: G1, T2, a self-looping NC state, A3, T4.
10617 transitions[ORF3_G1][ORF3_T2]=AL;
\r
10618 transitions[ORF3_T2][ORF3_NC]=AL;
\r
10619 transitions[ORF3_NC][ORF3_NC]=AL;
\r
10620 transitions[ORF3_NC][ORF3_A3]=AL;
\r
10621 transitions[ORF3_A3][ORF3_T4]=AL;
\r
10622 transitions[ORF3_T4][ORF1]=AL-SPLICE_PENALTY;
\r
// Same chain entered from U1; it resumes in U3 then U4 before returning to ORF1.
10624 transitions[U1][U2]=AL;
\r
10625 transitions[U1][U1_G1]=AL-SPLICE_PENALTY;
\r
10626 transitions[U1_G1][U1_T2]=AL;
\r
10627 transitions[U1_T2][U1_NC]=AL;
\r
10628 transitions[U1_NC][U1_NC]=AL;
\r
10629 transitions[U1_NC][U1_A3]=AL;
\r
10630 transitions[U1_A3][U1_T4]=AL;
\r
10631 transitions[U1_T4][U3]=AL-SPLICE_PENALTY;
\r
10632 transitions[U3][U4]=AL;
\r
10633 transitions[U4][ORF1]=AL;
\r
// Same chain entered from U2; it resumes in U5 before returning to ORF1.
10635 transitions[U2][U2_G1]=AL-SPLICE_PENALTY;
\r
10636 transitions[U2_G1][U2_T2]=AL;
\r
10637 transitions[U2_T2][U2_NC]=AL;
\r
10638 transitions[U2_NC][U2_NC]=AL;
\r
10639 transitions[U2_NC][U2_A3]=AL;
\r
10640 transitions[U2_A3][U2_T4]=AL;
\r
10641 transitions[U2_T4][U5]=AL-SPLICE_PENALTY;
\r
10642 transitions[U5][ORF1]=AL;
\r
10644 transitions[ORF3][s3NC]=AL-FRAME_PENALTY;
\r
10645 transitions[ORF3][END]=AL;
\r
10646 transitions[s3NC][END]=AL;
\r
// Tables indexed by position 0..l and by state: best score, best predecessor,
// last coding position and last T4 position.
10649 v_tab=declare_int ( l+1,nstate);
\r
10650 v_tab_p=declare_int ( l+1,nstate);
\r
10651 last_coding=declare_int ( l+1,nstate);
\r
10652 last_t4=declare_int ( l+1,nstate);
\r
10654 for (a=0; a< l; a++) potential[a]-=200;
\r
10656 codon=vcalloc ( 4, sizeof (char));
\r
10657 best_pstate_p=START;
\r
// Forward pass over positions 1..l.
10660 for ( p=1; p<=l; p++)
\r
// orf1, orf2, orf3: score for p being the first, second or third base of a
// codon; forbidden when the codon translates to x or runs off either end.
10662 if (translate_dna_codon (seq+(p-1), 'x')=='x' || p>(l-2))orf1=F;
\r
10663 else orf1=potential[p-1];
\r
10665 if (p<2 || translate_dna_codon (seq+(p-2), 'x')=='x' || p>(l-1))orf2=F;
\r
10666 else orf2=potential[p-1];
\r
10669 if (p<3 || translate_dna_codon (seq+(p-3), 'x')=='x' || p>l)orf3=F;
\r
10670 else orf3=potential[p-1];
\r
// ncp is the negated best of the three frame scores when that best is not F.
10672 if ( best_int (3, 1, &a, orf1, orf2, orf3)!=F)ncp=-best_int (3, 1, &a, orf1, orf2, orf3);
\r
10675 for ( state=0; state< nstate; state++)
\r
10678 if ( state==ORF1)e=orf1;
\r
10679 else if ( state==ORF2)e=orf2;
\r
10680 else if ( state==ORF3)e=orf3;
\r
10681 else if ( state>=U1 && state<=U3)
\r
// U4 and U5 rebuild a codon that spans a splice from remembered coding positions.
10685 else if ( state==U4)
\r
10687 codon[2]=seq[p-1];
\r
10688 codon[1]=seq[last_coding[p-1][U3]-1];
\r
10689 codon[0]=seq[last_coding[p-2][U1_T4]-1];
\r
10690 if ( translate_dna_codon (codon, 'x')=='x')e=F;
\r
10693 else if ( state==U5)
\r
10695 codon[2]=seq[p-1];
\r
10696 codon[1]=seq[last_coding[p-1][U2_T4]-1];
\r
// NOTE(review): q is read from seq, so a sequence character is then used as a
// position index on the next line; a position from last_coding may have been
// intended. Confirm.
10697 q=seq[last_coding[p-1][U2_T4]];
\r
10698 codon[0]=seq[last_coding[q-1][U1]-1];
\r
10699 if ( translate_dna_codon (codon, 'x')=='x')e=F;
\r
// The G1 and T2 states require the dinucleotide gu; A3 requires a, T4 requires u.
10703 else if (state>=ORF3_G1 && state<=U2_G1)e=(p<l-1 && seq[p-1]=='g' && seq[p]=='u')?ncp:F;
\r
10704 else if ( state>=ORF3_T2 && state<=U2_T2)
\r
10706 e=(p>1 && seq[p-2]=='g' && seq[p-1]=='u')?ncp:F;
\r
10708 else if ( state>=ORF3_A3 && state<=U2_A3)e=(seq[p-1]=='a')?ncp:F;
\r
10709 else if ( state>=ORF3_T4 && state<=U2_T4)e=(seq[p-1]=='u')?ncp:F;
\r
// Best predecessor for (p, state); any forbidden component forbids the move.
10712 for ( pstate=0; pstate<nstate; pstate++)
\r
10714 if (e==F || transitions[pstate][state]==F || v_tab[p-1][pstate]==F)v=F;
\r
10715 else v=e+transitions[pstate][state]+v_tab[p-1][pstate];
\r
10717 if ( pstate==0 || v>best_pstate_v)
\r
10718 {best_pstate_v=v;best_pstate_p=pstate;}
\r
10720 v_tab[p][state]=best_pstate_v;
\r
10721 v_tab_p[p][state]=best_pstate_p;
\r
// Coding states record p; other states inherit the value of their predecessor.
10723 if (!is_coding[state])last_coding[p][state]=last_coding[p-1][best_pstate_p];
\r
10724 else if (is_coding[state])last_coding[p][state]=p;
\r
10726 if (!is_t4[state])
\r
10728 if (is_coding[state] && last_t4[p-1][best_pstate_p]==0)last_t4[p][state]=p;
\r
10729 else last_t4[p][state]=last_t4[p-1][best_pstate_p];
\r
10731 else if (is_t4[state])last_t4[p][state]=p;
\r
10733 if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;}
\r
// Trace back from the best final state: coding bases are collected (in reverse
// order) into seq2 and seq is relabelled in place according to the state.
10737 for ( p=l; p>0; p--)
\r
10739 if ( best_state_p>=ORF1 && best_state_p<=ORF3){seq2[tot++]=tolower (seq[p-1]);}
\r
10740 else if ( best_state_p>=U1 && best_state_p<=U5){seq2[tot++]=tolower (seq[p-1]);}
\r
10741 if (best_state_p==ORF1)seq[p-1]=toupper (seq[p-1]);
\r
10742 else if (best_state_p==ORF2 || best_state_p==ORF3)seq[p-1]=tolower (seq[p-1]);
\r
10743 else if ( best_state_p==ORF3_NC || best_state_p==U1_NC || best_state_p==U2_NC) seq[p-1]='.';
\r
10744 else if ( best_state_p==U1 || best_state_p==U2 || best_state_p==U3 || best_state_p==U4 || best_state_p==U5) seq[p-1]=best_state_p-U1+'1';
\r
10745 else seq[p-1]=toupper (seq[p-1]);
\r
10746 best_state_p=v_tab_p[p][best_state_p];
\r
// Presumably reverses seq2 into seq3; the loop body is not visible here.
10749 for ( a=0, b=tot-1; b>=0; b--, a++)
\r
10752 fprintf ( stderr, "\n%s\n", seq);
\r
10753 fprintf ( stderr, "\nN coding=%d\n", tot);
\r
// Translates the coding bases codon by codon; the program exits at the first
// codon that translates to x, and in any case after the loop.
10754 for ( a=0; a< tot; a+=3)
\r
10756 b=translate_dna_codon (seq3+a, 'x');
\r
10757 fprintf ( stderr, "%c",b);
\r
10758 if ( b=='x'){fprintf ( stderr, "\n");myexit (EXIT_SUCCESS);}
\r
10761 fprintf ( stderr, "\n");
\r
10762 myexit (EXIT_SUCCESS);
\r
// dna_aln2_3frame_cdna_aln: builds a six-row alignment B. Rows 0..2 hold
// translate_dna_seq() applied to A->seq_al[l_s[0][0]] at offsets 0, 1 and 2;
// rows 3..5 hold the same for A->seq_al[l_s[1][0]].
//   ns  not used in the visible lines
// NOTE(review): the declarations of B and a, any assignment of B->nseq and the
// return are not visible in this listing.
10768 Alignment * dna_aln2_3frame_cdna_aln(Alignment *A,int *ns,int **l_s)
\r
// Row width is the sum of the two nucleotide row lengths.
10772 B=realloc_aln2 (NULL,6,strlen(A->seq_al[l_s[0][0]])+strlen(A->seq_al[l_s[1][0]]));
\r
10773 for ( a=0; a< 3; a++)
\r
10775 B->seq_al[a]=translate_dna_seq (A->seq_al[l_s[0][0]]+a, 0, 'o',B->seq_al[a]);
\r
10776 B->seq_al[a+3]=translate_dna_seq (A->seq_al[l_s[1][0]]+a, 0, 'o',B->seq_al[a+3]);
\r
// Offset rows that came out shorter than the offset-0 row get one x appended.
10778 for ( a=1; a<3; a++)
\r
10780 if ( strlen(B->seq_al[a])<strlen(B->seq_al[0])) B->seq_al[a]=strcat ( B->seq_al[a], "x");
\r
10781 if ( strlen(B->seq_al[a+3])<strlen(B->seq_al[3])) B->seq_al[a+3]=strcat ( B->seq_al[a+3], "x");
\r
10785 B->len_aln=strlen (B->seq_al[0]);
\r
10790 //For normal distribution scan
\r
10792 #define PI 3.141592653589793238462643
\r
10795 double normal(double x, double mean, double std)
\r
10797 return (1/(std*sqrt(2.0*PI)))*exp((-0.5*(x-mean)*(x-mean))/(std*std));
\r
// get_sim_aln_array_normal_distribution: fills a symmetric nseq x nseq integer
// matrix w where each off-diagonal entry is the score returned by
// generic_get_seq_sim_normal_distribution() for the two aligned rows.
//   STD, CENTER  forwarded unchanged to the scoring function
// Diagonal entries are not written by the visible loops.
// NOTE(review): the declarations of w, a, b and the return are not visible in
// this listing.
10800 int ** get_sim_aln_array_normal_distribution ( Alignment *A, char *mode, int *STD, int *CENTER)
\r
10806 w=declare_int ( A->nseq, A->nseq);
\r
// Upper triangle only; each score is mirrored into the lower triangle.
10808 for ( a=0; a< A->nseq-1; a++)
\r
10810 for ( b=a+1; b< A->nseq; b++)
\r
// The third argument is A->cdna_cache[0] when a cache exists, NULL otherwise.
10813 w[a][b]=w[b][a]=generic_get_seq_sim_normal_distribution ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode, STD, CENTER);
\r
10818 int generic_get_seq_sim_normal_distribution ( char *seq1, char *seq2, int*cache, char *mode, int *STD, int *CENTER)
\r
10820 return get_seq_sim_distribution ( seq1,seq2,GAP_LIST, mode, STD, CENTER);
\r
// get_seq_sim_distribution: position-weighted similarity between two aligned
// strings. Each column whose two residues are in the same group adds
// normal(a, *CENTER, *STD) to sim, where a is the column index, so columns
// near CENTER weigh more.
//   ignore   characters treated as absent (p1 and p2 are 0 for them)
//   in_mode  copied into a local buffer named mode; a mode containing idscore
//            is delegated to idscore_pairseq() with a cached blosum62mt matrix
// NOTE(review): this listing lacks brace-only and several declaration lines
// (mode, p, a, sim, pos0, len1, p1, p2 and the final return are not visible).
// Two comment openers below have no visible closing delimiter.
10823 int get_seq_sim_distribution ( char *string1, char *string2, char *ignore, char *in_mode, int *STD, int *CENTER)
\r
10829 int r=0,r1=0,r2=0;
\r
10836 sprintf ( mode, "%s", in_mode);
\r
10838 /*mode: <mat>__<sim_mode>
\r
10839 mat: idscore to get the alignment done
\r
10840 any legal cw matrix
\r
10841 sim_mode: sim1->identities/matches
\r
10842 sim2->identities/min len
\r
10846 if ( (p=strstr (mode, "_"))!=NULL)
\r
10853 if (strstr (mode, "idscore"))
\r
10855 static int **mat;
\r
10856 if (!mat) mat=read_matrice ("blosum62mt");
\r
10857 return idscore_pairseq (string1, string2, -12, -1, mat,mode);
\r
10860 len1=strlen (string1);
\r
10861 for ( sim=pos0=0,a=0; a< len1; a++)
\r
10865 p1=1-is_in_set (r1, ignore);
\r
10866 p2=1-is_in_set (r2, ignore);
\r
10870 if (is_in_same_group_aa(r1,r2,0, NULL, mode))
\r
10872 sim += normal(a, *CENTER, *STD);
\r
10875 else if (p1+p2==1)
\r
// Result for sim1, sim or no sub-mode: 0 when pos0 is 0, otherwise sim scaled
// by MAXID (converted to int on assignment to r).
10881 if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
\r
10883 r=(pos0==0)?0:(sim*MAXID);
\r
10885 /* else if ( strm (p, "sim2"))
\r
10887 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
\r
10889 else if ( strm (p, "sim3"))
\r
10891 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
\r
10893 else if ( strm (p, "gap1"))
\r
10895 r=(len1==0)?MAXID:(gap*MAXID)/len1;
\r
10898 else if ( strm (p, "logid"))
\r
10900 r=logid_score (pos0, sim);
\r
// aln2clean_pw_aln: scores every residue of A by how the pairwise projections
// of the alignment are rated by pw_aln2clean_aln_weight(), then encodes the
// score r=(C*10)/T in a copy B according to F->mode:
//   NULL or number  the residue is replaced by the digit character for r
//   unalign         same digit encoding, then unalign_aln(A, B, F->t) is applied
//   lower           residues with r<=F->t are lower-cased, others upper-cased
// F->t defaults to 2 when it is 0.
// NOTE(review): this listing lacks brace-only and several declaration lines
// (a, b, c, r, w, B, C, T, the updates of C and T, the release of w and the
// return are not visible). Block structure is inferred.
10907 Alignment *aln2clean_pw_aln (Alignment *A, OveralnP *F)// char *mode, int t, int f, int p1,int p2, int p3, char *fsa_mode)
\r
10914 if (F->t==0)F->t=2;
\r
// C and T are per-residue integer tables; r below is (C*10)/T.
10916 C=declare_int ( A->nseq, A->len_aln);
\r
10917 T=declare_int ( A->nseq, A->len_aln);
\r
10918 B=copy_aln (A, NULL);
\r
10920 for (a=0; a< A->nseq;a++)
\r
10922 for (b=0; b<A->nseq; b++)
\r
// w is a freshly allocated per-column weight array for the pair (a, b).
10925 w=pw_aln2clean_aln_weight (A->seq_al[a], A->seq_al[b], 1,F);//f,p1, p2, p3, fsa_mode);
\r
10926 for (c=0; c<A->len_aln; c++)
\r
10928 if (A->seq_al[a][c]=='-')continue;
\r
10938 for (a=0; a<A->nseq; a++)
\r
10940 for (b=0; b<A->len_aln; b++)
\r
10943 c=A->seq_al[a][b];
\r
// Empty statement on purpose: positions with T==0 are left untouched, which
// also keeps the division below from dividing by zero.
10945 else if (T[a][b]==0);
\r
10949 r=(C[a][b]*10)/T[a][b];
\r
10951 if (!F->mode || strm (F->mode, "number"))
\r
10952 B->seq_al[a][b]='0'+r;
\r
10953 else if ( F->mode && strm (F->mode, "unalign"))
\r
10954 B->seq_al[a][b]='0'+r;
\r
10955 else if ( F->mode && strm (F->mode, "lower") )
\r
10957 if (r<=F->t)B->seq_al[a][b]=tolower (B->seq_al[a][b]);
\r
10958 else B->seq_al[a][b]=toupper (B->seq_al[a][b]);
\r
10964 if ( F->mode && strm (F->mode, "unalign"))
\r
10966 A=unalign_aln (A, B, F->t);
\r
// NOTE(review): B is reassigned here; no release of the earlier copy is visible.
10968 B=copy_aln (A, NULL);
\r
10971 free_int (C, -1);
\r
10972 free_int (T, -1);
\r
// Forward declarations of the two pairwise cleaning models.
10977 char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *F);
\r
10978 char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *F);
\r
// pw_aln2clean_aln_weight: runs the pairwise cleaner on a private copy of the
// two aligned strings and returns a newly allocated array of l+1 ints in which
// selected columns are set to w and the others stay 0.
//   seq1, seq2  aligned strings; differing lengths are a fatal error
//   w           value stored for the selected columns
//   F           parameters forwarded to pw_aln2clean_pw_aln()
// NOTE(review): the declarations of l, a, aln, weight and the return are not
// visible in this listing.
10980 int * pw_aln2clean_aln_weight ( char *seq1, char *seq2, int w, OveralnP *F)
\r
10986 if ( (l=strlen (seq1)) !=strlen (seq2))
\r
10988 HERE ("\n%s\n%s\n", seq1, seq2);
\r
10989 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Comparing unaligned sequences [FATAL:%s]", PROGRAM);
\r
10993 aln=declare_char (2, l+1);
\r
10994 sprintf ( aln[0], "%s", seq1);
\r
10995 sprintf ( aln[1], "%s", seq2);
\r
10998 aln=pw_aln2clean_pw_aln (aln, F);
\r
11000 weight=vcalloc (l+1, sizeof (int));
\r
11001 for (a=0; a<l; a++)
\r
// A column is selected when the cleaned first row holds a non-zero byte there,
// or when either input residue is x or X.
11003 if ( aln[0][a] || seq1[a]=='x' || seq1[a]=='X' || seq2[a]=='x' || seq2[a]=='X')weight[a]=w;
\r
11005 free_char (aln, -1);
\r
// pw_aln2clean_pw_aln: dispatches a two-row alignment to the cleaning model
// named by F->model. "fsa2" selects pw_aln2clean_pw_aln_fsa2; "fsa1" and any
// other value both select pw_aln2clean_pw_aln_fsa1, so the explicit "fsa1"
// branch and the fallback are equivalent.
// NOTE(review): F->model is passed to strm without a visible NULL check.
11011 char **pw_aln2clean_pw_aln (char ** aln, OveralnP *F)
\r
11014 if ( strm (F->model, "fsa2"))return pw_aln2clean_pw_aln_fsa2 (aln,F);
\r
11015 else if ( strm (F->model, "fsa1"))return pw_aln2clean_pw_aln_fsa1 (aln,F);
\r
11016 else return pw_aln2clean_pw_aln_fsa1 (aln,F);
\r
// pw_aln2clean_pw_aln_fsa2: labels every column of the two-row alignment aln
// with a state chosen by a best-path dynamic programme over ns states
// (emission table mat, transition table tran, score table p, traceback table
// t) and overwrites both rows of aln with the translated state code.
// Zero-valued p1..p4 in FO are replaced by defaults, i.e. FO is modified.
// NOTE(review): many lines (several declarations, most emission/transition
// assignments, the computation of obs, the return) are not visible in this
// excerpt; comments describe only what is visible.
11019 char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *FO)
\r
11021 int a, b, c, d, l, id;
\r
11022 int c1, c2, e0, e1,tb, obs;
\r
11024 int **mat, **tran, **p, **t, *s, *ids;
\r
11026 int S, M1, M2, m1, m2,B1, B2,G1,G2, K;
\r
// Default 50, overridable through the MID_EXON_FACTOR environment variable.
11028 int MID_EXON_FACTOR=50;
\r
// Substitution matrix, loaded once and kept for later calls.
11030 static int **smat;
\r
11031 int model_type=1;
\r
11034 if ( getenv ("MID_EXON_FACTOR"))MID_EXON_FACTOR=atoi (getenv ("MID_EXON_FACTOR"));
\r
11038 if (!smat)smat=read_matrice ( "blosum62mt");
\r
11040 l=strlen (aln[0]);
\r
// Both rows must have the same length; otherwise this is fatal.
11042 if ( l!=strlen (aln[1]))
\r
11044 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings");
\r
// s: per-column state code; ids: per-column identity flag.
11049 s=vcalloc (l, sizeof (int));
\r
11050 ids=vcalloc (l, sizeof (int));
\r
11052 //record the id level of each position
\r
11053 for (b=0; b<l; b++)
\r
// NOTE(review): the inner "c2=" assignment is redundant, since the outer
// assignment overwrites it.
11055 c1=tolower(aln[0][b]);c2=tolower(c2=aln[1][b]);
\r
// NOTE(review): c1 and c2 were lowercased just above, so the comparisons with
// 'X' cannot match in the default locale; 'x' was probably intended - verify.
11057 if (c1=='-' || c2=='-' || c1=='X' || c2=='X' || c1!=c2)ids[b]=0;
\r
11061 //record the state of each position: M, m, T, gap
\r
11062 for (id=0,b=0,a=0;a<l; a++)
\r
11064 c1=aln[0][a];c2=aln[1][a];
\r
// State 3: the residue in the first row is lowercase.
11065 if (islower (c1))s[a]=3;
\r
// State 2: a gap or an 'X' in either row.
11066 else if (c1=='-' || c2=='-' || c1=='X' || c2=='X')s[a]=2;
\r
// Otherwise the pair is scored with the substitution matrix, indexed by the
// offset of each character from 'A'.
11070 sc=smat[c1-'A'][c2-'A'];
\r
// State 1: substitution score of at least 2; id counts these columns.
11071 if (sc>=2){id++; s[a]=1;}
\r
// NOTE(review): presumably part of an early-exit path whose condition is not
// visible here - confirm.
11079 vfree(s);vfree (ids);
\r
// Defaults for unset (zero) parameters: p1=5, p2=15, p3 stays 0, p4=100.
11085 FO->p1=(FO->p1==0)?5:FO->p1;
\r
11086 FO->p2=(FO->p2==0)?15:FO->p2;
\r
11087 FO->p3=(FO->p3==0)?0:FO->p3;
\r
11088 FO->p4=(FO->p4==0)?100:FO->p4;
\r
// T1: 100*id/b. T2: 30 when FO->f is 0, otherwise FO->f percent of T1.
11091 T1=100*(float)id/(float)b;
\r
11092 T2=(FO->f==0)?30:T1*(float)((float)FO->f/(float)100);
\r
11098 //3: exon boundary
\r
// State indices are handed out sequentially from the counter ns.
11102 M1=ns++;//1 matched aligned
\r
11103 m1=ns++;//2 mismatched aligned
\r
11104 M2=ns++;//3 matched unaligned
\r
11105 m2=ns++;//4 mismatched unaligned
\r
11106 B1=ns++;//5 transition aligned
\r
11107 B2=ns++;//6 transition unaligned
\r
// mat: ns x 4 emission scores; tran: ns x ns transition scores;
// p: (l+1) x ns best scores; t: (l+1) x ns traceback pointers.
11109 mat=declare_int (ns, 4);
\r
11110 tran=declare_int (ns, ns);
\r
11111 p=declare_int (l+1, ns);
\r
11112 t=declare_int (l+1, ns);
\r
11114 //emission Values
\r
11115 mat[M1][0]=F; //non id
\r
11116 mat[M1][1]=T1;//id
\r
11117 mat[M1][2]=0; //gap
\r
11118 mat[M1][3]=F; //transition
\r
11125 mat[m1][0]=100-T1;
\r
11130 mat[m2][0]=100-T2;
\r
11145 //transition values
\r
11155 tran[M1][m2]=-FO->p4;
\r
11156 tran[M1][M1]=+FO->p2;
\r
11160 tran[M1][B2]=-FO->p1;
\r
11163 tran[M2][m2]=+FO->p3;
\r
11177 tran[m1][B2]=-FO->p1;
\r
11181 tran[m2][M1]= -FO->p4;
\r
11182 tran[m2][M2]= +FO->p3;
\r
11195 tran[B2][m1]= -FO->p1;
\r
11197 tran[B2][M1]= -FO->p1;
\r
// translate: maps a state index to the code written back into aln.
11203 translate=vcalloc (ns, sizeof (int));
\r
// Forward pass: for each position a and current state cs, keep the best
// previous state ps.
11211 for (a=1;a<=l; a++)
\r
11215 for (cs=0; cs<ns; cs++)
\r
11217 for (ps=0; ps<ns; ps++)
\r
// Candidate score = best score of ps at a-1 + emission of obs in cs
// + transition ps->cs.
11219 c=p[a-1][ps]+mat[cs][obs]+tran[ps][cs];
\r
// ">=" means that on ties the highest-numbered previous state wins.
11220 if (ps==0 || c>=best){t[a][cs]=ps;best=p[a][cs]=c;}
\r
// Termination: pick the best final state (ties go to the highest index).
11227 for (a=0; a<ns; a++)
\r
11229 if (a==0 || p[l][a]>=best){tb=a;best=p[l][a];}
\r
// Traceback from the last position to the first.
11232 for (a=l; a>0; a--)
\r
// Both rows receive the translated code of the chosen state.
11238 aln[0][p2]=aln[1][p2]=translate[tb];
\r
11243 free_int (p, -1);
\r
11245 free_int (t, -1);
\r
11246 free_int (mat, -1);
\r
11247 free_int (tran, -1);
\r
11248 vfree (translate);
\r
// pw_aln2clean_pw_aln_fsa1: four-state variant of the pairwise cleaner.
// Columns with a gap or x/X in either row are skipped; the remaining columns
// are labelled by a best-path dynamic programme (emission table mat,
// transition table tran, score table p, traceback table t) and both rows of
// aln are overwritten, at the original column positions, with the translated
// state code. Zero-valued f, p1, p2, p3 in FO are replaced by defaults, i.e.
// FO is modified.
// NOTE(review): many lines (declarations, most emission/transition
// assignments, the computation of obs and T2, the return) are not visible in
// this excerpt; comments describe only what is visible.
11251 char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *FO)
\r
11253 int a, b, c, d, l, id;
\r
11254 int c1, c2, e0, e1,tb, obs;
\r
11256 int **mat, **tran, **p, **t, **s;
\r
11258 int S, M1, M2, m1, m2, K;
\r
// Substitution matrix, loaded once and kept for later calls.
11261 static int **smat;
\r
11265 if (!smat)smat=read_matrice ( "blosum62mt");
\r
11267 l=strlen (aln[0]);
\r
// Both rows must have the same length; otherwise this is fatal.
11269 if ( l!=strlen (aln[1]))
\r
11271 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings");
\r
// s[k][0]: match flag of the k-th retained column; s[k][1] is read during
// traceback as the index of that column in aln (its assignment is not
// visible here).
11275 s=declare_int (l+1, 2);
\r
11276 for (id=0,b=0,a=0;a<l; a++)
\r
11278 c1=aln[0][a];c2=aln[1][a];
\r
// Gapped or masked (x/X) columns are not part of the model.
11280 if ( c1=='-' || c2=='-' || c1=='x' || c1=='X' || c2=='x' || c2=='X')continue;
\r
// Substitution score of the pair, indexed by the offset from 'A'.
11284 sc=smat[c1-'A'][c2-'A'];
\r
// A score of at least 2 counts as a match; id counts the matches.
11285 if (sc>=2){id++; s[b][0]=1;}
\r
11286 else {s[b][0]=0;}
\r
// NOTE(review): s is freed again near the end of the function, so this call
// is presumably inside an early-exit branch that is not visible - confirm.
11294 free_int (s, -1);
\r
// Defaults for unset (zero) parameters: f=30, p1=90, p2=15, p3 stays 0.
11297 FO->f=(FO->f==0)?30:FO->f;
\r
11298 FO->p1=(FO->p1==0)?90:FO->p1;
\r
11299 FO->p2=(FO->p2==0)?15:FO->p2;
\r
11300 FO->p3=(FO->p3==0)?0:FO->p3;
\r
11302 l=b;//length of the ungapped aln
\r
// T1: 100*id/b, where b is the number of retained columns.
11303 T1=100*(float)id/(float)b;
\r
// State indices are handed out sequentially from the counter ns.
11314 M1=ns++;//1 matched aligned
\r
11315 m1=ns++;//2 mismatched aligned
\r
11316 M2=ns++;//3 matched unaligned
\r
11317 m2=ns++;//4 mismatched unaligned
\r
// mat: ns x 2 emission scores; tran: ns x ns transition scores;
// p: (l+1) x ns best scores; t: (l+1) x ns traceback pointers.
11319 mat=declare_int (ns, 2);
\r
11320 tran=declare_int (ns, ns);
\r
11321 p=declare_int (l+1, ns);
\r
11322 t=declare_int (l+1, ns);
\r
11331 mat[m1][0]=100-T1;
\r
11334 mat[m2][0]=100-T2;
\r
11345 tran[M1][m2]=-FO->p1;// -P;
\r
11346 tran[M1][M1]=+FO->p2;
\r
11351 tran[M2][m2]=+FO->p3;
\r
11364 tran[m2][M1]=-FO->p1;
\r
11365 tran[m2][M2]=+FO->p3;
\r
// translate: maps a state index to the code written back into aln.
11368 translate=vcalloc (ns, sizeof (int));
\r
// Forward pass: for each position a and current state cs, keep the best
// previous state ps.
11376 for (a=1;a<=l; a++)
\r
11380 for (cs=0; cs<ns; cs++)
\r
11382 for (ps=0; ps<ns; ps++)
\r
// Candidate score = best score of ps at a-1 + emission of obs in cs
// + transition ps->cs.
11384 c=p[a-1][ps]+mat[cs][obs]+tran[ps][cs];
\r
// ">=" means that on ties the highest-numbered previous state wins.
11385 if (ps==0 || c>=best){t[a][cs]=ps;best=p[a][cs]=c;}
\r
// Termination: pick the best final state (ties go to the highest index).
11392 for (a=0; a<ns; a++)
\r
11394 if (a==0 || p[l][a]>=best){tb=a;best=p[l][a];}
\r
// Traceback from the last retained column to the first.
11396 for (a=l; a>0; a--)
\r
// p2: column index taken from s for the (a-1)-th retained column.
11398 int p2=s[a-1][1];
\r
// Both rows receive the translated code of the chosen state.
11399 aln[0][p2]=aln[1][p2]=translate[tb];
\r
11405 free_int (p, -1);
\r
11406 free_int (s, -1);
\r
11407 free_int (t, -1);
\r
11408 free_int (mat, -1);
\r
11409 free_int (tran, -1);
\r
11410 vfree (translate);
\r
// analyze_overaln: works on private copies of iA and iB. The copy of iA is
// passed through filter_aln_upper_lower with its gap cache, the copy of iB is
// annotated by aln2clean_pw_aln, and the two results are compared by aln2pred
// using the same mode string.
// NOTE(review): the lines that copy filter, f, p1, p2, p3 into F, the clean-up
// and the return are not visible in this excerpt.
11413 float* analyze_overaln ( Alignment *iA, Alignment *iB, char *mode, int filter, int f, int p1,int p2, int p3)
\r
11415 Alignment *C, *D;
\r
11416 Alignment *A, *B;
\r
// Zero-initialised parameter block handed to the cleaner.
11419 F=vcalloc (1, sizeof (OveralnP));
\r
// NOTE(review): requires F->mode to refer to writable storage large enough
// for mode; its allocation is not visible here - verify.
11425 sprintf (F->mode, "%s", mode);
\r
// Work on copies so that the caller's alignments are not modified by the
// calls below.
11429 A=copy_aln (iA, NULL);
\r
11430 B=copy_aln (iB, NULL);
\r
11432 C=aln2gap_cache (A,0);
\r
11433 A=filter_aln_upper_lower (A, C, 0, 0);
\r
11434 D=aln2clean_pw_aln (B, F);
\r
11435 r=aln2pred (A,D,mode);
\r
// aln2pred: compares the per-residue symbols of A and B for sequences that
// share a name. It builds, per sequence, a table of symbol co-occurrences
// r[seq][symbol in A][symbol in B], then derives tp/tn/fp/fn counts for every
// symbol s and passes them to rates2sensitivity. Substrings of mode select
// optional behaviour: "case", "printaln", "printstat", and "_<symbol>_".
// Several buffers are function-static: they are released and re-allocated on
// each call, so the function is not reentrant.
// NOTE(review): many lines (declarations of r, type, fresult, AL, C, S, c1,
// c2; the filling of fresult; the return) are not visible in this excerpt.
11442 float* aln2pred ( Alignment *A, Alignment*B, char *mode)
\r
11444 int a, b, c, d, i, l, salp, s, n;
\r
11445 static char **list, *buf1, *buf2, *alp, *alp_lu;
\r
11448 int fp, fn, tn, tp;
\r
11449 int tfp, tfn, ttn, ttp;
\r
11450 float sp, sn, sen2, best, result;
\r
// Result vector of three floats.
11454 fresult=vcalloc ( 3, sizeof (float));
\r
// "case": both alignments are passed through aln2case_aln with "u" and "l".
11456 if ( mode && strstr (mode, "case"))
\r
11458 A=aln2case_aln (A,"u","l");
\r
11459 B=aln2case_aln (B,"u","l");
\r
// "printaln": works on a copy C of B and a sequence set S (its origin is not
// visible here).
11462 if (mode && strstr (mode, "printaln"))
\r
11467 C=copy_aln (B, NULL);
\r
11468 for (a=0; a<B->nseq; a++)
\r
11470 i=name_is_in_list (C->name[a], S->name, S->nseq, 100);
\r
11472 for (b=0; b<C->len_aln; b++) C->seq_al[a][b]='-';
\r
11474 for (d=0,b=0; b<C->len_aln; b++)
\r
11476 if ( !is_gap (C->seq_al[a][b]))
\r
11478 if (C->seq_al[a][b]==S->seq[i][d])C->seq_al[a][b]=toupper(C->seq_al[a][b]);
\r
// alp is indexed by character and tested for non-zero below; alp_lu is
// indexed by symbol number and yields a character.
11486 vfree (alp);vfree (alp_lu);
\r
11487 alp=vcalloc ( 256, sizeof (char));
\r
11488 alp_lu=vcalloc ( 256, sizeof (char));
\r
// Two passes over AL (presumably A, then B - the assignment of AL is not
// visible here).
11490 for (c=0; c<2; c++)
\r
11494 for (salp=0,a=0; a<AL->nseq; a++)
\r
11496 for (b=0; b<AL->len_aln; b++)
\r
// NOTE(review): c is also the counter of the enclosing two-pass loop; unless
// it is shadowed by a declaration that is not visible here, this assignment
// overwrites the counter - verify.
11498 c=AL->seq_al[a][b];
\r
// Non-gap characters with a zero entry in alp are handled here.
11499 if (!is_gap(c) && !alp[c])
\r
// Scratch buffers for the ungapped rows of A and B.
11509 vfree (buf1); vfree(buf2);
\r
11510 buf1=vcalloc ( A->len_aln+1, sizeof (char));
\r
11511 buf2=vcalloc ( B->len_aln+1, sizeof (char));
\r
// r: nseq x (salp+1) x (salp+1) table of int counts, rebuilt on each call.
11513 free_arrayN ((void **)r, 3);
\r
11514 r=declare_arrayN(3, sizeof (int),A->nseq,salp+1,salp+1);
\r
// list: names of the sequences actually compared, 100 bytes each.
11515 free_char ( list, -1);
\r
11516 list=declare_char ( A->nseq, 100);
\r
11517 for (n=0,a=0; a< A->nseq; a++)
\r
11519 for ( b=0; b<B->nseq; b++)
\r
// Only sequences with identical names are compared.
11521 if ( strm (A->name[a], B->name[b]))
\r
11523 sprintf ( buf1, "%s", A->seq_al[a]);
\r
11524 sprintf ( buf2, "%s", B->seq_al[b]);
\r
11525 ungap (buf1); ungap (buf2);
\r
// Pairs whose ungapped lengths differ are skipped.
11526 if ((l=strlen (buf1))!=strlen (buf2))continue;
\r
// NOTE(review): unbounded copy into a 100-byte row; a name of 100 characters
// or more would overflow it.
11529 sprintf ( list[n], "%s", A->name[a]);
\r
11530 for (c=0; c<l; c++)
\r
// Count one co-occurrence of the symbol numbers of c1 and c2 (presumably
// c1 comes from buf1 and c2 from buf2 - their assignment is not visible).
11535 r[n][alp[c1]][alp[c2]]++;
\r
// Per-symbol evaluation: s runs over the symbol numbers 1..salp.
11545 for ( s=1; s<=salp; s++)
\r
// type is the token "_<symbol>_" that is searched for in mode below.
11548 sprintf (type, "_%c_", alp_lu[s]);
\r
// Totals over all compared sequences for the current symbol.
11549 ttp=ttn=tfp=tfn=0;
\r
11550 for (a=0; a<n; a++)
\r
11553 for (b=1; b<=salp; b++)
\r
11555 for (c=1; c<=salp; c++)
\r
// b is the first symbol index of r and c the second: both equal to s is a
// true positive, only b a false negative, only c a false positive, neither a
// true negative.
11557 if ( b==s && c==s) tp+=r[a][b][c];
\r
11558 else if ( b==s && c!=s)fn+=r[a][b][c];
\r
11559 else if ( b!=s && c==s)fp+=r[a][b][c];
\r
// NOTE(review): the condition repeats "b!=s"; "c!=s" was probably intended.
// The result is the same, because the preceding branch already took every
// case where b!=s and c==s.
11560 else if ( b!=s && b!=s)tn+=r[a][b][c];
\r
// Per-sequence rates, printed when mode contains "printstat".
11569 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
\r
11570 if ( mode && strstr (mode, "printstat"))fprintf ( stdout, ">%s S=%c sp=%6.2f sn=%6.2f sen2=%6.2f best=%6.2f\n", list[a],alp_lu[s],sp, sn, sen2, best);
\r
// Rates over the totals of all sequences for the current symbol.
11573 rates2sensitivity (ttp, ttn, tfp, tfn, &sp, &sn, &sen2, &best);
\r
11574 if (mode && strstr (mode, "printstat"))fprintf ( stdout, ">TOT S=%c sp=%6.2f sn=%6.2f re=%6.2f best=%6.2f\n", alp_lu[s],sp, sn, sen2, best);
\r
// Taken when mode contains the "_<symbol>_" token of the current symbol;
// the body is not visible here.
11576 if ( mode && strstr (mode, type))
\r
// mark_exon_boundaries: for every sequence of A that is also present by name
// in E, takes the row of E (which carries the boundary markers 'o', 'b' and
// 'j'), builds an uppercase copy without the markers, lowercases the residues
// next to each marker, and writes the result over the non-gap positions of
// the matching row of A. A is modified in place.
// NOTE(review): the computation of l, the update of c in the boundary loop,
// the release of buf/buf2 and the return are not visible in this excerpt.
11586 Alignment * mark_exon_boundaries (Alignment *A, Alignment *E)
\r
11588 char *buf, *buf2;
\r
11589 int a, b, c, i, l;
\r
// Both scratch buffers are sized for one full row of E plus a terminator.
11591 buf2=vcalloc ( E->len_aln+1, sizeof (char));
\r
11592 buf =vcalloc ( E->len_aln+1, sizeof (char));
\r
11594 for (a=0; a< A->nseq; a++)
\r
// Sequences of A that are absent from E are left unchanged.
11596 i=name_is_in_list (A->name[a], E->name, E->nseq, 100);
\r
11597 if ( i==-1) continue;
\r
11598 sprintf (buf, "%s", E->seq_al[i]);
\r
// buf2 = buf without the marker characters, uppercased.
11602 for (c=0, b=0; b<l; b++)if (buf[b]!='o' && buf[b]!='b' && buf[b]!='j')buf2[c++]=toupper(buf[b]);
\r
11605 //lowercase the boundaries of buf2;
\r
11606 for ( c=0,b=0; b<l; b++)
\r
11608 //ENSEMBL: o: 0, b:1 j:2
\r
// NOTE(review): "&&" binds tighter than "||", so the c>=1 guard applies only
// to the 'o' case; a 'b' marker met while c==0 would write to buf2[-1].
// Parentheses around the "||" were probably intended - verify.
11609 if (buf[b]=='b' || buf[b]=='o' && c>=1)buf2[c-1]=tolower(buf2[c-1]);
\r
// A 'j' marker lowercases the character at index c+1 of buf2.
11610 else if (buf[b]=='j' &&c<l)buf2[c+1]=tolower(buf2[c+1]);
\r
// Copy the annotated residues back onto the non-gap positions of A.
11614 for (c=0,b=0; b<A->len_aln; b++)
\r
11616 if (!is_gap(A->seq_al[a][b]))
\r
11618 A->seq_al[a][b]=buf2[c++];
\r
17509 /*********************************COPYRIGHT NOTICE**********************************/
17510 /*© Centro de Regulacio Genomica */
17512 /*Cedric Notredame */
17513 /*Tue Oct 27 10:12:26 WEST 2009. */
17514 /*All rights reserved.*/
17515 /*This file is part of T-COFFEE.*/
17517 /* T-COFFEE is free software; you can redistribute it and/or modify*/
17518 /* it under the terms of the GNU General Public License as published by*/
17519 /* the Free Software Foundation; either version 2 of the License, or*/
17520 /* (at your option) any later version.*/
17522 /* T-COFFEE is distributed in the hope that it will be useful,*/
17523 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
17524 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
17525 /* GNU General Public License for more details.*/
17527 /* You should have received a copy of the GNU General Public License*/
17528 /* along with Foobar; if not, write to the Free Software*/
17529 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
17530 /*............................................... |*/
17531 /* If you need some more information*/
17532 /* cedric.notredame@europe.com*/
17533 /*............................................... |*/
17537 /*********************************COPYRIGHT NOTICE**********************************/