7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 #include "dp_lib_header.h"
10 #include "define_header.h"
12 int aln_has_stockholm_structure (Alignment *A)
14 return name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100);
17 int get_aln_stockholm_structure (Alignment *A)
20 if ((i=aln_has_stockholm_structure(A))==-1)
21 A=add_alifold2aln (A, NULL);
22 return aln_has_stockholm_structure(A);
/* update_RNAfold_list: refresh slots 2 and 3 of the pair entries in l for row s of A.
 * l[a][0] and l[a][1] are used as column indices into A->seq_al[s]; when
 * neither column holds a gap, the matching pos[s][...] values are stored
 * in l[a][2] and l[a][3]. */
24 int ** update_RNAfold_list (Alignment *A, int **pos, int s, int **l)
/* only pairs whose two columns are both non-gap in row s are translated */
29 if (!is_gap(A->seq_al[s][l[a][0]]) && !is_gap (A->seq_al[s][l[a][1]]))
31 l[a][2]=pos[s][l[a][0]];
32 l[a][3]=pos[s][l[a][1]];
/* compare_RNA_fold: compare, sequence by sequence, the base pairs implied by
 * the "#=GC SS_cons" rows of A and B, and print overlap percentages to stdout.
 * Paired positions of A are overwritten in A->seq_al with '6' or '0'. */
43 Alignment *compare_RNA_fold ( Alignment *A, Alignment *B)
49 int tot_ol=0, tot_l=0;
/* i1, i2: row index of the structure line in each alignment */
51 i1=get_aln_stockholm_structure (A);
52 i2=get_aln_stockholm_structure (B);
/* convert each bracket string into a list of paired columns */
54 l1=vienna2list (A->seq_al[i1]);
55 l2=vienna2list (B->seq_al[i2]);
57 pos1=aln2pos_simple(A, A->nseq);
58 pos2=aln2pos_simple(B, B->nseq);
62 for (a=0; a< A->nseq; a++)
65 int ol=0, ll1=0, ll2=0;
/* skip rows whose name starts with '#' (the "#=GC SS_cons" row is one of them) */
66 if ( A->name[a][0]=='#')continue;
/* i: row of B that carries the same name as row a of A */
67 i=name_is_in_list (A->name[a], B->name, B->nseq, 100);
70 l1=update_RNAfold_list (A,pos1,a, l1);
71 l2=update_RNAfold_list (B,pos2,i, l2);
/* lu is a lookup table indexed by the values stored in slots 2 and 3 of a pair */
72 lu=declare_char (A->len_aln, B->len_aln);
/* NOTE(review): empty statement after the test; presumably an else branch follows on a line not shown here -- confirm */
78 if (l2[b][2]==-1 || l2[b][3]==-1);
/* mark the pair of B as present */
82 lu[l2[b][2]][l2[b][3]]=1;
92 if (l1[b][2]==-1 || l1[b][3]==-1);
/* pair of A also marked for B: flag both columns with '6' */
96 if (lu[l1[b][2]][l1[b][3]]==1)
98 A->seq_al[a][l1[b][0]]='6';
99 A->seq_al[a][l1[b][1]]='6';
/* presumably the alternative branch: both columns flagged with '0' */
104 A->seq_al[a][l1[b][0]]='0';
105 A->seq_al[a][l1[b][1]]='0';
/* NOTE(review): divides by (ll1+ll2), ll1 and ll2; no zero guard is visible here */
116 fprintf ( stdout, "@@ Seq: %s Overalp: %.2f Al1: %.2f Al2: %.2f \n", A->name[a], (float)(ol*200)/(ll1+ll2), (float)(ol*100)/ll1,(float)(ol*100)/ll2);
/* NOTE(review): divides by tot_l; no zero guard is visible here */
119 fprintf ( stdout, "@@ Seq: Tot Overalp: %.2f \n", (float)(tot_ol*200)/(tot_l));
/* Forward declarations of the base-pair predicates defined below. */
123 int is_neutral(char c1, char c2);
124 int is_watson (char c1, char c2);
125 int is_watson2 (char c1, char c2);
/* is_watson: order-independent pair test.
 * Returns 1 when is_watson2 accepts (c1,c2); otherwise returns the result of
 * is_watson2 on the swapped pair (c2,c1). */
126 int is_watson (char c1, char c2)
130 if ( is_watson2 (c1, c2)) return 1;
131 else return is_watson2 (c2, c1);
/* is_watson2: ordered pair test.
 * Returns 1 for c1=='g' with c2=='c', and for c1=='a' with c2=='t' or 'u'.
 * The comparisons use lower-case literals. */
133 int is_watson2 (char c1, char c2)
136 if ( c1=='g' && c2=='c')return 1;
137 else if (c1=='a' && (c2=='t' || c2=='u'))return 1;
/* is_neutral: returns 1 when the pair passes is_watson, or when it is
 * g with t/u in either order. */
140 int is_neutral (char c1, char c2)
145 if (is_watson (c1, c2)) return 1;
/* g paired with t or u, first orientation */
146 else if (c1=='g' && (c2=='t' || c2=='u'))return 1;
/* same pair, reversed orientation */
147 else if ((c1=='t' || c1=='u') && c2=='g')return 1;
/* vienna2list: build a list of entries from a bracket string seq.
 * The list is allocated with l+1 rows of 8 ints each. */
151 int ** vienna2list ( char *seq)
156 list=declare_int (l+1, 8);
/* a walks the string; i counts entries (presumably the next free row -- confirm) */
157 for (i=0,a=0; a<l; a++)
/* scan forward from a+1, tracking nesting depth in i2, until it drops below 0 */
162 for (i2=0,b=a+1; b<l && i2>=0; b++)
164 if (seq[b]=='(')i2++;
165 else if (seq[b]==')')i2--;
/* aln2alifold: write A with output_clustal_aln to tmp1, run the external
 * RNAalifold command on it with stdout redirected to tmp2 and stderr
 * discarded, then return what alifold2aln builds from tmp2. */
175 Alignment *aln2alifold(Alignment *A)
183 output_clustal_aln (tmp1,A);
/* NOTE(review): the success of the external command is not checked on the visible lines */
184 printf_system ("RNAalifold %s >%s 2>/dev/null", tmp1, tmp2);
185 return alifold2aln (tmp2);
/* add_alifold2aln: store the string ST->seq_al[1] in A under the name
 * "#=GC SS_cons", overwriting an existing entry or appending a new row.
 * NOTE(review): ST is dereferenced below, yet get_aln_stockholm_structure
 * passes NULL; presumably ST is filled in on lines not shown here -- confirm. */
188 Alignment *add_alifold2aln (Alignment *A, Alignment *ST)
191 int r1, rr1, r2, rr2;
192 int watson, comp,tot;
197 int ncomp=0, nwatson=0;
/* T: working copy of A */
206 T=copy_aln (A, NULL);
210 for (a=0; a<A->len_aln; a++)
/* f becomes 1 as soon as one row holds a gap in column a */
212 for (f=0,b=0; b<A->nseq && f==0; b++)
214 if (is_gap (A->seq_al[b][a]))f=1;
/* blank column a in every row of T (the condition guarding this is not visible here) */
220 for (b=0; b<A->nseq; b++)T->seq_al[b][a]='-';
225 //add or Replace the structure
226 l=strlen (ST->seq_al[1]);
/* replace every STOCKHOLM_CHAR by '.' in the structure string */
227 for (a=0; a< l; a++)if (ST->seq_al[1][a]==STOCKHOLM_CHAR)ST->seq_al[1][a]='.';
228 if ((i=name_is_in_list ("#=GC SS_cons", A->name, A->nseq, 100))!=-1)
/* entry already present: overwrite its string */
230 sprintf (A->seq_al[i], "%s", ST->seq_al[1]);
/* otherwise make room for one more row and fill in its name and string */
234 A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
235 sprintf (A->name[A->nseq], "#=GC SS_cons");
236 sprintf (A->seq_al[A->nseq], "%s", ST->seq_al[1]);
/* alifold2analyze: analyse the "#=GC SS_cons" structure of A and dispatch on
 * the substrings found in mode ("usegap", "stat", "list", "aln", "color", "ps"). */
241 Alignment * alifold2analyze (Alignment *A, Alignment *ST, char *mode)
/* s: row index of the structure line */
247 s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100);
/* add the structure from ST and look it up again (guarding condition not visible here) */
251 A=add_alifold2aln (A,ST);
252 s=name_is_in_list ("#=GC SS_cons", A->name,A->nseq, 100);
/* bracket string -> list of pairs, then annotated by alifold_list2cov_list */
255 list=vienna2list (A->seq_al[s]);
256 list=alifold_list2cov_list (A, list);
258 usegap=0; //do not use gaped positions by default
259 if (mode && strstr (mode, "usegap"))usegap=1;//count positions with gaps
263 A=alifold2cov_stat (A, list,usegap);
/* NOTE(review): the strstr calls below receive mode unchecked; a NULL guard, if any, is on lines not shown here */
267 if ( strstr (mode, "stat")) A=alifold2cov_stat (A, list, usegap);
268 if ( strstr (mode, "list")) A=alifold2cov_list (A, list, usegap);
269 if ( strstr (mode, "aln")) A=alifold2cov_aln (A, list, usegap);
270 if ( strstr (mode, "color") )
/* C: copy of A turned into a per-residue digit cache used for colouring */
273 C=copy_aln (A, NULL);
274 C=alifold2cov_cache (C, list, usegap);
275 A=alifold2cov_aln (A, list, usegap);
276 if ( strstr ( mode, "ps"))
277 output_color_ps (A, C, "stdout");
279 output_color_html (A, C, "stdout");
/* the program exits once the coloured output has been written */
280 myexit (EXIT_SUCCESS);
/* alifold_list2cov_list: for every pair entry in list, compare the residues
 * found at the two paired columns across all pairs of sequences of A, and
 * store percentages plus a category character in the entry.
 * Category written to list[a][7]: 'I', 'N', 'W', 'C' or 'c' (see comments below). */
287 int ** alifold_list2cov_list (Alignment *A, int **list)
290 int r1, rr1, r2, rr2;
291 int neutral,watson, comp,tot, occupancy;
296 int ncomp=0, nwatson=0, nneutral=0, ncomp_wc=0;
/* nseq: number of rows whose name does not start with '#' */
301 for (nseq=0,a=0; a< A->nseq; a++)if ( A->name[a][0]!='#')nseq++;
/* max: number of distinct sequence pairs */
302 max=((nseq*(nseq-1))/2);
313 for (c=0; c<A->nseq-1; c++)
315 if (A->name[c][0]=='#')continue;
316 r1=tolower(A->seq_al[c][p1]);
317 r2=tolower(A->seq_al[c][p2]);
/* a sequence with a gap at either paired column is ignored */
318 if (is_gap(r1) || is_gap(r2))continue;
319 for (d=c+1; d<A->nseq; d++)
321 if (A->name[d][0]=='#')continue;
322 rr1=tolower(A->seq_al[d][p1]);
323 rr2=tolower(A->seq_al[d][p2]);
324 if (is_gap(rr1) || is_gap(rr2))continue;
/* both sequences of the pair contribute to the watson and neutral tallies */
325 if (is_watson (r1, r2))watson++;
326 if (is_watson (rr1, rr2))watson++;
327 if (is_neutral (r1, r2))neutral++;
328 if (is_neutral (rr1, rr2))neutral++;
/* counted when both residues differ between the two sequences */
329 if (r1!=rr1 && r2!=rr2)comp++;
/* convert tallies to percentages -- NOTE(review): divides by occupancy and by max; no zero guard is visible here */
338 watson=(watson*100)/(occupancy*2);
339 comp=(comp*100)/occupancy;
340 neutral=(neutral*100)/(occupancy*2);
341 occupancy=(occupancy*100)/max;
345 list[a][6]=occupancy;
347 if (list[a][3]<100)list[a][7]='I';//incompatible pair
350 list[a][7]='N';//Neutral pair
353 list[a][7]='W';//Watson and Crick
354 if ( list[a][5]>0)list[a][7]='C'; //Watson and crick compensated
356 else if ( list[a][5]>0)
358 list[a][7]='c';//compensated
/* alifold2cov_aln: work on a copy of inA grown by one row named
 * "#=GC SS_analyze"; the new row starts as a copy of the previous last row
 * and receives a character s at both columns of each retained pair. */
366 Alignment *alifold2cov_aln (Alignment *inA,int **list, int ug)
372 A=copy_aln (inA, NULL);
373 A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
374 sprintf (A->name[A->nseq], "#=GC SS_analyze");
375 sprintf (A->seq_al[A->nseq], "%s", A->seq_al[A->nseq-1]);
/* pairs with list[a][6] below 100 are singled out unless ug is set; empty statement, else branch presumably follows */
380 if (list[a][6]<100 && !ug);
384 A->seq_al[A->nseq-1][list[a][0]]=s;
385 A->seq_al[A->nseq-1][list[a][1]]=s;
/* alifold2cov_stat: tally the pair categories found in list and print one
 * summary line to stdout. */
391 Alignment *alifold2cov_stat (Alignment *A,int **list, int ug)
393 int fold=0,watson=0, comp=0, compwc=0, incomp=0, neutral=0;
/* pairs with list[a][6] below 100 are singled out unless ug is set; empty statement, else branch presumably follows */
401 if (list[a][6]<100 && !ug);
/* s is the category character of the current pair */
405 watson +=(s=='W')?1:0;
406 compwc +=(s=='C')?1:0;
408 neutral+=(s=='N')?1:0;
409 incomp +=(s=='I')?1:0;
/* the sequence count is printed as A->nseq-1 */
413 fprintf ( stdout, "@@ TOT Nseq:%d tot_len: %d fold: %d neutral: %d watson: %d CorWC: %d cor: %d Incompatible: %d\n",A->nseq-1, A->len_aln,fold, neutral,watson, compwc,comp,incomp);
/* alifold2cov_cache: on a copy of inA, write a digit character at the paired
 * columns; the digit depends on the category character s of the pair. */
416 Alignment *alifold2cov_cache (Alignment *inA, int **list, int ug)
421 A=copy_aln (inA, NULL);
/* pairs with list[a][6] below 100 are singled out unless ug is set; empty statement, else branch presumably follows */
426 if (list[a][6]<100 && !ug);
/* category -> digit value v */
430 if (s=='C')v=9; //red
431 else if ( s=='c')v=7; //orange
432 else if ( s=='W')v=5; //Yellow
433 else if ( s=='N')v=2; //green
434 else if ( s=='I')v=0; //blue;
435 for (b=0;b<A->nseq; b++)
/* rows named with a leading '#' are singled out; empty statement, else branch presumably follows */
437 if (A->name[b][0]=='#');
442 A->seq_al[b][list[a][c]]='0'+v;
/* alifold2cov_list: print each pair of list to stdout with a label for its
 * category, the two column numbers (1-based: index+1) and, for every row not
 * named with a leading '#', the upper-cased residues at the two columns. */
452 Alignment *alifold2cov_list (Alignment *A,int **list, int ug)
/* pairs with list[a][6] below 100 are singled out unless ug is set; empty statement, else branch presumably follows */
460 if (list[a][6]<100 && !ug);
463 fprintf ( stdout, "@@ WC Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
464 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
465 fprintf (stdout,"\n");
469 fprintf ( stdout, "@@ Neural Compensated pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
470 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
471 fprintf (stdout,"\n");
475 fprintf ( stdout, "@@ WC pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
476 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
477 fprintf (stdout,"\n");
481 fprintf ( stdout, "@@ Neutral pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
482 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
483 fprintf (stdout,"\n");
487 fprintf ( stdout, "@@ incompatible pair: %4d %4d =>", list[a][0]+1, list [a][1]+1);
488 for (b=0; b<A->nseq; b++)if (A->name[b][0]!='#')fprintf ( stdout, "[%c%c]", toupper (A->seq_al[b][list[a][0]]), toupper(A->seq_al[b][list[a][1]]));
489 fprintf (stdout,"\n");
/* aln2sample: overwrite the first n columns of A with columns of a copy of A,
 * taken in the order produced by sorting random keys, then truncate every
 * row at n. n==0 selects the full length; larger values are capped at len_aln. */
498 Alignment *aln2sample (Alignment *A, int n)
504 B=copy_aln (A, NULL);
/* pos[a][0]: column index, pos[a][1]: random key */
508 pos=declare_int (A->len_aln, 2);
509 for (a=0; a<A->len_aln; a++){pos[a][0]=a;pos[a][1]=rand()%(1000*A->len_aln);}
/* presumably sorts the rows of pos on field 1, i.e. the random key -- confirm against sort_int */
511 sort_int (pos, 2, 1, 0, A->len_aln-1);
513 n=(n==0)?A->len_aln:(MIN (n, (A->len_aln)));
515 for (b=0; b<A->nseq; b++)
516 A->seq_al[b][a]=B->seq_al[b][pos[a][0]];
/* terminate every row after n columns */
517 for (b=0; b<A->nseq; b++)
518 A->seq_al[b][n]='\0';
/* aln2bootstrap: fill columns of A with columns drawn at random (rand) from a
 * copy of A, then terminate every row at n. n==0 means "use len_aln". */
525 Alignment *aln2bootstrap (Alignment *A, int n)
530 if (n==0)n=A->len_aln;
/* a non-zero n triggers a reallocation of A for n+1 columns */
531 else A=realloc_aln (A, n+1);
533 B=copy_aln (A, NULL);
/* p: random source column; NOTE(review): modulo by A->len_aln, no zero guard visible */
536 p=rand ()%A->len_aln;
537 for (b=0; b<A->nseq; b++)
538 A->seq_al[b][a]=B->seq_al[b][p];
540 for ( b=0; b<A->nseq; b++)A->seq_al[b][n]='\0';
/* aln2random_aln: randomise A according to the letters present in smode.
 *   "S": call aln2scramble_seq
 *   "C": shuffle the non-gap residues inside each column
 *   "R": shuffle the non-gap residues across the whole alignment
 * A mode equal to "NO" returns A unchanged. */
549 Alignment * aln2random_aln (Alignment *A, char *smode)
/* default mode "SCR" (the condition selecting the default is not visible here) */
559 smode=vcalloc (4, sizeof (char));
560 sprintf ( smode, "SCR");//Sequences, Column Residues
562 else if ( strm (smode, "NO"))return A;
567 if ( strstr ( smode, "S"))
569 A=aln2scramble_seq (A);
571 if ( strstr ( smode, "C"))
/* res[n][0]: residue, res[n][1]: random key */
574 res=declare_int (A->nseq, 2);
575 for (a=0; a< A->len_aln; a++)
/* collect the non-gap residues of column a */
577 for (n=0,b=0;b<A->nseq; b++)
579 if ( !is_gap(A->seq_al[b][a]))
581 res[n][0]=A->seq_al[b][a];
/* NOTE(review): the value of max used here is set on a line not shown in this branch */
582 res[n][1]=rand()%max;
585 sort_int (res, 2, 1, 0, n-1);
/* write the reordered residues back into the non-gap cells of column a */
587 for (n=0,b=0;b<A->nseq; b++)
589 if ( !is_gap(A->seq_al[b][a]))A->seq_al[b][a]=res[n++][0];
596 //Redistributes the residues randomly without changing the gap pattern
597 if ( strstr ( smode, "R"))
599 max=A->len_aln*A->nseq;
600 res=declare_int (max, 2);
/* collect every non-gap residue of the alignment, column by column */
602 for (n=0,a=0; a< A->len_aln; a++)
604 for (b=0;b<A->nseq; b++)
606 if ( !is_gap(A->seq_al[b][a]))
608 res[n][0]=A->seq_al[b][a];
609 res[n][1]=rand()%max;
615 sort_int (res, 2, 1, 0, n-1);
/* write them back in the same traversal order, leaving gaps where they were */
616 for (n=0,a=0; a< A->len_aln; a++)
618 for (b=0;b<A->nseq; b++)
620 if ( !is_gap(A->seq_al[b][a]))
622 A->seq_al[b][a]=res[n++][0];
/* score_aln2score_ascii_aln: rewrite the cells of C in place as printable
 * characters, using A as the residue reference. Rows of A and C must carry
 * the same names in the same order, otherwise the program exits. */
633 Alignment *score_aln2score_ascii_aln (Alignment *A, Alignment *C)
635 //Convert the output of T-Coffee evaluate into a printable score_ascii alignment*/
636 //A and C must be sorted
637 //sets to 0 lone residues
640 for (a=0; a<A->nseq; a++)
641 for (b=0; b<A->len_aln; b++)
644 int rC=C->seq_al[a][b];
645 int rA=A->seq_al[a][b];
/* name check, repeated for every cell of the row */
646 if ( !strm (A->name[a], C->name[a])){HERE ("Unsorted aln in score_aln2score_ascii"); myexit (EXIT_FAILURE);}
/* residue x/X in A forces '9' */
648 if ( rA=='x' || rA=='X')C->seq_al[a][b]='9';
/* already a digit character: left untouched */
649 else if ( rC >='0' && rC<='9');
/* raw value below 10: converted to the matching digit character */
650 else if ( rC<10)C->seq_al[a][b]='0'+rC;
/* uncoloured cell: '0' over a residue, '-' over a gap */
651 else if ( rC==NO_COLOR_RESIDUE && !is_gap(rA)) C->seq_al[a][b]='0';
652 else if ( rC==NO_COLOR_RESIDUE && is_gap(rA))C->seq_al[a][b]='-';
/* aln2gap_cache: on a copy B of A, mark every non-gap cell with '0'+val when
 * it is the only non-gap cell of its column, and with '1' otherwise. */
656 Alignment*aln2gap_cache (Alignment *A, int val)
661 B=copy_aln (A, NULL);
662 for (b=0; b<A->len_aln; b++)
/* nr: number of non-gap cells in column b */
664 for (nr=0,a=0; a<A->nseq; a++)nr+=!is_gap (A->seq_al[a][b]);
665 for (a=0; a<A->nseq; a++)if (!is_gap(A->seq_al[a][b]))B->seq_al[a][b]=(nr==1)?'0'+val:'1';
/* aln2case_aln: on a copy of B, replace cells by a marker reflecting their
 * case: the first character of upper (default 'u') for upper-case cells, the
 * first character of lower (default 'l') otherwise. */
670 Alignment* aln2case_aln (Alignment *B, char *upper, char *lower)
675 A=copy_aln (B, NULL);
677 up=(upper)?upper[0]:'u';
678 lo=(lower)?lower[0]:'l';
680 for (a=0; a<A->nseq; a++)
681 for (b=0; b<A->len_aln; b++)
/* c is the current cell; the branch preceding this else is not visible here */
686 else A->seq_al[a][b]=(isupper (c))?up:lo;
/* aln2scale: grow A by n extra rows and fill them with characters derived
 * from the decimal representation of each column number plus offset.
 * offset comes from atoi(coffset) when coffset is given. */
690 Alignment *aln2scale (Alignment *A, char *coffset)
697 if (coffset)offset=atoi(coffset);
/* s receives the largest number to print (len_aln+offset) */
700 sprintf (s, "%d", A->len_aln+offset);
703 A=realloc_aln2 (A, A->nseq+n, A->len_aln+1);
704 s1=vcalloc ( n+1, sizeof (char));
705 s2=vcalloc ( n+1, sizeof (char));
/* the name of each added row is built in s2 */
710 else strcat (s2, "0");
711 sprintf (A->name[A->nseq+a], "%s", s2);
714 for (a=0; a<A->len_aln; a++)
716 sprintf (s1, "%d", a+1+offset);
/* NOTE(review): s2 is reassigned here; whether the buffer allocated above is released is not visible */
717 s2=invert_string (s1);
725 A->seq_al[A->nseq+b][a]=v;
/* pos2list: return a newly allocated array holding the indices a in [0,len)
 * for which pos[a] is non-zero; nl[0] is advanced once per stored index.
 * NOTE(review): the initialisation of nl[0] is not visible here -- confirm. */
736 int * pos2list (int * pos, int len, int *nl)
741 list=vcalloc (len, sizeof (int));
742 for (a=0; a<len; a++)if (pos[a])list[nl[0]++]=a;
/* list2pos: inverse of pos2list.
 * Allocates an array of len ints and sets to 1 every slot whose index is
 * listed in the first nl entries of list. The caller owns the result. */
int *list2pos (int *list, int nl, int len)
{
  int *flags;
  int k;

  flags = vcalloc (len, sizeof (int));
  k = 0;
  while (k < nl)
    {
      flags[list[k]] = 1;
      k++;
    }
  return flags;
}
/* aln2resindex: write to fp, for the selected rows of A, the name of each row
 * and, column by column, the value found in the position table of A. */
753 int **aln2resindex ( Alignment *A, Alignment *B, FILE *fp)
/* room for the rows of A plus, when B is given, the rows of B */
759 list=vcalloc (A->nseq+((B)?B->nseq:0), sizeof (int));
760 pos=aln2pos_simple_2 (A);
/* with B: list[a] is the row of A named like row a of B */
764 for ( a=0; a<B->nseq; a++)
766 list[a]=name_is_in_list(B->name[a], A->name, A->nseq, 100);
771 for ( a=0; a<A->nseq; a++)
781 if ( s!=-1)fprintf (fp, " %s",A->name[s]);
785 for ( a=0; a<A->len_aln; a++)
/* negative table entries are printed as -1 */
791 else if (pos[s][a]<0)
792 fprintf (fp, "%4d", -1);
794 fprintf (fp, "%4d", pos[s][a]);
/* index_seq_res: build one int array per sequence of S1, using name_index to
 * pick the matching string in S2 (a plain sequence or a row of a profile). */
801 int **index_seq_res ( Sequence *S1, Sequence *S2, int **name_index)
803 /*Index the residues of S1 according to S2
804 index[seq1 of S1][z]->x, where x is the position of residue z of seq1/S1 in S2->seq[index[Seq1/S1]]
808 char *seq1=NULL, *seq2=NULL;
811 index=vcalloc ( S1->nseq, sizeof (int*));
813 for (a=0; a< S1->nseq; a++)
815 int len1, len2, b, c;
819 if (name_index[a][0]==-1)
821 else if (name_index[a][1]==-1)
823 seq2=S2->seq[name_index[a][0]];
825 else if ((Profile=seq2R_template_profile (S2, name_index[a][0])) !=NULL)
827 seq2=Profile->seq_al[name_index[a][1]];
830 len1=(seq1)?strlen (seq1):0;
831 len2=(seq2)?strlen (seq2):0;
832 index[a]=vcalloc (len2, sizeof(int));
835 for (c=0,b=0; b<len2; b++)if( !is_gap(seq2[b]))index[a][c++]=b;
836 //index[a]=get_res_index ( seq1, seq2);
/* index_seq_name: build an S1->nseq x 2 table locating each name of S1 in S2,
 * either directly or inside one of the template profiles of S2. */
841 int **index_seq_name ( Sequence *S1, Sequence *S2)
843 /*Index the names of S1 according to S2
844 index[seq1 of S1][0]->x if seq1 is the xth sequence of S2
845 ->-1 if seq1 is nowhere to be found
846 index[seq1 of S1][1]->z if seq1 is the zth sequence within the xth profile of S2
851 index=declare_int (S1->nseq, 2);
854 for ( a=0; a<S1->nseq; a++)
856 index[a][0]=index[a][1]=-1;
857 x=name_is_in_list (S1->name[a],S2->name,S2->nseq,100);
858 if ( x!=-1){index[a][0]=x;index[a][1]=-1;}
859 for ( b=0; b<S2->nseq; b++)
861 if ((Profile=seq2R_template_profile (S2,b)))
863 z=name_is_in_list (S1->name[a],Profile->name,Profile->nseq,100);
864 if ( z!=-1){index[a][0]=b;index[a][1]=z;b=S2->nseq;}
/* get_name_index: map every name of l1 to its index in l2.
 * Returns a newly allocated array r of n1 ints with
 * r[i] = name_is_in_list (l1[i], l2, n2, 100). The caller owns the result. */
int *get_name_index (char **l1, int n1, char **l2, int n2)
{
  int *r;
  int i;

  /*return Array[Index_L1]=Index_L2 */
  r = vcalloc (n1, sizeof (int));
  i = 0;
  while (i < n1)
    {
      r[i] = name_is_in_list (l1[i], l2, n2, 100);
      i++;
    }
  return r;
}
/* get_res_index: return an array with one slot per character of seq0 (plus
 * one), filled from a pairwise alignment of seq0 and seq1 when the two strings
 * differ. Returns NULL when either input is NULL. */
885 int* get_res_index (char *seq0, char *seq1)
889 if ( !seq0 || !seq1) return NULL;
892 coor=vcalloc ( strlen (seq0)+1, sizeof (int));
/* the strings are aligned only when strm reports them as different */
893 if (!strm (seq0, seq1))
895 int r0, r1 , isr0, isr1;
898 A=align_two_sequences (seq0,seq1,"pam250mt",-5,-1, "myers_miller_pair_wise");
900 for ( a=0; a< A->len_aln; a++)
902 r0=A->seq_al[0][a];r1=A->seq_al[1][a];
/* l0, l1: running counters maintained on lines not shown here; a residue of seq0 with no partner gets -1 */
907 if (isr0 && isr1)coor[l0-1]=l1-1;
908 else if (isr0) coor[l0-1]=-1;
917 for ( a=0;a< l0; a++)
/* change_residue_coordinate: translate a coordinate v between two sequences.
 * The input pointers are remembered in static variables; the pairwise
 * alignment is recomputed only when either pointer differs from the
 * remembered one. */
924 int change_residue_coordinate ( char *in_seq1, char *in_seq2, int v)
926 /*Expresses the coordinate of a residue in seq1, in the coordinate system of seq2*/
929 static char *seq1, *seq2;
/* pointer comparison, not string comparison */
933 if ( seq1 !=in_seq1 || seq2 !=in_seq2)
935 int r0, r1 , isr0, isr1;
941 seq1=in_seq1, seq2=in_seq2;
942 A=align_two_sequences (seq1,seq2,"pam250mt", -14, -2, "myers_miller_pair_wise");
944 coor=vcalloc ( A->len_aln, sizeof (int));
945 for ( a=0; a< A->len_aln; a++)
947 r0=A->seq_al[0][a];r1=A->seq_al[1][a];
/* l0, l1: running counters maintained on lines not shown here; a residue with no partner gets -1 */
954 if (isr0 && isr1)coor[l0-1]=l1-1;
955 else if (isr0) coor[l0-1]=-1;
/* minimise_repeat_coor: build a new nseq x 3 table whose first two fields are
 * copied from coor; min is obtained from return_min_int (coor, nseq, 2). */
963 int ** minimise_repeat_coor (int **coor, int nseq, Sequence *S)
967 new_coor=declare_int ( nseq, 3);
968 min=return_min_int (coor, nseq, 2);
969 for ( a=0; a< nseq; a++)
971 new_coor[a][0]=coor[a][0];
972 new_coor[a][1]=coor[a][1];
/* get_nol_seq: build a new (nseq+1) x 3 table from coor, using a temporary
 * table buf obtained from get_undefined_list (CL) in which the start cell of
 * every entry of coor is set to 1. buf is released before returning. */
977 int ** get_nol_seq ( Constraint_list *CL, int **coor, int nseq, Sequence *S)
983 new_coor=declare_int ( nseq+1, 3);
986 buf=get_undefined_list ( CL);
990 for ( a=0; a< nseq; a++)buf[coor[a][0]][coor[a][1]]=1;
993 for ( a=0; a< nseq; a++)
/* count cells (nl) while p stays within l and buf[s][p] is zero */
999 while ( p<=l && !buf[s][p++])nl++;
1001 new_coor[a][1]=coor[a][1];
1004 free_int ( buf, -1);
/* compare_pos_column: compare column p1 of pos1 with column p2 of pos2 over
 * nseq rows; returns 0 as soon as the per-row values v1 and v2 differ. */
1010 int compare_pos_column( int **pos1,int p1, int **pos2,int p2, int nseq)
1015 for ( a=0; a< nseq; a++)
1023 if ( v1!=v2)return 0;
1031 char *seq2alphabet (Sequence *S)
1033 return array2alphabet (S->seq, S->nseq, "");
1036 char *aln2alphabet (Alignment *A)
1038 return array2alphabet (A->seq_al, A->nseq, "");
/* array2alphabet: collect, in increasing character-code order, every
 * lower-cased character that occurs in the n strings of array and does not
 * occur in forbiden. The result is built in a 257-byte buffer. */
1041 char *array2alphabet (char **array, int n, char *forbiden)
/* hasch: occurrence counter per character code */
1047 hasch=vcalloc (256, sizeof (int));
1048 alphabet=vcalloc ( 257, sizeof (char));
1051 for ( a=0; a<n; a++)
1053 l=strlen (array[a]);
1054 for ( b=0; b<l; b++)
/* NOTE(review): array[a][b] is a plain char; where char is signed, bytes above 127 give a negative argument to tolower and a negative index -- consider casting to unsigned char */
1055 hasch[tolower(array[a][b])]++;
1058 for ( a=0, b=0; a< 256; a++)
1060 if (hasch[a] && !strrchr(forbiden,a))alphabet[b++]=a;
1069 //***************************************************************
1072 //***************************************************************
/* alnpos2hmmtop_pred: summarise, for column pos, the per-sequence prediction
 * characters found in Pred (or in a cached prediction computed from A when
 * Pred is NULL). The text is written into a static buffer that is reused
 * across calls; "" is returned when no character was counted. */
1074 char* alnpos2hmmtop_pred (Alignment *A,Alignment *Pred, int pos, int mode)
1076 static char *result;
1077 static Alignment *Cache;
/* NOTE(review): whether these allocations happen once or on every call depends on a guard not visible here */
1083 score=vcalloc (256, sizeof (int));
1084 result=vcalloc (100, sizeof (char));
/* the prediction is computed once and kept in Cache */
1087 if (!Pred && !Cache)
1089 Cache=aln2hmmtop_pred (A);
1091 if (!Pred) Pred=Cache;
1094 for (tot=0,a=0; a<A->nseq; a++)
1097 s=Pred->seq_al[a][pos];
1100 score[tolower(s)]++;
/* consensus: 'h' when strictly most frequent, else 'i' when more frequent than 'o' */
1105 if ( score['h']>score['i'] && score['h']>score['o'])cons='h';
1107 else if ( score['i']>score['o'])cons='i';
1109 if (tot==0) return "";
1112 if (mode==VERBOSE)sprintf (result, " H: %3d I: %3d O: %3d P: %c", (score['h']*100)/tot, (score['i']*100)/tot, (score['o']*100)/tot, cons);
1113 else if (mode == SHORT)sprintf ( result, "%c", cons);
/* reset the three counters that were read */
1114 score['h']=score['o']=score['i']=0;
/* aln2hmmtop_pred: on a copy PA of A, replace every non-gap cell of each row
 * with the next character of the string returned by seq2tmstruc for that row. */
1119 Alignment * aln2hmmtop_pred (Alignment *A)
1125 PA=copy_aln (A, NULL);
1126 buf=vcalloc ( A->len_aln+1, sizeof (char));
1128 for ( a=0; a< A->nseq; a++)
/* buf receives a copy of the aligned row */
1130 sprintf (buf, "%s", A->seq_al[a]);
1131 pred=seq2tmstruc (buf);
/* c walks pred, b walks the aligned row; gaps are left in place */
1132 for (c=0,b=0; b<A->len_aln; b++)
1134 if (!is_gap (PA->seq_al[a][b]))PA->seq_al[a][b]=pred[c++];
/* seq2tmstruc: write seq as a one-entry FASTA file, run the external script
 * fasta_seq2hmmtop_fasta.pl on it, read the script output back as FASTA and
 * return a newly allocated copy of its first sequence. */
1142 char * seq2tmstruc ( char *seq)
1145 char *seqfile, *predfile, *buf;
1148 seqfile=vtmpnam (NULL);
1149 predfile=vtmpnam (NULL);
1151 fp=vfopen (seqfile, "w");
1152 fprintf ( fp, ">seq1\n%s", seq);
/* the -arch and -psv files are looked up under get_mcoffee_4_tcoffee() */
1156 printf_system ( "fasta_seq2hmmtop_fasta.pl -in=%s -out=%s -arch=%s/%s -psv=%s/%s", seqfile, predfile, get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv");
1157 S=get_fasta_sequence (predfile, NULL);
1158 buf=vcalloc ( strlen (S->seq[0])+1, sizeof (char));
1159 sprintf ( buf, "%s", S->seq[0]);
1161 free_sequence (S, S->nseq);
/* set_blast_default_values: register the BLAST-related variables.
 * The three string variables take their value from the matching
 * *_4_TCOFFEE environment variable when it is set, and from a built-in
 * default otherwise; the min/max similarity and coverage bounds are set to
 * 0 and 100. */
1166 char * set_blast_default_values()
1168 set_string_variable ("blast_server", (getenv ("blast_server_4_TCOFFEE"))?getenv ("blast_server_4_TCOFFEE"):"EBI");
1169 set_string_variable ("pdb_db", (getenv ("pdb_db_4_TCOFFEE"))?getenv ("pdb_db_4_TCOFFEE"):"pdb");
1170 set_string_variable ("prot_db", (getenv ("prot_db_4_TCOFFEE"))?getenv ("prot_db_4_TCOFFEE"):"uniprot");
1171 set_int_variable ("prot_min_sim", 0);
1172 set_int_variable ("prot_max_sim", 100);
1174 set_int_variable ("prot_min_cov", 0);
1175 set_int_variable ("prot_max_cov", 100);
1177 set_int_variable ("pdb_min_sim", 0);
1178 set_int_variable ("pdb_max_sim", 100);
1179 set_int_variable ("pdb_min_cov", 0);
1180 set_int_variable ("pdb_max_cov", 100);
1185 char * seq2pdb (Sequence *S)
1187 set_blast_default_values();
1189 S=seq2template_seq (S, "PDB", NULL);
1190 return seq2P_pdb_id(S,0);
/* seq2blast: fetch "BLAST" template profiles for the sequences of S.
 * Two paths are visible: one handles sequence 0 of S directly; the other
 * loops over all sequences, writes each profile to "<name>.prf" in FASTA
 * format, reports the file on stdout and exits the program. The condition
 * choosing between them is not visible here. */
1193 Alignment * seq2blast ( Sequence *S)
1196 set_blast_default_values();
1200 S=seq2template_seq (S, "BLAST", NULL);
1201 A=seq2R_template_profile(S,0);
/* the first row of the profile is renamed after the query */
1202 sprintf ( A->name[0], "%s", S->name[0]);
1207 for (a=0; a< S->nseq; a++)
/* NS: one-sequence container for sequence a */
1211 NS=fill_sequence_struc(1, &(S->seq[a]), &(S->name[a]));
1212 NS=seq2template_seq (NS, "BLAST", NULL);
1213 A=seq2R_template_profile(NS,0);
1214 sprintf ( name, "%s.prf", S->name[a]);
1216 output_fasta_aln (name,A);
1217 fprintf (stdout, "\nOUTPUT %s\n", name);
1219 myexit (EXIT_SUCCESS);
1227 Sequence * seq2unique_name_seq ( Sequence *S)
1230 if ((a=name_list2unique_name_list (S->nseq, S->name)))
1232 add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]);
1236 Alignment * aln2unique_name_aln ( Alignment *S)
1239 if ((a=name_list2unique_name_list (S->nseq, S->name)))
1241 add_warning ( stderr, "\nWarning: Sequence %s is duplicated in file %s. The sequence will be renamed", S->name[a-1], S->file[a-1]);
/* name_list2unique_name_list: detect a duplicated name among the n entries
 * and, in that case, rewrite the whole list through the external script
 * fasta_aln2fasta_aln_unique_name.pl. duplicate holds a+1 for the first
 * duplicated entry found. */
1247 int name_list2unique_name_list (int n, char **name)
1252 for (a=0; a<n-1; a++)
1253 for (b=a+1; b<n; b++)
1255 if ( strm (name[a], name[b]))
/* remember the 1-based index and force both loops to end */
1256 {duplicate=a+1;b=a=n;}
1265 tmp1=vtmpnam (NULL);
1266 tmp2=vtmpnam (NULL);
1267 fp=vfopen (tmp1, "w");
/* every name is written as a FASTA entry with a dummy sequence */
1268 for (a=0; a< n; a++)fprintf ( fp, ">%s\naggggg\n", name[a]);
1270 printf_system ("fasta_aln2fasta_aln_unique_name.pl %s > %s", tmp1, tmp2);
1271 S=get_fasta_sequence (tmp2, NULL);
/* NOTE(review): the buffer holds chars but is sized with sizeof (int); this over-allocates, sizeof (char) was probably intended */
1274 name[a]=vrealloc (name [a], sizeof (int)*(strlen (S->name[a])+1));
1275 sprintf ( name[a], "%s", S->name [a]);
1277 free_sequence(S, -1);
/* gene2exons: rewrite each of the nseq strings in place, character by
 * character: a character r is kept as is when flag is set and lower-cased
 * otherwise. How flag and in_exon evolve is decided on lines not shown here. */
1281 char**gene2exons (char **seq, int nseq)
1285 for (a=0; a<nseq; a++)
1287 int in_exon=0, flag=0,l;
1289 for ( b=0; b<l; b++)
1295 seq[a][b]=(flag)?r:tolower(r);
/* seq2clean_seq: filter the characters of every sequence of S in place and
 * refresh S->len. With alp==NULL, a character found in neither AA_ALPHABET
 * nor DNA_ALPHABET is dropped; with alp given, a character found in alp is
 * dropped; every other character is kept. */
1308 Sequence* seq2clean_seq (Sequence *S, char *alp)
1312 for (a=0; a< S->nseq; a++)
1314 l=strlen (S->seq[a]);
/* b reads, d writes */
1315 for (d=0,b=0; b<l; b++)
/* empty statements: the character c is skipped */
1318 if ( alp==NULL && !strchr (AA_ALPHABET, c) && !strchr (DNA_ALPHABET, c));
1319 else if (alp && strchr (alp, c));
1320 else S->seq[a][d++]=c;
1323 S->len[a]=strlen (S->seq[a]);
/* seq2aln_pos: for the sequences listed in l_s (ns[c] entries per group c),
 * record for each residue rank p the 1-based column b+1 at which it sits in
 * A. Rows of the result are indexed by A->order[...][0]. */
1327 int ** seq2aln_pos (Alignment *A, int *ns, int **l_s)
1330 int a, b,c, d,l, p , g;
/* l: longest of the first row of group 0 and the first row of group 1 */
1333 l=MAX(strlen (A->seq_al[l_s[0][0]]), strlen (A->seq_al[l_s[1][0]]));
1334 code=declare_int ((A->S)->nseq,l+1);
1338 l=strlen (A->seq_al[l_s[c][0]]);
1339 for (d=0; d<ns[c]; d++)
1341 a=A->order[l_s[c][d]][0];
1342 for (p=0, b=0; b<l; b++)
1344 g=is_gap (A->seq_al[l_s[c][d]][b]);
/* p is incremented before use, so residue ranks start at 1 */
1345 if (!g){p++; code[a][p]=b+1;}
/* local_maln2global_maln: extend alignment A so that its first row spans the
 * whole of seq. The name of row 0 is split on '_': the first token must be
 * "QUERY", the next two are read as start and end. Columns before start and
 * after end are filled from seq in row 0 and with '-' in the other rows. */
1352 Alignment *local_maln2global_maln (char *seq, Alignment *A)
1354 /*inputs a BLAST alignmnent where the master sequence may be partila
1355 outputs the same alignment, while amkeing sure the profile is perfectly in sink with its master sequence
1359 int start, end, rend;
1360 char qname[100], *p;
/* NOTE(review): unbounded copy into a 100-byte buffer */
1363 sprintf ( qname, "%s", A->name[0]);
1364 p=strtok (qname, "_");
1365 if ( !strm (p, "QUERY"))
1367 fprintf ( stderr, "\nUnappropriate format for the alignment [%s:FATAL]", PROGRAM);
1368 myexit (EXIT_FAILURE);
/* NOTE(review): strtok may return NULL when a field is missing; the result is passed to atoi unchecked */
1371 start=atoi(strtok (NULL, "_"));
1372 end=atoi(strtok (NULL, "_"));
/* B keeps the original columns while A is rewritten */
1375 B=copy_aln (A,NULL);
1376 if ( start>1 || end<rend )A=realloc_aln (A,rend+1);
/* columns before the aligned region */
1378 for (a=0; a<start-1; a++)
1380 A->seq_al[0][a]=seq[a];
1381 for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-';
/* aligned region: c walks the original columns kept in B */
1384 for (c=0,a=start-1; a< end; a++, c++)
1386 A->seq_al[0][a]=seq[a];
1387 for ( b=1; b< A->nseq; b++)
1389 A->seq_al[b][a]=B->seq_al[b][c];
/* columns after the aligned region */
1392 for ( a=end; a<rend; a++)
1394 A->seq_al[0][a]=seq[a];
1395 for ( b=1; b< A->nseq; b++)A->seq_al[b][a]='-';
1397 for ( a=0; a< A->nseq; a++) A->seq_al[a][rend]='\0';
1404 int ** aln2inv_pos ( Alignment *A)
1407 pos=vcalloc (A->nseq, sizeof (char*));
1408 for (a=0; a< A->nseq; a++)pos[a]=seq2inv_pos (A->seq_al[a]);
/* seq2inv_pos: return an array in which slot k (k starting at 1) holds the
 * 1-based column of the k-th non-gap character of seq. Slot 0 is left at
 * the value provided by vcalloc. */
1411 int * seq2inv_pos ( char *seq)
1413 /*returns a list where each value gives the index of the corresponding residue in seq*/
1414 /*Numbering: 1 to L : Analogy to the aln2pos*/
/* first pass: l2 counts the non-gap characters among the first l1 */
1420 for ( l2=a=0; a< l1; a++)l2+=1-is_gap(seq[a]);
1421 pos=vcalloc (l2+1, sizeof (int));
/* second pass: l2 is pre-incremented, so the first residue lands in slot 1 */
1422 for ( l2=a=0; a< l1; a++)if (!is_gap(seq[a]))pos[++l2]=a+1;
1427 int ** aln2pos_simple_2 (Alignment *A)
1431 pos1=aln2pos_simple (A, A->nseq);
1432 pos2=duplicate_int (pos1, A->nseq,read_size_int (pos1[0],sizeof (int)));
1433 pos1=aln2pos_simple (NULL, 0);
/* aln2pos_simple: variadic builder of the residue-index table of A; the
 * original documentation block follows the signature. */
1436 int ** aln2pos_simple (Alignment *A, int n_nseq, ...)
1439 function documentation: start
1440 int ** aln2pos_simple (Alignment *A, int n_nseq, ...)
1442 ####with two parameter only: Alignment *A, int n_nseq
1444 this function turns A into pos, a matrix where each residue is replace by its index according to the complete sequence.
1445 the indices in pos are computed using A->order[x][1] that contains the indice of the first residue of seq x of A
1447 n_nseq MUST not be null
1449 ####with more than two param:
1450 int ** aln2pos_simple (Alignment *A, int n_nseq, int *ns, int **ls)
1451 n_nseq must be set to 0 for the param 3 and four to be read
1453 ns[x]=number seq in group
1454 ls[x]=list of the sequences in group x ( size=ns[x])
1456 The computation of the indices is only carried out on the scpecified residues
1459 in pos, the numbering of the residues goes from 1 to L:
1460 pos[0][0]=3, means that the first position of the first sequence
1461 in the alignmnet contains residue #3 from sequence A->order[0][0];
1463 function documentation: end
1489 list=vcalloc(n_nseq, sizeof (int));
1490 for ( a=0; a< n_nseq; a++)list[a]=a;
1494 va_start (ap, n_nseq);
1495 ns=va_arg(ap, int * );
1496 ls=va_arg(ap, int **);
1498 list=vcalloc ( ns[0]+ns[1], sizeof (int));
1500 for ( a=0; a< ns[0]; a++)list[n_nseq++]=ls[0][a];
1501 for ( a=0; a< ns[1]; a++)list[n_nseq++]=ls[1][a];
1504 max_nseq=MAX(read_size_int(A->order,sizeof (int*)),return_max_int (A->order, read_size_int(A->order,sizeof (int*)),0))+1;
1505 n_len=get_longest_string ( A->seq_al,A->max_n_seq, NULL, NULL)+1;
1508 T=declare_int (max_nseq, n_len);
1509 for ( c=0; c< n_nseq; c++)
1512 l=strlen ( A->seq_al[a]);
1514 for ( p=A->order[a][1],b=0; b<l; b++)
1516 g=1-is_gap(A->seq_al[a][b]);
1518 T[a][b]=(g==1)?p:-(1+p);
1519 if ( A->seq_al[a][b]==UNDEFINED_RESIDUE)T[a][b]=0;
1520 if ( A->seq_cache && T[a][b]>0)T[a][b]=A->seq_cache[A->order[a][0]][T[a][b]];
/* split_seq_in_aln_list: the sequences of S named in seq_list are
 * concatenated into one string with '*' separators; the order fields and names
 * of every alignment in aln are then rewritten through a table built over
 * that string. Returns NULL when aln is NULL. */
1528 Alignment ** split_seq_in_aln_list ( Alignment **aln, Sequence *S, int n_seq, char **seq_list)
1531 char * long_seq=NULL;
1539 if ( aln==NULL)return NULL;
/* translation[a][0]: index of S->name[a] in seq_list, or -1; translation[a][1]: len at the time sequence a is appended */
1540 translation=declare_int ( S->nseq,2);
1542 for (len=0,a=0; a< S->nseq; a++)
1544 if((b=name_is_in_list (S->name[a],seq_list, n_seq, 100))!=-1)
1546 l=strlen(S->seq[a])+1;
/* NOTE(review): strcat needs a terminated buffer; this relies on vrealloc returning zeroed memory on the first pass -- confirm */
1547 long_seq=vrealloc(long_seq,(len+l+1)*sizeof(char));
1548 long_seq=strcat(long_seq, S->seq[a]);
1549 long_seq=strcat(long_seq, "*");
1551 translation[a][0]=b;
1552 translation[a][1]=len;
1555 else translation[a][0]=-1;
/* the trailing '*' is overwritten; NOTE(review): invalid when no sequence matched (len==0, long_seq NULL) unless guarded on a line not shown */
1558 long_seq[len-1]='\0';
1561 table=declare_int ( len+1, 2);
1563 for ( b=0,a=0; a< S->nseq; a++)
1565 if ( translation[a][0]!=-1)
1568 while (long_seq[b]!='\0' && long_seq[b]!='*')
1571 table[b+1][0]=translation[a][0];
1575 table[b][0]=translation[a][0];
/* NOTE(review): reads aln[-1]; presumably the list keeps a header Alignment one slot before its first element -- confirm */
1580 for ( a=0; a< (aln[-1])->nseq; a++)
1582 for ( b=0; b< (aln[a])->nseq; b++)
1585 (aln[a])->order[b][0]=table[(aln[a])->order[b][1]][0];
1586 (aln[a])->order[b][1]=table[(aln[a])->order[b][1]][1];
1587 sprintf ( (aln[a])->name[b],"%s_%d_%d", S->name[(aln[a])->order[b][0]],a+1,b+1);
1590 free_int (translation, -1);
1591 free_int (table, -1);
/* fill_sequence_struc: build a Sequence from nseq strings and names.
 * The strings are copied, ungapped with ungap_array, and their lengths are
 * stored in S->len. When sequences is NULL, one-character strings are
 * declared instead. */
1597 Sequence * fill_sequence_struc ( int nseq, char **sequences, char **seq_name)
1601 int shortest, longuest;
1605 shortest=longuest=0;
/* shortest and longest string lengths, used to size the structure */
1609 shortest=get_shortest_string( sequences, nseq, NULL, NULL);
1610 longuest=get_longest_string (sequences, nseq, NULL, NULL);
1614 shortest=longuest=strlen (sequences[0]);
1622 S=declare_sequence (shortest, longuest,nseq);
1625 if (sequences)S->seq=copy_char ( sequences, S->seq, nseq, -1);
1626 else S->seq=declare_char (S->nseq, 1);
1628 S->name=copy_char ( seq_name, S->name,nseq, -1);
1630 ungap_array (S->seq,nseq);
1631 for ( a=0; a< S->nseq; a++)S->len[a]=strlen(S->seq[a]);
/* thread_profile_files2aln: make sure A has a Sequence (A->S), attach
 * templates from template_file when given, rename the first row of each
 * template profile after the matching row of A, and return expand_aln (A). */
1636 Alignment * thread_profile_files2aln (Alignment *A, char *template_file, Fname *F)
1642 if (!A->S)A->S=aln2seq (A);
1643 if (template_file)A->S=seq2template_seq (A->S, template_file,F);
1644 for ( a=0; a< A->nseq; a++)
1646 P=seq2R_template_profile (A->S, a);
/* NOTE(review): P may be NULL; a guard, if any, is on a line not shown here */
1650 sprintf ( P->name[0], "%s", A->name[a]);
1654 return expand_aln (A);
/* expand_aln: replace every row of A whose template profile has expand set
 * by the rows of that profile, re-gapped to follow the row of A; the rows
 * without such a profile are kept. A is returned unchanged when no row has
 * an expandable profile. */
1660 Alignment * expand_aln (Alignment *A)
1662 /*This function expands the profiles within an alignment*/
1666 Alignment *MAIN=NULL, *SUB=NULL;
/* list collects the rows that are NOT expanded; n_sub_seq counts them */
1676 list=vcalloc (A->nseq, sizeof (int));
1677 for ( a=0; a< A->nseq; a++)
1679 Profile=seq2R_template_profile (A->S, A->order[a][0]);
1680 if (Profile && Profile->expand)
1682 new_nseq+=Profile->nseq;
1687 list[n_sub_seq++]=a;
/* nothing to expand: return the input */
1691 if ( n_sub_seq==A->nseq){vfree(list);return A;}
/* every row is expanded: start from an empty copy */
1692 else if (n_sub_seq==0){MAIN=copy_aln (A, MAIN);MAIN->nseq=0;}
1695 MAIN=extract_sub_aln (A, n_sub_seq, list);
1700 for ( a=0; a< A->nseq; a++)
1702 Profile=seq2R_template_profile (A->S, A->order[a][0]);
1703 if ( Profile && Profile->expand)
1706 SUB=copy_aln (Profile,SUB);
1708 SUB=realloc_aln2(SUB, SUB->nseq, A->len_aln+1);
/* b walks the columns of A, e the columns of the profile */
1710 for ( e=0,b=0; b< A->len_aln; b++)
1712 if ( is_gap(A->seq_al[a][b]))
1713 {for (d=0; d< SUB->nseq; d++)SUB->seq_al[d][b]='-';}
1716 for(d=0; d<SUB->nseq; d++)SUB->seq_al[d][b]=Profile->seq_al[d][e];
1721 MAIN=stack_aln(MAIN, SUB);
/* expand_number_aln: counterpart of expand_aln for EA, using A as the
 * residue reference. Returns EA when either argument is NULL; exits the
 * program when EA has fewer rows than A. */
1728 Alignment * expand_number_aln (Alignment *A,Alignment *EA)
1730 /*This function expands the profiles within an alignment*/
1734 Alignment *MAIN=NULL, *SUB=NULL, *C=NULL;
1740 if ( !EA || !A)return EA;
1742 if ( EA->nseq<A->nseq)
1744 fprintf (stderr, "\n[ERROR:expand_number_aln] Using as a master an expanded aln (%d %d) [FATAL:%s]", EA->nseq, A->nseq,PROGRAM);
1748 myexit (EXIT_FAILURE);
/* list collects the rows that are NOT expanded; n_sub_seq counts them */
1752 list=vcalloc (EA->nseq, sizeof (int));
1753 for ( a=0; a< EA->nseq; a++)
1755 Profile=seq2R_template_profile (EA->S, EA->order[a][0]);
1756 if (Profile && Profile->expand)new_nseq+=Profile->nseq;
1760 list[n_sub_seq++]=a;
1764 if ( n_sub_seq==EA->nseq){vfree(list);return EA;}
1765 else if (n_sub_seq==0){MAIN=copy_aln (EA, MAIN);MAIN->nseq=0;}
1768 MAIN=extract_sub_aln (EA, n_sub_seq, list);
/* C: one-row extract, stacked at the end of MAIN below */
1773 C=extract_sub_aln (EA,1, list);
1778 for ( a=0; a< EA->nseq; a++)
1780 Profile=seq2R_template_profile (EA->S, EA->order[a][0]);
1781 if ( Profile && Profile->expand)
1783 SUB=copy_aln (Profile,SUB);
1784 SUB=realloc_aln2(SUB, SUB->nseq, EA->len_aln+1);
/* NOTE(review): the bound is inclusive (b<=len_aln) and A is indexed with a row counter that runs over EA -- confirm both are intended */
1786 for ( e=0,b=0; b<= EA->len_aln; b++)
1788 if (is_gap(A->seq_al[a][b]))
1790 for ( d=0; d<SUB->nseq; d++)
1791 SUB->seq_al[d][b]=NO_COLOR_RESIDUE;
1795 for ( d=0; d<SUB->nseq; d++)
/* a gap in the profile row gets no colour; otherwise the value of EA is inherited */
1798 if ( is_gap (Profile->seq_al[d][e]))
1800 SUB->seq_al[d][b]=NO_COLOR_RESIDUE;
1802 else SUB->seq_al[d][b]=EA->seq_al[a][b];
1807 for (d=0; d< SUB->nseq; d++)SUB->score_seq[d]=EA->score_seq[a];
1809 MAIN=stack_aln(MAIN, SUB);
1813 MAIN=stack_aln(MAIN, C);
1815 MAIN->score=MAIN->score_aln=EA->score_aln;
/* probabilistic_rm_aa: in every row of A, overwrite a window around column
 * pos with '*', the cell at pos itself receiving '~'. pos==0 selects a random
 * position; when random_len is set the window extents are drawn with rand. */
1825 Alignment * probabilistic_rm_aa ( Alignment *A, int pos, int len)
/* NOTE(review): the modulus A->len_aln-3*len is not checked for being positive on the visible lines */
1839 if (pos==0)pos= (rand()%(A->len_aln-(2*len+len))) +len;
1842 for ( a=0; a< A->nseq; a++)
1844 if (random_len)left =rand()%len;
1846 if (random_len)right=rand()%len;
1848 if ( (pos-right)<0 || (pos+left)>A->len_aln)
1850 add_warning ( stderr, "\nWarning: probabilistic_rm_aa, pos out of range [%s]\n", PROGRAM);
/* window is [pos-right, pos+left) */
1853 for ( b=pos-right; b<pos+left; b++)A->seq_al[a][b]=(b==pos)?'~':'*';
1857 free_sequence ( A->S, A->nseq);
/* remove_gap_column: compact A in place, keeping only the columns in which
 * none of the selected rows holds a gap. The rows are selected by the
 * ':'-separated tokens of mode (a number, or a name looked up in A->name);
 * a loop selecting every row is also visible. */
1863 Alignment * remove_gap_column ( Alignment *A, char *mode)
1872 seq_list =vcalloc ( A->nseq, sizeof (int));
/* NOTE(review): strtok is called with mode on every pass; this only advances if mode is set to NULL on a line not shown here -- confirm */
1873 while ( (p=strtok(mode, ":")))
/* numeric token: the number follows the first character and is 1-based */
1878 seq_list[nseq++]=atoi(p+1)-1;
1880 else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
1888 for ( a=0; a< A->nseq; a++)seq_list[a]=a;
/* a reads, cl writes; the bound is inclusive so the terminator column is processed too */
1892 for ( cl=0,a=0; a<=A->len_aln; a++)
1894 for (keep_col=1, b=0; b< nseq && keep_col; b++)
1896 keep_col=(is_gap(A->seq_al[seq_list[b]][a]))?0:keep_col;
1901 for ( b=0; b< A->nseq; b++)
1903 A->seq_al[b][cl]=A->seq_al[b][a];
1909 for ( b=0; b< A->nseq; b++)
1911 A->seq_al[b][cl]='-';
/* ungap_sub_aln: compact in place the ns rows of A listed in ls, based on the
 * number t of gaps found in each column among those rows, then terminate the
 * rows at the new length c. */
1923 Alignment * ungap_sub_aln (Alignment *A, int ns, int *ls)
1929 len=strlen ( A->seq_al[ls[0]]);
/* a reads, c writes */
1931 for ( c=0,a=0; a<len; a++)
/* t: number of gaps in column a among the listed rows */
1933 for ( t=0,b=0; b<ns; b++)
1934 t+=is_gap(A->seq_al[ls[b]][a]);
/* column kept (the test on t is on a line not shown here) */
1938 for ( b=0; b<ns; b++)
1939 A->seq_al[ls[b]][c]=A->seq_al[ls[b]][a];
1943 for ( b=0; b<ns; b++)A->seq_al[ls[b]][c]='\0';
1947 Sequence * ungap_seq ( Sequence *S)
1951 if ( !S)return NULL;
1953 S->max_len=S->min_len=strlen (S->seq[0]);
1954 for ( a=0; a< S->nseq; a++)
1957 S->len[a]=strlen (S->seq[a]);
1958 S->max_len=MAX(S->max_len,S->len[a]);
1959 S->min_len=MAX(S->min_len,S->len[a]);
/* Forward declarations of the column helpers defined further down. */
1964 Alignment* shift_column (Alignment *A, int from, int to);
1965 int max_shift (Alignment *A, int p);
1966 int column_is_lower (Alignment *A, int p);
/* unalign_aln_2: lower-case the cells of A whose counterpart in C is a digit
 * not greater than t (all other cells are upper-cased), then move the
 * lower-case cells around with insert_gap_col and shift_column. */
1968 Alignment * unalign_aln_2 (Alignment *A, Alignment *C, int t)
1975 for (a=0; a<A->nseq; a++)
1976 for (b=0; b<A->len_aln; b++)
1978 int res=C->seq_al[a][b];
/* upper-case first, then lower-case when the digit in C is at most t */
1979 A->seq_al[a][b]=toupper(A->seq_al[a][b]);
1980 if ((isdigit (res) && (res-'0')<=t))
1981 A->seq_al[a][b]=tolower(A->seq_al[a][b]);
/* walk the columns until the terminator of row 0 */
1986 while ( A->seq_al[0][n])
/* insert is set when column n holds at least one lower-case cell */
1989 for (b=0; b<A->nseq; b++)if (islower (A->seq_al[b][n]))insert=1;
1992 insert_gap_col (A,n,1);
1993 for (b=0; b<A->nseq; b++)
/* after the insertion the old column sits at n+1: lower-case cells move back to n */
1995 if ( islower (A->seq_al[b][n+1]))
1997 A->seq_al[b][n]=A->seq_al[b][n+1];
1998 A->seq_al[b][n+1]='-';
/* right-to-left pass over the columns made of gaps and lower-case cells only */
2004 for (a=A->len_aln-1; a>=0; a--)
2006 if (column_is_lower (A,a))
2010 shift_column (A,a, a+s);
/* shift_column: move the content of column `from` to column `to`.
 * The column is saved in buf (one char per sequence) and replaced by '-'; one gap column
 * is inserted at `to`; the saved characters are then written into column `to`.
 * NOTE(review): the pointer returned by insert_gap_col is discarded although that
 * function reassigns its Alignment via realloc_aln2; also insert_gap_col returns without
 * inserting when `to` is outside [0, len_aln) -- confirm both are safe for callers. */
2015 Alignment* shift_column (Alignment *A, int from, int to)
2020 buf=vcalloc (A->nseq, sizeof (char));
2021 for (a=0; a<A->nseq; a++)
2023 buf[a]=A->seq_al[a][from];
2024 A->seq_al[a][from]='-';
2027 insert_gap_col (A, to, 1);
2028 for ( a=0; a<A->nseq; a++)A->seq_al[a][to]=buf[a];
/* max_shift: how far the lower-case residues of column p can slide to the right.
 * For every sequence holding a lower-case character at p, the run of '-' immediately to
 * the right of p is measured; the smallest run over these sequences is retained
 * (starting value: A->len_aln).
 * Note: the local variable max_shift shadows the function name inside the body. */
2033 int max_shift (Alignment *A, int p)
2035 int shift, max_shift, a;
2036 for (max_shift=A->len_aln,a=0; a< A->nseq; a++)
/* Only lower-case residues count; the =='-' test is redundant since '-' is never lower-case. */
2040 if (!islower (A->seq_al[a][p]) || A->seq_al[a][p]=='-')continue;
/* Count consecutive '-' to the right of p. */
2041 while (A->seq_al[a][p+shift+1]=='-')shift++;
2042 max_shift=MIN(shift,max_shift);
/* column_is_lower: test whether column p holds only gaps and lower-case characters.
 * Returns 0 at the first character that is neither a gap nor lower-case
 * (presumably a non-zero value otherwise -- confirm). */
2046 int column_is_lower (Alignment *A, int p)
2050 for ( a=0; a<A->nseq; a++)
2051 if ( !is_gap (A->seq_al[a][p]) && !islower(A->seq_al[a][p]))return 0;
/* unalign_aln: un-align the stretches of A whose score in C is at most t.
 *   A : alignment; resized and overwritten with the rows of C at the end.
 *   C : score alignment (digits); edited through unalign_aln_pos.
 *   t : score threshold (a digit d is selected when d<=t).
 * Pass 1 sets the case of A: selected residues lower-case, all others upper-case.
 * Pass 2 scans each row of C for runs of selected digits (pos = start, len = length)
 *        and hands every run to unalign_aln_pos.
 * Finally S is threaded onto C with thread_seq_struc2aln, A is resized to C's length,
 * the rows of C are copied into A and S is freed.
 * NOTE(review): S is presumably the Sequence derived from A after pass 1 -- confirm. */
2055 Alignment * unalign_aln (Alignment *A, Alignment *C, int t)
2060 for (a=0; a<A->nseq; a++)
2061 for (b=0; b<A->len_aln; b++)
2063 int res=C->seq_al[a][b];
2064 A->seq_al[a][b]=toupper(A->seq_al[a][b]);
2065 if ((isdigit (res) && (res-'0')<=t))
2066 A->seq_al[a][b]=tolower(A->seq_al[a][b]);
/* pos==-1 means "no run currently open". */
2070 for (pos=-1, a=0; a<C->nseq; a++)
2073 while ( C->seq_al[a][b])
2075 int res=C->seq_al[a][b];
2076 if ((isdigit (res) && (res-'0')<=t))
/* First selected cell of a run: remember where it starts. */
2078 if (pos==-1){pos=b;len=1;}
/* A run has been delimited: un-align it in row a. */
2084 C=unalign_aln_pos(C,a,pos, len);
/* Run still open when the row ends: flush it and reset the marker. */
2089 if ( pos!=-1){C=unalign_aln_pos(C,a,pos, len);pos=-1;}
2092 thread_seq_struc2aln (C, S);
2093 A=realloc_aln2 (A, A->nseq, C->len_aln+1);
2094 A->len_aln=C->len_aln;
2095 for (a=0; a<A->nseq; a++)sprintf ( A->seq_al[a], "%s", C->seq_al[a]);
2098 free_sequence (S, -1);
/* unalign_aln_pos: give the l characters of sequence s that start at column p their own
 * columns. The characters are saved in buf and replaced by '-', l gap columns are
 * inserted at p with insert_gap_col, and the saved characters are written back starting
 * at column p of sequence s. */
2101 Alignment * unalign_aln_pos (Alignment *A, int s, int p, int l)
2108 buf=vcalloc (l+1, sizeof (char));
/* Save, then blank, each character of the stretch. */
2111 buf[a]=A->seq_al[s][p+a];
2112 A->seq_al[s][p+a]='-';
2116 A=insert_gap_col (A,p, l);
/* Restore the saved characters from column p onwards. */
2119 A->seq_al[s][p+a]=buf[a];
/* insert_gap_col: insert l gap columns at column p of every sequence of A.
 *   A : alignment, grown with realloc_aln2 to hold len_aln+l+1 characters per row.
 *   p : insertion column; the call returns A untouched when A is NULL or p is outside
 *       [0, len_aln).
 *   l : number of columns to insert.
 * Each row is rebuilt in buf as <prefix before p><gap string><c><suffix after p> and
 * copied back.
 * NOTE(review): c is presumably the character originally at column p, saved before it
 * is overwritten with the terminator -- confirm.
 * NOTE(review): the gap string is obtained before the early-return test, so it is not
 * released on that path if generate_null allocates -- confirm. */
2124 Alignment * insert_gap_col (Alignment *A, int p, int l)
2130 gap=generate_null(l);
2131 if ( !A || p>=A->len_aln || p<0)return A;
2133 buf=vcalloc (A->len_aln+l+1, sizeof (char));
2134 A=realloc_aln2(A,A->nseq, A->len_aln+l+1);
2135 for (a=0; a<A->nseq; a++)
/* Cut the row at p so that it can be used as the prefix string. */
2138 A->seq_al[a][p]='\0';
2139 sprintf ( buf, "%s%s%c%s", A->seq_al[a],gap,c,A->seq_al[a]+p+1);
2140 sprintf (A->seq_al[a], "%s", buf);
/* unalign_residues: rebuild the pair of rows si1 and si2 of A into two new strings.
 * ns1/ns2 are allocated with room for 2*l characters. Columns where either character is
 * a gap or upper-case are copied as they are, with '.' rewritten as '-'.
 * A->len_aln is finally set to the length of ns1.
 * NOTE(review): the remaining columns (both characters lower-case) are presumably
 * split into two columns, which is why 2*l is reserved -- confirm. */
2146 Alignment * unalign_residues (Alignment *A, int si1, int si2)
2148 char *s1, *s2, *ns1, *ns2;
2151 s1=A->seq_al[si1];s2=A->seq_al[si2];
2154 ns1=vcalloc (2*l+1, sizeof (char));
2155 ns2=vcalloc (2*l+1, sizeof (char));
/* a reads the old columns, b writes the new ones. */
2157 for (b=a=0; a< l; a++)
2160 if (is_gap(r1) || is_gap(r2) || isupper (r1) || isupper(r2))
2162 ns1[b]=(r1=='.')?'-':r1;
2163 ns2[b]=(r2=='.')?'-':r2;
2182 A->len_aln=strlen (ns1);
/* degap_aln: apply ungap() to every row of A, in place. The rows are processed
 * independently of one another. */
2185 Alignment *degap_aln (Alignment *A)
2187 //Remove all the gaps
2189 for ( a=0; a< A->nseq; a++)ungap (A->seq_al[a]);
/* ungap_aln_n: remove columns of A according to their gap content.
 *   A : alignment compacted in place; returned untouched when it has no sequence.
 *   p : selector. p>0 removes columns with gp>=p, with the special cases p==100
 *       (column made only of gaps) and p==1 (column with at least one gap).
 *       p<0 removes columns with gp<=p, with the special cases p==-100 (no gap at
 *       all) and p==-1 (at least one gap).
 * t is the number of gaps counted in the column.
 * NOTE(review): gp is presumably the percentage of gaps derived from t -- confirm.
 * Retained columns are copied to the write cursor c and the rows are terminated at c. */
2193 Alignment *ungap_aln_n ( Alignment *A, int p)
2195 /*remove all the columns of gap-only within an alignment*/
2200 if ( A->nseq==0)return A;
2202 for ( c=0,a=0; a< A->len_aln; a++)
2204 for ( t=0,b=0; b<A->nseq; b++)
2205 t+=is_gap(A->seq_al[b][a]);
/* The two tests below end with an empty statement: a match means "skip this column". */
2207 if (p>0 && (gp>=p || (t==A->nseq && p==100) || (t && p==1)));//Remove columns containing more than p% gaps
2208 else if (p<0 && (gp<=p || (t==0 && p==-100) ||(t && p==-1)));//remove columns containing less than p% gaps
2211 for ( b=0; b<A->nseq; b++)
2212 A->seq_al[b][c]=A->seq_al[b][a];
2216 for ( b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
/* ungap_aln: convenience wrapper, equivalent to ungap_aln_n(A, 100), the setting under
 * which ungap_aln_n removes the columns made only of gaps. */
2221 Alignment *ungap_aln ( Alignment *A)
2223 return ungap_aln_n (A, 100);
/* ungap_aln (stand-alone variant): compact A by copying retained columns to the write
 * cursor c and terminating every row at c; t counts the gaps of the current column.
 * NOTE(review): this is a second definition carrying the same name as the wrapper
 * above; presumably it is a disabled legacy version -- confirm it is not compiled.
 * NOTE(review): presumably a column is dropped when t==A->nseq -- confirm. */
2226 Alignment *ungap_aln ( Alignment *A)
2230 for ( c=0,a=0; a< A->len_aln; a++)
2232 for ( t=0,b=0; b<A->nseq; b++)
2233 t+=is_gap(A->seq_al[b][a]);
2237 for ( b=0; b<A->nseq; b++)
2238 A->seq_al[b][c]=A->seq_al[b][a];
2242 for ( b=0; b<A->nseq; b++)A->seq_al[b][c]='\0';
/* remove_end: trim the two extremities of A and return extract_aln(A, left, right+1).
 * A first scan runs left to right and a second one right to left (stopping before
 * column 0); in both, d counts the non-gap characters of the current column.
 * NOTE(review): left and right are presumably the first columns, from each side, whose
 * count d satisfies the stopping test -- confirm the test. */
2250 Alignment *remove_end (Alignment *A)
2255 for (a=0; a< A->len_aln; a++)
2257 for ( b=0, d=0; b< A->nseq; b++)
2258 if ( !is_gap(A->seq_al[b][a]))d++;
2262 for (a=A->len_aln-1; a>0; a--)
2264 for ( b=0, d=0; b< A->nseq; b++)
2265 if ( !is_gap(A->seq_al[b][a]))d++;
2270 return extract_aln(A, left, right+1);
/* condense_aln: merge neighbouring columns that never hold a residue in the same row.
 * The outer loop repeats until a pass leaves A->len_aln unchanged (plen keeps the
 * previous length). For each pair of columns (a, a+1), r1 and r2 are the gap status of
 * the two cells of a row; when the counters satisfy n==nseq and m!=nseq, the residues
 * of column a+1 are moved into column a and replaced by '-'.
 * NOTE(review): n presumably counts rows where at least one of the two cells is a gap
 * and m rows where both are gaps; the emptied columns are presumably removed before
 * the next pass -- confirm. */
2273 Alignment* condense_aln (Alignment *A)
2275 /* condense complementarz columns:
2281 int a, b, plen, n,m, r1, r2;
2284 while ( A->len_aln !=plen)
2287 for ( a=0; a< A->len_aln-1; a++)
2289 for ( n=m=b=0; b< A->nseq; b++)
2291 r1=is_gap(A->seq_al[b][a]);
2292 r2=is_gap(A->seq_al[b][a+1]);
2297 if ( n==A->nseq && m!=A->nseq)
2299 for (b=0; b< A->nseq; b++)
2301 if (!is_gap(A->seq_al[b][a+1]))
2303 A->seq_al[b][a]=A->seq_al[b][a+1];
2304 A->seq_al[b][a+1]='-';
/* compress_aln: compact A in place, column by column.
 * d is set to 1 as soon as one row holds something other than '-' in the column (only
 * the literal '-' is tested here, not is_gap). Retained columns are copied to the write
 * cursor c and every row is terminated at c.
 * NOTE(review): presumably a column is retained when d is 1 -- confirm. */
2318 void compress_aln ( Alignment *A)
2321 /*remove all the columns of gap-only within an alignment*/
2326 for (c=0, a=0; a< A->len_aln; a++)
2328 for ( b=0, d=0; b< A->nseq; b++)
2329 if ( A->seq_al[b][a]!='-'){d=1; break;}
2333 for (b=0; b< A->nseq; b++)
2334 A->seq_al[b][c]=A->seq_al[b][a];
2340 for ( a=0; a< A->nseq; a++)
2341 A->seq_al[a][c]='\0';
/* seq_coor2aln: fill A with nseq fragments cut out of the sequences of S.
 *   coor[a][0] : index in S of the source sequence of fragment a.
 *   coor[a][1] : start, passed as coor[a][1]-1 to extract_char and stored in
 *                A->order[a][1].
 *   coor[a][2] : third argument given to extract_char.
 * A is resized to nseq rows of return_maxlen(S->seq, S->nseq)+1 characters; the file
 * names of S are copied and each fragment is named "Repeat_<a>_<source index>".
 * NOTE(review): the file-name copy runs over S->nseq entries while A is sized for nseq
 * rows -- confirm S->nseq never exceeds the capacity of A->file. */
2344 Alignment *seq_coor2aln ( Sequence *S, Alignment *A, int **coor, int nseq)
2349 A=realloc_alignment2(A, nseq, return_maxlen ( S->seq, S->nseq)+1);
2350 for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]);
2351 for ( a=0; a< nseq; a++)
2353 sprintf (A->name[a], "Repeat_%d_%d", a, coor[a][0]);
2354 buf=extract_char ( S->seq[coor[a][0]], coor[a][1]-1, coor[a][2]);
2355 sprintf ( A->seq_al[a],"%s", buf);
2358 A->order[a][1]=coor[a][1]-1;
/* strings2aln: build an Alignment from nseq (name, sequence) pairs given as variadic
 * arguments. The pairs are read with va_arg, copied into freshly declared arrays
 * (sequences sized to the longest string +1, names to MAXNAMES+1), wrapped into a
 * Sequence with fill_sequence_struc and converted with seq2aln(S, NULL, 1).
 * NOTE(review): the names are copied with an unbounded sprintf; a name longer than
 * MAXNAMES would overflow name2 -- confirm callers. */
2364 Alignment *strings2aln (int nseq,...)
2366 /*strings2aln(nseq, <name1>, <seq1>, <name2>, <seq2>....)*/
2368 char **list, **list2;
2369 char **name, **name2;
2375 list=vcalloc (nseq, sizeof (char*));
2376 name=vcalloc (nseq, sizeof (char*));
/* Arguments come in pairs: name first, sequence second. */
2377 for ( a=0; a< nseq; a++)
2379 name[a]=va_arg(ap,char*);
2380 list[a]=va_arg(ap,char*);
/* max = length of the longest sequence string. */
2385 for ( max=0,a=0; a< nseq; a++)
2387 max=(strlen (list[a])>max)?strlen(list[a]):max;
2389 list2=declare_char (nseq, max+1);
2390 name2=declare_char (nseq, MAXNAMES+1);
2392 for ( a=0; a< nseq; a++)
2394 sprintf ( list2[a], "%s", list[a]);
2395 sprintf ( name2[a], "%s", name[a]);
2399 S=fill_sequence_struc(nseq,list2,name2);
/* The temporary copies are released once the Sequence has been built. */
2401 free_char (list2, -1);
2402 free_char (name2, -1);
2405 A=seq2aln(S,NULL, 1);
/* seq2aln: load the sequences of S into the Alignment A.
 *   S      : source sequences.
 *   A      : destination, resized to S->nseq rows of S->max_len+1 characters.
 *   rm_gap : 0 or NO_PAD restores the raw string of S (gaps kept) in each row after its
 *            length has been measured; any value other than NO_PAD pads the rows with
 *            padd_aln at the end.
 * For each sequence the file name, comments, name and residues are copied; the row is
 * ungapped and A->len[a] is set to the ungapped length. */
2408 Alignment *seq2aln ( Sequence *S, Alignment *A,int rm_gap)
2412 A=realloc_alignment2(A, S->nseq, S->max_len+1);
2413 for ( a=0; a< S->nseq; a++)sprintf ( A->file[a], "%s", S->file[a]);
2415 A->max_len=S->max_len;
2416 A->min_len=S->min_len;
2418 for ( a=0; a< S->nseq; a++)
2423 sprintf ( A->seq_comment[a], "%s", S->seq_comment[a]);
2424 sprintf ( A->aln_comment[a], "%s", S->aln_comment[a]);
2426 sprintf ( A->name[a], "%s", S->name[a]);
2427 sprintf ( A->seq_al[a], "%s", S->seq[a]);
/* The length is measured on the gap-free version of the row. */
2429 ungap ( A->seq_al[a]);
2430 A->len[a]=strlen ( A->seq_al[a]);
/* Gaps requested: put the original string back. */
2432 if ( rm_gap==0 || rm_gap==NO_PAD)sprintf ( A->seq_al[a], "%s", S->seq[a]);
2435 if (rm_gap!=NO_PAD)padd_aln (A);
/* padd_aln: pad the rows of A with '-' through padd_string, then set A->len_aln to the
 * length of the first row. */
2440 Alignment *padd_aln ( Alignment *A)
2442 A->seq_al=padd_string (A->seq_al, A->nseq, '-');
2443 A->len_aln=strlen (A->seq_al[0]);
/* padd_string: bring n strings to the length of the longest one by appending padding.
 * max_len comes from get_longest_string; for each string a filler of
 * max_len-strlen(string[a]) characters is obtained from generate_null and appended with
 * strcat, so each string must already have room for max_len characters.
 * NOTE(review): the `pad` argument does not appear in the statements below; the filler
 * character is whatever generate_null produces -- confirm. */
2447 char **padd_string ( char **string, int n,char pad)
2449 /*Pads a the strings so that they all have the same length*/
2454 max_len=get_longest_string (string,n, NULL, NULL);
2457 buf=generate_null (max_len-strlen (string[a]));
2458 strcat ( string[a], buf);
/* trim_aln_with_seq: align S with P and keep only the columns where the first row of
 * the result is not a gap.
 * The two alignments are aligned with align_two_aln (blosum62mt, -8, -1,
 * myers_miller_pair_wise); the rows of the result are renamed "tmpname_<n>", n coming
 * from a static counter that keeps growing across calls. R starts as a copy of A and
 * receives the retained columns at cursor c; its rows are terminated at c. */
2464 Alignment * trim_aln_with_seq ( Alignment *S, Alignment *P)
2468 static int seqindex;
2472 A=align_two_aln (S,P, "blosum62mt",-8,-1, "myers_miller_pair_wise");
2473 for (a=0; a<A->nseq; a++) sprintf (A->name[a], "tmpname_%d", seqindex++);
2475 R=copy_aln (A, NULL);
2476 for (c=0, a=0; a< A->len_aln; a++)
/* Empty statement: a gap in the first row means the column is skipped. */
2478 if ( is_gap (A->seq_al[0][a]));
2481 for ( b=0; b<A->nseq; b++)
2482 R->seq_al[b][c]=A->seq_al[b][a];
2486 for ( a=0; a< A->nseq; a++)R->seq_al[a][c]='\0';
/* add_align_seq2aln: append an already-aligned sequence to A.
 *   A        : alignment; one branch declares a fresh one sized for a single row
 *              (presumably taken when A is NULL -- confirm).
 *   seq      : aligned sequence; for an existing alignment its length must equal
 *              A->len_aln, otherwise the program exits with an error.
 *   seq_name : name stored for the new row.
 * The new row is written at index A->nseq after growing A with realloc_aln2. */
2497 Alignment * add_align_seq2aln ( Alignment *A, char *seq, char *seq_name)
2501 A=declare_aln (NULL);
2502 A=realloc_aln2 ( A, 1, strlen (seq)+1);
2504 sprintf ( A->name[A->nseq], "%s", seq_name);
2505 sprintf ( A->seq_al[A->nseq], "%s", seq);
/* Length mismatch between the sequence and the alignment is fatal. */
2509 else if ( strlen (seq)!=A->len_aln)
2511 fprintf ( stderr, "\nError: Attempt to stack incompatible aln and aligned sequence[FATAL]\n");
2512 myexit (EXIT_FAILURE);
/* Compatible sequence: make room for one more row and store it. */
2518 A=realloc_aln2 ( A, A->nseq+1, A->len_aln+1);
2519 sprintf ( A->name[A->nseq], "%s", seq_name);
2520 sprintf ( A->seq_al[A->nseq], "%s", seq);
/* aln2number: replace the rows of A by the result of char_array2number applied to the
 * A->nseq rows of A->seq_al. */
2527 Alignment *aln2number (Alignment *A)
2529 A->seq_al=char_array2number(A->seq_al, A->nseq);
/* seq2number: Sequence counterpart of aln2number; A->seq is replaced by the result of
 * char_array2number applied to its A->nseq entries. */
2532 Sequence *seq2number (Sequence *A)
2534 A->seq=char_array2number(A->seq, A->nseq);
/* aln2seq: convert an Alignment into a Sequence with the RM_GAP mode of aln2seq_main,
 * i.e. with ungap() applied to every sequence. */
2538 Sequence * aln2seq (Alignment *A)
2540 return aln2seq_main(A, RM_GAP);
/* aln2seq_main: build a Sequence from the rows of an Alignment.
 *   A    : source alignment; NULL or an alignment without sequences yields NULL.
 *   mode : RM_GAP applies ungap() to every copied sequence.
 * The Sequence is declared with maxlen+1 for both length arguments, maxlen being the
 * longest row of A. File name, residues, comments and name are copied for each
 * sequence and LS->len[a] is set to the length of the copied (possibly ungapped)
 * string. */
2542 Sequence * aln2seq_main (Alignment *A, int mode)
2548 if ( !A) return NULL;
2549 else if ( A->nseq==0)return NULL;
2550 for (maxlen=0,a=0; a<A->nseq; a++)maxlen=MAX(maxlen, strlen (A->seq_al[a]));
2553 LS=declare_sequence ( maxlen+1, maxlen+1, A->nseq);
2555 for ( a=0; a< LS->nseq; a++)
2557 sprintf (LS->file[a],"%s", A->file[a]);
2559 sprintf ( LS->seq[a], "%s", A->seq_al[a]);
2561 if (mode==RM_GAP)ungap ( LS->seq[a]);
2563 LS->len[a]=strlen ( LS->seq[a]);
2565 sprintf ( LS->seq_comment[a], "%s",A->seq_comment[a]);
2566 sprintf ( LS->aln_comment[a], "%s",A->aln_comment[a]);
2567 sprintf ( LS->name[a], "%s", A->name[a]);
/* keep_residues_in_seq: Sequence front-end of keep_residues_in_aln.
 * The filter is run on an Alignment A with the given list and replacement character;
 * each filtered row is then ungapped and copied back into S->seq[a].
 * NOTE(review): A is presumably built from S beforehand -- confirm. */
2572 Sequence *keep_residues_in_seq ( Sequence *S, char *list, char replacement)
2578 A=keep_residues_in_aln ( A, list, replacement);
2579 for ( a=0; a< A->nseq; a++)
2581 ungap (A->seq_al[a]);
2582 sprintf ( S->seq[a], "%s", A->seq_al[a]);
/* aln2short_aln: rewrite each row of A into a shortened string.
 *   list   : set of characters tested with is_in_set.
 *   new    : replacement string; only new[0] is emitted by the statements below.
 *   spacer : run length at which a pending run emits new[0] and is reset.
 * buf receives the output (write index l) and cl is a run counter. When a character of
 * `list` is met while a run is pending (cl!=0), new[0] is emitted and the run is reset.
 * The row is finally overwritten with buf.
 * NOTE(review): what is appended for a character of `list` itself, and how cl grows,
 * should be confirmed. */
2589 Alignment *aln2short_aln ( Alignment *A, char *list, char *new, int spacer)
2594 for ( a=0; a< A->nseq; a++)
2596 buf=vcalloc ( strlen (A->seq_al[a])+1, sizeof (char));
2598 for (l=0,cl=0, b=0; b< A->len_aln; b++)
2602 else if ( is_in_set (r, list))
2604 if (cl){cl=0; buf[l++]=new[0];}
2609 if ( cl==spacer){buf[l++]=new[0];cl=0;}
2616 sprintf (A->seq_al[a], "%s", buf);
/* keep_residues_in_aln: shortcut for filter_keep_residues_in_aln with no structure
 * alignment (ST=NULL), use_cons=0 and value=-1. */
2622 Alignment *keep_residues_in_aln ( Alignment *A, char *list, char replacement)
2624 return filter_keep_residues_in_aln (A,NULL, 0, -1, list, replacement);
/* filter_keep_residues_in_aln: build a conversion table that keeps the characters of
 * `list` and maps the rest to `replacement`, then apply it with filter_aln_convert.
 * The table sl holds n+1 strings: one "cc" entry per character c of list (c mapped to
 * itself) followed by a final "#<replacement>" entry, written at index n because the
 * loop leaves a==n.
 * NOTE(review): n is presumably strlen(list), and '#' presumably acts as the
 * "any other character" key of convert() -- confirm. */
2626 Alignment *filter_keep_residues_in_aln ( Alignment *A,Alignment *ST, int use_cons, int value, char *list, char replacement)
2632 sl=declare_char (n+1, 256);
2633 for (a=0; a< n; a++)
2634 sprintf ( sl[a], "%c%c", list[a], list[a]);
2635 sprintf ( sl[a],"#%c", replacement);
2636 A=filter_aln_convert (A, ST,use_cons,value, n+1, sl);
/* filter_convert_aln: variadic front-end of filter_aln_convert.
 * The n trailing arguments are read as char* into the array sl, which is passed on as
 * the symbol list together with A, ST, use_cons and value. */
2642 Alignment *filter_convert_aln ( Alignment *A,Alignment *ST, int use_cons, int value, int n, ...)
2648 sl=vcalloc ( n,sizeof(char*));
2649 for ( a=0; a< n; a++)
2651 sl[a]=va_arg(ap, char * );
2654 A=filter_aln_convert (A,ST,use_cons,value, n,sl);
/* filter_aln: run filter_aln_convert in DELETE mode (selected cells become '-'),
 * without consensus (use_cons=0) and without symbol list. */
2659 Alignment * filter_aln ( Alignment *A, Alignment *ST, int value)
2661 return filter_aln_convert (A, ST,0,value,DELETE, NULL);
/* filter_aln_switchcase: run filter_aln_convert in SWITCHCASE mode (the case of the
 * selected alphabetic cells is inverted), without symbol list.
 *   use_cons : forwarded to filter_aln_convert, as in filter_aln_upper_lower and
 *              filter_aln_lower_upper; it used to be ignored and replaced by 0. */
2663 Alignment * filter_aln_switchcase ( Alignment *A, Alignment *ST,int use_cons, int value)
2665 return filter_aln_convert (A, ST,use_cons,value,SWITCHCASE, NULL);
/* filter_aln_upper_lower: run filter_aln_convert in LOWER mode (selected cells are
 * lower-cased), forwarding use_cons and value, without symbol list. */
2667 Alignment * filter_aln_upper_lower ( Alignment *A, Alignment *ST,int use_cons, int value)
2669 return filter_aln_convert (A, ST,use_cons,value, LOWER, NULL);
/* filter_aln_lower_upper: run filter_aln_convert in UPPER mode (selected cells are
 * upper-cased), forwarding use_cons and value, without symbol list. */
2671 Alignment * filter_aln_lower_upper ( Alignment *A, Alignment *ST,int use_cons, int value)
2674 return filter_aln_convert (A, ST,use_cons,value, UPPER, NULL);
/* STseq2STaln: make the structure/score alignment ST match the columns of A.
 * Work is done only when ST exists and its length differs from A->len_aln. For every
 * sequence of A found by name in S_T, the ungapped strings of both sides are compared;
 * a length mismatch is fatal. ST is then rebuilt as a copy of A and the strings of S_T
 * are threaded onto it with thread_seq_struc2aln.
 * NOTE(review): S_T and S_A are presumably the Sequences derived from ST and A, and a
 * name missing from S_T is presumably skipped -- confirm. */
2676 Alignment * STseq2STaln ( Alignment *A, Alignment *ST)
2680 if (ST && ST->len_aln !=A->len_aln)
2682 Sequence *S_T, *S_A;
2687 for (a=0; a< A->nseq; a++)
2689 i=name_is_in_list (A->name[a], S_T->name,S_T->nseq, 100);
/* Both strings are ungapped in place before their lengths are compared. */
2693 s1=(S_T)->seq[i];ungap(s1);
2694 s2=(S_A)->seq[a];ungap(s2);
2696 if ( strlen (s1)!=strlen(s2))
2698 fprintf ( stderr, "%s\n%s\n", s1, s2);
2699 printf_exit (EXIT_FAILURE, stderr, "ERROR: Sequence %s has different length in the alignment and in the structure Alignment [FATAL:%s]\n", A->name[a], PROGRAM);
2703 ST=copy_aln (A, ST);
2704 thread_seq_struc2aln (ST,S_T);
/* merge_annotation: propagate the annotation carried by sequence `seq` across the
 * columns of the alignment.
 *   A   : alignment.
 *   ST  : annotation alignment, first brought onto the columns of A by STseq2STaln.
 *   seq : name of the reference sequence, looked up in A->name; the failure branch
 *         below reports it and exits the program.
 * Columns where the reference annotation t is a gap are skipped. For the other columns
 * every sequence is visited and the test "t is not a digit, or t is a digit lower than
 * r" selects the cells to update.
 * NOTE(review): r is presumably the annotation of the visited sequence in the same
 * column -- confirm. */
2709 Alignment * merge_annotation ( Alignment *A, Alignment *ST, char *seq)
2713 ST=STseq2STaln (A, ST);
2716 s=name_is_in_list ( seq, A->name, A->nseq, 100);
/* The format holds two %s: the sequence name and the program name. Only PROGRAM used
 * to be passed, which left the second conversion without an argument. */
2720 add_warning ( stderr, "\nERROR: %s is not in your MSA [FATAL: %s]", seq, PROGRAM);
2721 myexit (EXIT_FAILURE);
2724 for (a=0; a<A->len_aln; a++)
2729 if (is_gap (t))continue;
2730 for (b=0; b<A->nseq; b++)
2736 if (!isdigit(t) || (isdigit (t) && t<r))
/* filter_aln_convert: edit the cells of A selected through the score alignment ST.
 *   A           : alignment edited in place; A->residue_case is set to KEEP_CASE.
 *   ST          : score alignment (may be NULL), first brought onto the columns of A.
 *   use_cons    : non-zero reads the scores from a consensus row of ST, searched under
 *                 the names "con", "cons" then "Cons".
 *   value       : score selecting a cell; -1 selects every cell; 10 (or a NULL ST)
 *                 forces the score of every cell to 11.
 *   n_symbol    : with a NULL symbol_list, one of UPPER, LOWER, SWITCHCASE or DELETE;
 *                 otherwise the size of symbol_list.
 *   symbol_list : conversion table handed to convert(), or NULL.
 * A cell is edited when its score equals value, when value is -1, or when the score is
 * NO_COLOR_RESIDUE. A digit score is read as its numeric value, any other character as
 * its character code.
 * NOTE(review): the consensus look-up scans ST->nseq+1 names, one more than nseq;
 * presumably the consensus is stored in that extra row -- confirm. */
2746 Alignment * filter_aln_convert ( Alignment *A, Alignment *ST,int use_cons, int value, int n_symbol,char **symbol_list)
2753 ST=STseq2STaln (A, ST);
2754 if ( ST && use_cons)
2756 cons=name_is_in_list ("con", ST->name,ST->nseq+1, 100);
2757 if ( cons==-1)cons=name_is_in_list ("cons", ST->name,ST->nseq+1, 100);
2758 if ( cons==-1)cons=name_is_in_list ("Cons", ST->name,ST->nseq+1, 100);
2762 fprintf (stderr, "WARNING: Could Not Use the Consensus Sequence [WARNING:%s]\n", PROGRAM);
2766 A->residue_case=KEEP_CASE;
2767 for ( a=0; a< A->nseq; a++)
/* Per-sequence mode: c is the row of ST bearing the same name as row a of A. */
2769 if(value!=10 && ST && !use_cons)
2771 c=name_is_in_list (A->name[a], ST->name, ST->nseq,100);
2775 for ( b=0; b< A->len_aln; b++)
/* st = score of the cell, from the forced value, the consensus row or row c. */
2777 if ( value==10 || !ST)st=11;
2778 else if ( ST && use_cons)
2780 st=(isdigit(ST->seq_al[cons][b]))?ST->seq_al[cons][b]-'0':ST->seq_al[cons][b];
2782 else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b];
2785 if ( st==value || value==-1 || st==NO_COLOR_RESIDUE)
2787 if ( n_symbol==UPPER && !symbol_list)A->seq_al[a][b]=toupper (A->seq_al[a][b]);
2788 else if ( n_symbol==LOWER && !symbol_list)A->seq_al[a][b]=tolower (A->seq_al[a][b]);
2789 else if ( n_symbol==SWITCHCASE && !symbol_list)
/* Non-alphabetic cells are left alone (empty statement). */
2791 if ( !isalpha(A->seq_al[a][b]));
2792 else if (isupper (A->seq_al[a][b]))A->seq_al[a][b]=tolower (A->seq_al[a][b]);
2793 else if (islower (A->seq_al[a][b]))A->seq_al[a][b]=toupper (A->seq_al[a][b]);
2795 else if ( n_symbol==DELETE && !symbol_list)A->seq_al[a][b]='-';
/* Table-driven conversion through symbol_list. */
2798 A->seq_al[a][b]=convert(A->seq_al[a][b],n_symbol,symbol_list);
/* sar_aln2motif: search the motif that best separates the sequences labelled 'I' from
 * the others for compound c, and print it on stdout.
 *   A   : sequence alignment.
 *   B   : activity alignment; B->seq_al[b][c]=='I' puts sequence b in the I group,
 *         anything else in the O group.
 *   pos : column selection (presumably non-zero entries are used -- confirm).
 *   c   : compound (column of B) analysed.
 * The selected columns are split into the static alignments I and O; a gap in a
 * selected column, or an empty I or O, ends the call with 0. One alphabet is built per
 * column of I, every alphabet combination is generated as a candidate motif, each
 * candidate is scored by counting matches in I (tp) and in O (fp), and the best one
 * according to rates2sensitivity is reported.
 * NOTE(review): I, O and alp_size are static, so the function keeps state between
 * calls and is not re-entrant. */
2808 char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c);
2809 char ** sar_aln2motif (Alignment *A, Alignment *B, int *pos, int c)
2811 static Alignment *I;
2812 static Alignment *O;
2815 float tp,tn,fp,fn,best, sp, sn, sen2;
2822 static int *alp_size;
2830 I=copy_aln(A, NULL);
2831 O=copy_aln(A, NULL);
/* Both work alignments start empty and are filled column by column. */
2836 I->nseq=O->nseq=I->len_aln=O->len_aln=0;
2837 for (a=0; a<A->len_aln; a++)
2841 for (i=o=0,b=0; b<A->nseq; b++)
2844 if ( is_gap(A->seq_al[b][a]))return 0;
2845 if (B->seq_al[b][c]=='I')I->seq_al[i++][I->len_aln]=A->seq_al[b][a];
2846 else O->seq_al[o++][O->len_aln]=A->seq_al[b][a];
2853 if (O->len_aln==0 || I->len_aln==0) return 0;
2856 for (a=0; a<o; a++)O->seq_al[a][O->len_aln]='\0';
2857 for (a=0; a<i; a++)I->seq_al[a][I->len_aln]='\0';
/* Note: the count and size arguments of vcalloc are swapped compared with the other calls. */
2859 alp=vcalloc ( sizeof (char**), I->len_aln);
2860 alp_size= vcalloc ( I->len_aln, sizeof (int));
2861 for (a=0; a<I->len_aln; a++)
2864 alp[a]=string2alphabet ( (col=aln_column2string (I,a)),2, &alp_size[a]);
2870 motif_list=generate_array_string_list (I->len_aln, alp, alp_size, &n, NULL, OVERLAP);
2871 best_pred=best_motif=0;
/* Candidate motif a: hits in I are true positives, hits in O false positives. */
2877 for (b=0; b<I->nseq; b++)
2879 if (match_motif (I->seq_al[b], motif_list[a]))tp++;
2882 for (b=0; b<O->nseq; b++)
2884 if (match_motif (O->seq_al[b], motif_list[a]))fp++;
2887 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
2889 if (best> best_pred)
2896 output_Alignment_without_header ( I, stdout);
2897 fprintf ( stdout, "\n");
2898 output_Alignment_without_header ( O, stdout);
2901 fprintf ( stdout, "\nMotifCompound %d pred: %.2f motif: ", c, best_pred);
/* n1 counts the motif positions made of a single symbol other than "*". */
2902 for (n1=0, a=0; a<I->len_aln; a++)
2906 m=motif_list[best_motif][a];
2907 fprintf ( stdout, "[%s]-", m);
2909 n1+=(l==1 && !strm ("*",m) )?1:0;
2911 fprintf (stdout, "SCORE: %d", n1);
2913 for (a=0; a<n; a++)vfree (motif_list[a]);
2915 free_arrayN((void ***) alp, 3);
/* explore_weight_matrix: recursive enumeration of weight vectors.
 *   A, B  : sequence and activity alignments given to sar_aln2r.
 *   range : number of values tried at each position (0 .. range-1).
 *   n     : position currently being assigned; callers start at 0.
 *   array : weight vector, one entry per column of A.
 * One branch prints the current vector and its sar_aln2r score; the other tries every
 * value in the range and recurses on position n+1.
 * NOTE(review): the print branch is presumably taken once n reaches A->len_aln --
 * confirm. The number of visited vectors grows as range^len_aln. */
2924 void explore_weight_matrix (Alignment *A, Alignment *B, int range, int n, int *array);
2925 void explore_weight_matrix (Alignment *A, Alignment *B, int range, int n, int *array)
2930 fprintf ( stdout, "\n W:");
2931 for (a=0; a<A->len_aln; a++)fprintf ( stdout, "%d", array[a]);
2932 fprintf ( stdout, " %.4f",(float)sar_aln2r(A,B,array,0));
2937 for ( a=0; a<range; a++)
2940 explore_weight_matrix (A, B, range, n+1, array);
/* search_best_combo_sar_aln: run search_best_combo on sliding windows of B.
 * S starts as a copy of B; for each window start a in [0, B->len_aln-w), every row of
 * S receives the characters of B from column a on and is terminated, then
 * search_best_combo(A, S) is called and a progress line is printed.
 * NOTE(review): w is presumably the window width and c runs from 0 to w-1 -- confirm. */
2944 float search_best_combo(Alignment *A, Alignment *B);
2945 void search_best_combo_sar_aln(Alignment *A, Alignment *B);
2946 void search_best_combo_sar_aln(Alignment *A, Alignment *B)
2953 S=copy_aln (B, NULL);
2955 for ( a=0; a<B->len_aln-w;a++)
2957 for (b=0; b<B->nseq; b++)
2961 S->seq_al[b][c]=B->seq_al[b][a+c];
2963 S->seq_al[b][c]='\0';
2966 s=search_best_combo (A, S);
2967 fprintf ( stdout,"\nP: XXXX \nP: XXXXX A=%d / %d", a, B->len_aln);
/* search_best_combo: exploratory search for the set of columns of A whose content best
 * correlates (sar_aln2r) with the activity alignment B.
 *   combo_mode 1 : greedy search. At each of `max` rounds the window start that gives
 *                  the best score together with the columns already in `list` is added;
 *                  the search stops when the same position is selected twice in a row.
 *   combo_mode 2 : exhaustive search over the combinations written to a temporary file
 *                  by generate_array_int_list, combined with hard-coded preset columns.
 * The selected columns (list[0..nl-1]) are then used for motif reports, for the dumps
 * to "aln.aln" and "compound.fasta", and for sensitivity/specificity statistics.
 * NOTE(review): this is experimental code. It calls myexit(EXIT_FAILURE) right after
 * the sar_aln2motif loop, so unless that call is guarded everything after it is
 * unreachable -- confirm. It also runs "cp <tmp> testfile" and contains hard-coded
 * preset columns (353, 361). */
2973 float search_best_combo(Alignment *A, Alignment *B)
2975 int a, b, c, d, best_pos,nl, max;
2976 float best_score, score;
2980 int combo_mode=1; //1: greedy 2: consider all thw w combinations;
/* pos is a 0/1 mask over the columns of A; list holds the columns retained so far. */
2987 pos=vcalloc ( A->len_aln, sizeof (int));
2988 list=vcalloc (A->len_aln, sizeof (int));
2993 for (a=0; a< max; a++)
2995 for (best_score=-9999,best_pos=0,b=0; b< A->len_aln-w; b++)
/* Score = already retained columns + the w columns starting at b. */
2997 for (c=0; c<nl; c++)pos[list[c]]=1;
2998 for (c=0; c<w; c++)pos[b+c]=1;
2999 score=sar_aln2r(A,B,pos,0);
3000 if ( score>best_score)
3005 for (c=0; c<w; c++)pos[b+c]=0;
/* NOTE(review): if nl is 0 on the first round this reads list[-1] -- confirm. */
3007 if (best_pos==list[nl-1])break;
3008 list[nl++]=best_pos;
3009 for (b=0; b<nl; b++) pos[list[b]]=1;
3010 fprintf ( stdout, "\n%2d P: %d S:%.3f Delta= %d", nl,best_pos, best_score, (int)sar_aln2delta(A,B, pos,0));
3011 for (b=0; b<nl; b++) pos[list[b]]=0;
3015 for (a=0; a<nl; a++) pos[list[a]]=1;
3016 fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));
3019 else if ( combo_mode==2)
3025 int *preset, n_preset;
3027 tmpf=vtmpnam (NULL);
3029 generate_array_int_list (max, 0,A->len_aln-1, 1,NULL, tmpf);
3030 printf_system ( "cp %s testfile", tmpf);
3031 buf=vcalloc ( 1000, sizeof (char));
3032 fp=vfopen (tmpf, "r");
3036 preset=vcalloc (A->len_aln, sizeof (int));
3037 preset[n_preset++]=353;
3038 preset[n_preset++]=361;
3039 //preset[n_preset++]=365;
3040 //preset[n_preset++]=187;
3041 //preset[n_preset++]=397;
3042 //preset[n_preset++]=492;
/* One combination per line of the temporary file; entries are read from index 1. */
3045 while ( (buf=vfgets ( buf, fp))!=NULL)
3048 array=string2num_list (buf);
3050 for (a=1; a<=max; a++)
3054 for ( a=0; a<n_preset; a++)pos[preset[a]]=1;
3056 score=sar_aln2r(A,B,pos,0);
3058 if ( score>best_score)
3061 fprintf ( stdout, "\n");
3062 for (a=0; a<n_preset; a++)fprintf (stdout, "%2d ", preset[a]);
3063 for (a=1; a<=max; a++)fprintf (stdout, "%2d ", array[a]);
3064 fprintf ( stdout, " R: %.3f", best_score);
3065 for (nl=0,a=0; a<n_preset; a++)list[nl++]=preset[a];
3066 for (a=1; a<=max; a++)list[nl++]=array[a];
3068 //if ( score!=0)HERE ("R=%.2f", score);
3069 for (b=1; b<=max; b++)
3073 fprintf ( stdout, "\n");
3075 //for (a=0; a<max; a++)fprintf (stdout, "%2d ", array[best_pos][a]);
3076 //fprintf ( stdout, " R: %.3f", best_score);
/* Motif report for every compound (column of B). */
3078 for (c=0; c<B->len_aln; c++)
3080 sar_aln2motif (A,B,pos, c);
3083 myexit (EXIT_FAILURE);
3084 HERE ("***************");
/* Dump of the retained columns in FASTA format. */
3085 fp2=vfopen ("aln.aln", "w");
3086 for (a=0; a<A->nseq; a++)
3088 fprintf (fp2, ">%s\n", A->name[a]);
3089 for ( b=0; b<nl; b++)fprintf (fp2, "%c", A->seq_al[a][list[b]]);
3090 fprintf ( fp2, "\n");
3093 HERE ("Output aln.aln");
3096 float tp=0, tn=0, fp=0, fn=0, pp2=0,pp=0, sn,sn2, sp;
3097 int **result,**result2,**compound_score, *ref_score,n2,n, s, p, c;
3101 compound_score=declare_int (B->len_aln, 2);
3102 ref_score=vcalloc (nl, sizeof (int));
3104 result=declare_int (B->len_aln*A->nseq*A->nseq, 2);
3105 result2=declare_int (B->len_aln*A->nseq*A->nseq, 2);
/* Per compound: pairwise substitution score on the retained columns versus the
 * agreement of the two activity labels. */
3107 for (n2=c=0; c< B->len_aln; c++)
3112 if (!M)M=read_matrice ("blosum62mt");
3113 for (n=0,a=0; a<A->nseq-1; a++)
3115 for (b=a+1; b<A->nseq;b++)
3117 for (s=0,p=0; p<nl; p++)
3121 r1=A->seq_al[a][list[p]];
3122 r2=A->seq_al[b][list[p]];
3123 if ( !is_gap (r1) && !is_gap(r2))s+=M[r1-'A'][r2-'A'];
3125 result2[n2][0]=result[n][0]=s;
3127 sar1=B->seq_al[a][c];sar2=B->seq_al[b][c];
3129 if (sar1=='I' && sar1==sar2)
3131 result2[n2][1]=result[n][1]=1;
3135 else if ( sar1==sar2 && sar1=='O')
3141 result2[n2][1]=result[n][1]=0;
3144 //else if ( s1==s2=='O')result[n][1]=-1;
3149 sort_int_inv (result, 2, 0, 0, n-1);
3152 for (tp=0,a=0; a<n; a++)
3155 if ((pp-tp) == (a-tp))break;
3164 fprintf ( stdout, "\nCompound %3d sn: %.3f sn2: %.3f sp: %.3f MIN: %.3f",c,sn, sn2,sp, MIN((MIN(sn,sn2)),sp));
3165 compound_score[c][0]=c;
3166 compound_score[c][1]=1000*MIN((MIN(sn,sn2)),sp);
3169 sort_int_inv (compound_score,2, 1, 0, B->len_aln-1);
3171 fp2=vfopen ("compound.fasta", "w");
3172 for (d=0; d<nl; d++)
3175 for (n=0,a=0;a<A->nseq; a++)
3176 for (b=0; b<A->nseq; b++)
/* NOTE(review): r1 and r2 both read row b, so a residue is scored against itself;
 * the loop indices suggest r1 was meant to read row a -- confirm. */
3178 r1= A->seq_al[b][list[d]];
3179 r2= A->seq_al[b][list[d]];
3180 if (is_gap(r1) || is_gap(r2))continue;
3183 ref_score[d]+=M[r1-'A'][r2-'A'];
3189 AO=copy_aln (A, NULL);
3190 AI=copy_aln (A,NULL);
3191 AO->len_aln=AI->len_aln=nl;
3192 for (a=0; a<A->nseq; a++)AO->seq_al[a][nl]=AI->seq_al[a][nl]='\0';
/* Per compound, in the order of compound_score: print the labels, then the retained
 * columns of the sequences that are not 'O' (AI) and of those that are not 'I' (AO). */
3194 for (a=0; a<B->len_aln; a++)
3196 fprintf (stdout, "\n>%4d %4d ", compound_score[a][0], compound_score[a][1]);
3197 for (b=0; b<B->nseq; b++) fprintf (stdout, "%c", B->seq_al[b][compound_score[a][0]]);
3198 fprintf ( stdout, "\n");
3200 for (AI->nseq=0,b=0; b<B->nseq; b++)
3202 if (B->seq_al[b][compound_score[a][0]]=='O')continue;
3203 fprintf ( stdout, "\n\t");
3204 for (c=0; c<nl; c++)
3206 fprintf ( stdout, "%c", A->seq_al[b][list[c]]);
3207 AI->seq_al[AI->nseq][c]=A->seq_al[b][list[c]];
3211 fprintf ( stdout, "\n\t");
3212 for (d=0; d<nl; d++)
3214 for (score=0,n=0,b=0; b<B->nseq; b++)
3216 if (B->seq_al[b][compound_score[a][0]]=='O')continue;
3217 for (c=0; c<B->nseq; c++)
3219 if (B->seq_al[c][compound_score[a][0]]=='O')continue;
/* NOTE(review): same pattern as above, r2 reads row b although the inner loop runs
 * over c -- confirm. */
3223 r1= A->seq_al[b][list[d]];
3224 r2= A->seq_al[b][list[d]];
3225 if (is_gap(r1) || is_gap(r2))continue;
3226 else score+=M[r1-'A'][r2-'A'];
3232 if ((float)score/(float)ref_score[d]>1.2)fprintf ( stdout, "*");
3233 else fprintf ( stdout, " ");
3235 for (AO->nseq=0,b=0; b<B->nseq; b++)
3237 if (B->seq_al[b][compound_score[a][0]]=='I')continue;
3238 fprintf ( stdout, "\n\t");
3239 for (c=0; c<nl; c++)
3241 AO->seq_al[AO->nseq][c]=A->seq_al[b][list[c]];
3242 fprintf ( stdout, "%c", A->seq_al[b][list[c]]);
3246 simI=aln2sim (AI, "blosum62mt"); simO=aln2sim (AO, "blosum62mt");
3247 fprintf ( stdout, "\nDELTA: I: %d O: %d %d",simI,simO, simI-simO);
/* Dump of the first half of the compounds, in compound_score order. */
3251 for ( a=0; a<B->nseq; a++)
3254 fprintf ( fp2, ">%s\n", B->name[a]);
3255 for (b=0; b<B->len_aln/2; b++)
3256 fprintf ( fp2, "%c", B->seq_al[a][compound_score[b][0]]);
3257 fprintf (fp2, "\n");
3260 HERE ("OUTPUT compound.fasta");
3265 sort_int_inv (result, 2, 0, 0, n-1);
3268 for (tp=0,a=0; a<n; a++)
3271 if ((pp-tp) == (a-tp))break;
3280 fprintf ( stdout, "\nTOT: sn: %.3f sn2: %.3f sp: %.3f MIN: %.3f",sn, sn2,sp, MIN((MIN(sn,sn2)),sp));
3283 HERE ("Delta= %d", delta);
/* C receives the retained columns, then every weight vector with entries in 0..5 is
 * explored on it. */
3287 C=copy_aln(A, NULL);
3288 for (a=0; a< nl; a++)
3289 for (b=0; b<A->nseq; b++)
3290 C->seq_al[b][a]=A->seq_al[b][list[a]];
3292 array=vcalloc (C->len_aln, sizeof (int));
3293 explore_weight_matrix (C, B, 6,0, array);
/* count_misc: scratch driver chaining several exploratory analyses of the relation
 * between the sequence alignment A and the activity alignment B: best column
 * combination, sliding-window correlations, weight exploration, exhaustive 6-column
 * scan, dipeptide sharing and a final sequence-similarity versus activity-similarity
 * correlation (return_r).
 * NOTE(review): the function calls search_best_combo and then myexit(EXIT_FAILURE)
 * near its top, and further myexit calls follow each stage; unless these are guarded,
 * the later stages never run -- confirm which stages are live. */
3300 void count_misc (Alignment *A, Alignment *B)
3302 int **done, a, b, c, d, e,f, g, *list, n, score;
3307 search_best_combo (A,B);
3308 myexit (EXIT_FAILURE);
3309 pos=vcalloc (A->len_aln+1, sizeof (int));
3318 fprintf ( stdout, "\nR: %3f " ,(float)sar_aln2r(A,B,pos,1));myexit (EXIT_FAILURE);
/* Sliding window of width w over the columns of A. */
3320 for (a=0; a< A->len_aln-w; a++)
3334 fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a+1, (float)sar_aln2r(A,B,pos,0));
3341 myexit (EXIT_FAILURE);
3342 for (a=0; a<w; a++) pos[a]=1;
3343 for (a=w; a< A->len_aln-1; a++)
3347 fprintf ( stdout, "\nP: %3d W:2 R: %3f ",a, (float)sar_aln2r(A,B,pos,0));
3350 myexit (EXIT_FAILURE);
3356 explore_weight_matrix (A, B,3, 0,pos);
3357 myexit (EXIT_FAILURE);
/* Exhaustive scan of six column indices: len_aln^6 evaluations. */
3359 for (a=0; a<A->len_aln; a++)
3360 for ( b=0; b<A->len_aln; b++)
3361 for (c=0; c<A->len_aln; c++)
3362 for (d=0; d<A->len_aln; d++)
3363 for (f=0; f<A->len_aln; f++)
3364 for (g=0; g<A->len_aln; g++)
3374 fprintf ( stdout, "\n%d %d %d %d %d %d %.3f", a, b,c,d,f, g, sar_aln2r(A,B, pos,0));
3378 myexit (EXIT_FAILURE);
3381 slist=declare_double (A->nseq*A->nseq*10, 2);
3382 done=declare_int (256, 256);
3383 list=vcalloc ( A->nseq, sizeof (int));
/* Dipeptide sharing: for each pair of adjacent columns, the sequences showing the same
 * two residues are listed and their average get_sar_sim2 score is printed. */
3385 for (a=0; a<A->len_aln-1; a++)
3387 for (b =0; b<256; b++)for (c=0; c<256; c++)done[b][c]=0;
3389 for (b=0; b<A->nseq-1; b++)
3393 r2=A->seq_al[b][a+1];
3394 if (done[r1][r2])continue;
3398 fprintf ( stdout, "\n%3d %c%c: %s ",a+1, r1, r2, A->name[b]);
3399 for ( c=b+1; c<A->nseq; c++)
3401 if (r1==A->seq_al[c][a] && r2==A->seq_al[c][a+1])
3403 fprintf ( stdout, "%s ", A->name[c]);
/* NOTE(review): e counts the pairs; score/e divides by zero when fewer than two
 * sequences were collected -- confirm a guard exists. */
3410 for (e=0,score=0,c=0; c<n-1; c++)
3411 for (d=c+1; d<n; d++,e++)
3412 score+=get_sar_sim2(B->seq_al[list[c]], B->seq_al[list[d]]);
3413 fprintf ( stdout, " Score=%d", score/e);
3417 for (score=0,e=0,a=0; a<A->nseq-1; a++)
3418 for (b=a+1; b<A->nseq; b++,e++)
3420 score+=get_sar_sim2(B->seq_al[a], B->seq_al[b]);
3422 fprintf (stdout,"AVG=%d", score/e);
/* For every pair of sequences: sim = similarity over A (as a percentage of len_aln),
 * score = percentage of compounds with identical labels (both 'I' or both 'O'). */
3423 for (n=0,a=0; a< A->nseq-1; a++)
3427 if (!M)M=read_matrice ("blosum62mt");
3430 for (b=a+1; b<A->nseq; b++)
3432 int n11, n01, n10, n00, n1;
3434 for (sim=d=0;d<A->len_aln; d++)
3440 //sim +=(M[r1-'A'][r2-'A']>0)?1:0;
3443 sim=(100*sim)/(A->len_aln);//+rand()%10;
3444 for (n1=n00=n11=n10=n01=score=0, d=0; d<B->len_aln; d++)
3449 n11+=(r1=='I' && r2=='I');
3450 n00+=(r1=='O' && r2=='O');
/* NOTE(review): this line tests the digit '0' whereas the lines around it test the
 * letter 'O'; n10 presumably never increments -- confirm the intended character. */
3451 n10+=(r1=='I' && r2=='0');
3452 n01+=(r1=='O' && r2=='I');
3453 n1+=(r1=='I' || r2=='I');
3455 score =((n11+n00)*100)/B->len_aln;
3457 //score=get_sar_sim2(B->seq_al[a], B->seq_al[b]);
3459 fprintf ( stdout, "\nSIM: %d SC: %d", sim, score);
3460 slist[n][0]=(double)sim;
3461 slist[n][1]=(double)score;
3465 r=return_r(slist, n);
3466 fprintf ( stdout, "\nR= %.4f", (float)r[0]);
3467 myexit (EXIT_FAILURE);
/* aln2ngap: add up is_gap() over every cell of A (all columns, all sequences) into
 * ngap (presumably the value returned -- confirm). */
3470 int aln2ngap ( Alignment *A)
3473 for (a=0; a< A->len_aln; a++)
3474 for (b=0; b<A->nseq; b++) ngap+=is_gap (A->seq_al[b][a]);
/* count_in_aln: count, per symbol class, the cells of A selected through ST.
 *   A           : alignment scanned; A->residue_case is set to KEEP_CASE.
 *   ST          : score alignment (may be NULL); rows are matched to A by name.
 *   value       : score selecting a cell; -1 selects every cell; 10 (or a NULL ST)
 *                 forces the score of every cell to 11.
 *   n_symbol    : number of classes.
 *   symbol_list : one character set per class, tested with is_in_set.
 *   table       : counters updated in place; allocated here when NULL.
 * NOTE(review): if no row of ST bears the name of A->name[a], the search loop ends
 * with c==ST->nseq and ST->seq_al[c] is read beyond the last sequence -- confirm
 * callers always supply matching names. */
3477 int * count_in_aln ( Alignment *A, Alignment *ST, int value, int n_symbol,char **symbol_list, int *table)
3482 if (!table)table=vcalloc (n_symbol, sizeof (int));
3484 A->residue_case=KEEP_CASE;
3485 for ( a=0; a< A->nseq; a++)
3487 if(value!=10 && ST)for ( c=0; c< ST->nseq; c++)if ( strm(ST->name[c], A->name[a]))break;
3488 for ( b=0; b< A->len_aln; b++)
3490 if ( value==10 || !ST)st=11;
3491 else st=(isdigit(ST->seq_al[c][b]))?ST->seq_al[c][b]-'0':ST->seq_al[c][b];
3492 if ( st==value || value==-1)
3494 for ( d=0; d<n_symbol; d++)table[d]+=is_in_set ( A->seq_al[a][b], symbol_list[d]);
/* dna_aln2cons_seq: build a consensus for the two-row DNA alignment A while
 * accumulating per-column base counts across successive calls.
 * State kept in static variables: column_count (pool of 5-int count vectors),
 * old_tot_count / new_tot_count (per-column pointers into that pool, swapped at the
 * end of each call), string1 / string2 (working copies of the two rows) and
 * MAX_EST_SIZE (current capacity, grown by size_increment).
 * Count vector layout: NA=0, NG=1, NC=2, NT=3, IGAP=4; the same counts are mirrored in
 * A->P->count. The consensus character of a column is the base with the highest count
 * (best_int over a, g, c, t).
 * NOTE(review): column_count is advanced with ++ while vectors are handed out, and the
 * growth branch calls vfree(column_count[a]) and vfree(column_count) on that pointer;
 * confirm the pointer is reset to the start of the pool before being freed.
 * NOTE(review): the growth branch enlarges the buffers by one size_increment only;
 * confirm this is enough when len_aln exceeds MAX_EST_SIZE by more than that. */
3501 char *dna_aln2cons_seq ( Alignment *A)
3504 static int **column_count;
3505 static int **old_tot_count;
3506 static int **new_tot_count;
3507 static char *string1, *string2;
3510 int NA=0, NG=1, NC=2, NT=3, IGAP=4;
3511 static int MAX_EST_SIZE=10000;
3512 static int size_increment=1000;
3514 int overlap=0, best_overlap=0;
3517 seq=vcalloc ( A->len_aln+1, sizeof (char));
/* Initial allocation of the static buffers (presumably on the first call only). */
3521 column_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
3522 for ( a=0; a< MAX_EST_SIZE; a++)
3523 column_count[a]=vcalloc (5, sizeof (int));
3525 old_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
3526 new_tot_count=vcalloc(MAX_EST_SIZE, sizeof (int*));
3527 A->P=declare_profile( "agct-",MAX_EST_SIZE);
3528 string1=vcalloc (MAX_EST_SIZE, sizeof (char));
3529 string2=vcalloc (MAX_EST_SIZE, sizeof (char));
/* Alignment longer than the current capacity: release and re-allocate larger buffers. */
3531 else if (A->len_aln>MAX_EST_SIZE)
3535 for ( a=0; a< MAX_EST_SIZE; a++)
3536 vfree(column_count[a]);
3537 vfree(column_count);
3538 vfree(old_tot_count);
3539 vfree(new_tot_count);
3544 column_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
3545 for ( a=0; a< MAX_EST_SIZE+ size_increment; a++)
3546 column_count[a]=vcalloc (5, sizeof (int));
3548 old_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
3549 new_tot_count=vcalloc(MAX_EST_SIZE+ size_increment, sizeof (int*));
/* The counts accumulated in the profile are carried over into the new vectors. */
3551 for (a=0; a< MAX_EST_SIZE; a++)
3553 old_tot_count[a]=*(column_count++);
3554 for ( b=0; b<5; b++)old_tot_count[a][b]=(A->P)->count[b][a];
3556 free_int ( (A->P)->count, -1);
3558 (A->P)->count=declare_int (5, MAX_EST_SIZE+ size_increment);
3559 (A->P)->max_len=MAX_EST_SIZE+ size_increment;
3560 MAX_EST_SIZE+= size_increment;
3561 string1=vcalloc (MAX_EST_SIZE, sizeof (char));
3562 string2=vcalloc (MAX_EST_SIZE, sizeof (char));
/* Working copies of the two rows; internal gaps are re-marked as '.'. */
3566 sprintf ( string1, "%s",A->seq_al[0]);
3567 sprintf ( string2, "%s",A->seq_al[1]);
3570 string1=mark_internal_gaps(string1,'.');
3571 string2=mark_internal_gaps(string2,'.');
3575 for (b=0,a=0; a< A->len_aln; a++)
3586 best_overlap=MAX(overlap, best_overlap);
/* A column with a residue in row 1 re-uses the next accumulated vector (index b);
 * otherwise a fresh vector is taken from the pool. */
3591 if (!is_gap(r1) && first==1)new_tot_count[a]=old_tot_count[b++];
3592 else if (is_gap(r1) || first==0){new_tot_count[a]=*column_count;column_count++;};
/* Row 1 contribution: the base is counted; a gap adds nothing (empty statement). */
3596 if(r1=='a') new_tot_count[a][NA]++;
3597 else if ( r1=='g')new_tot_count[a][NG]++;
3598 else if ( r1=='c')new_tot_count[a][NC]++;
3599 else if ( r1=='t')new_tot_count[a][NT]++;
3600 else if (is_gap(r1));
/* Remaining case (presumably an ambiguous symbol): all four bases are incremented. */
3603 new_tot_count[a][NA]++;
3604 new_tot_count[a][NG]++;
3605 new_tot_count[a][NC]++;
3606 new_tot_count[a][NT]++;
/* Internal gap in row 1: IGAP grows by the total base count of the previous column. */
3609 if ( a> 0 && a<A->len_aln-1 && r1=='.')
3611 new_tot_count[a][IGAP]+=((new_tot_count[a-1][NA]+new_tot_count[a-1][NG]+new_tot_count[a-1][NC]+new_tot_count[a-1][NT]));
/* Row 2 contribution: same scheme, with '.' counted as a single internal gap. */
3615 if(r2=='a') new_tot_count[a][NA]++;
3616 else if ( r2=='g')new_tot_count[a][NG]++;
3617 else if ( r2=='c')new_tot_count[a][NC]++;
3618 else if ( r2=='t')new_tot_count[a][NT]++;
3619 else if ( r2=='.')new_tot_count[a][IGAP]++;
3623 new_tot_count[a][NA]++;
3624 new_tot_count[a][NG]++;
3625 new_tot_count[a][NC]++;
3626 new_tot_count[a][NT]++;
/* Mirror the counts of the column into the profile of A. */
3628 (A->P)->count[0][a]=new_tot_count[a][NA];
3629 (A->P)->count[1][a]=new_tot_count[a][NG];
3630 (A->P)->count[2][a]=new_tot_count[a][NC];
3631 (A->P)->count[3][a]=new_tot_count[a][NT];
3632 (A->P)->count[4][a]=new_tot_count[a][IGAP];
/* Consensus character = base with the highest count. */
3634 best_int(4,1, &best,new_tot_count[a][NA], new_tot_count[a][NG],new_tot_count[a][NC],new_tot_count[a][NT]);
3635 if( best==0) seq[a]='a';
3636 else if ( best==1)seq[a]='g';
3637 else if ( best==2)seq[a]='c';
3638 else if ( best==3)seq[a]='t';
3644 fprintf ( stderr, "[Best Overlap: %d Residues]", best_overlap);
/* The new counts become the "old" ones for the next call. */
3645 count_buf=old_tot_count;
3646 old_tot_count=new_tot_count;
3647 new_tot_count=count_buf;
/* aln2cons_maj: majority consensus over the ns sequences of A listed in ls.
 *   ls : indices of the sequences; one branch builds the identity list over A->nseq
 *        entries (presumably when ls is NULL -- confirm), released at the end through
 *        clean_ls.
 *   n_groups, group_list : do not appear in the statements below.
 * aa is a table of 1000 counters indexed by the lower-cased character, allocated once
 * and kept between calls. For each column the non-gap character with the highest count
 * is retained, then the counters touched by the column are reset to 0.
 * NOTE(review): len is taken from A->seq_al[ls[0]] before the branch that builds ls;
 * confirm a NULL ls cannot reach that statement. */
3653 char *aln2cons_maj ( Alignment *A, int ns, int *ls, int n_groups, char **group_list)
3661 if ( !aa) aa=vcalloc (1000, sizeof (int));
3663 len=strlen (A->seq_al[ls[0]]);
3664 seq=vcalloc (len+1, sizeof (char));
3669 ls=vcalloc ( A->nseq, sizeof (int));
3670 for ( a=0; a< A->nseq; a++)ls[a]=a;
3674 for ( a=0; a<len; a++)
3676 int best_s=0, best_aa=0, r;
3677 for (b=0; b< ns; b++)
3679 r=tolower(A->seq_al[ls[b]][a]);
3681 if (!is_gap(r) && aa[r]>best_s)
/* Reset the counters used by this column. */
3688 for (best_s=0, best_aa=0,b=0; b< ns; b++)
3690 aa[tolower(A->seq_al[ls[b]][a])]=0;
3693 if ( clean_ls)vfree(ls);
/* aln2cons_seq: group-based consensus over the ns sequences of A listed in ls.
 *   ls         : indices of the sequences; one branch builds the identity list over
 *                A->nseq entries (presumably when ls is NULL -- confirm).
 *   n_groups   : number of residue groups.
 *   group_list : one character set per group; one branch builds a default list of 26
 *                single-letter groups 'a'..'z', freed at the end through aa_group.
 * For each column a counter array `group` is allocated, each non-gap (lower-cased)
 * residue is tested against every group, and the group with the highest count gives
 * its first character to the consensus.
 * NOTE(review): len is taken from A->seq_al[ls[0]] before the branch that builds ls;
 * confirm a NULL ls cannot reach that statement. Confirm also that `group`, allocated
 * once per column, is released per column. */
3699 char *aln2cons_seq ( Alignment *A, int ns, int *ls, int n_groups, char **group_list)
3709 len=strlen (A->seq_al[ls[0]]);
3710 seq=vcalloc (len+1, sizeof (char));
3715 ls=vcalloc ( A->nseq, sizeof (int));
3716 for ( a=0; a< A->nseq; a++)ls[a]=a;
3723 group_list=declare_char ( 26, 2);
3724 for ( a=0; a<26; a++)group_list[a][0]=a+'a';
3730 for ( a=0; a<len; a++)
3732 group=vcalloc (n_groups+1, sizeof (int));
3733 for (best_group=0,b=0; b< ns; b++)
3735 if ( !is_gap(A->seq_al[ls[b]][a]))
3737 for (c=0; c< n_groups; c++)
3738 if ( is_in_set (tolower(A->seq_al[ls[b]][a]), group_list[c]))
3740 best_group=(group[c]>group[best_group])?c:best_group;
3743 seq[a]=group_list[best_group][0];
3748 if ( aa_group) free_char (group_list, -1);
3750 if ( clean_ls)vfree(ls);
/* aln2conservation: report the per-column conservation of one reference sequence on
 * stdout, then exit the program (myexit(EXIT_SUCCESS)).
 *   A         : alignment; the reference row is re-cased in place.
 *   threshold : conservation (0-100) from which a reference residue is upper-cased;
 *               below it the residue is lower-cased.
 *   seq       : name of the reference sequence, looked up in A->name; an unknown name
 *               is fatal.
 * pos[a] is c*100/nseq for column a, and eval[a] accumulates log terms weighted by the
 * identity (idmat) between the reference and each sequence. The output is a FASTA-like
 * header for the reference, one "#" line per column (character, conservation, and for
 * non-gap columns the log-odd against the average, the position and the averaged
 * E-value), and the average conservation.
 * NOTE(review): c is presumably the number of sequences agreeing with the reference in
 * the window [a-w, a+w] -- confirm.
 * NOTE(review): eval is allocated with sizeof(int) but accumulates log() values and is
 * printed with a floating-point conversion; confirm its element type matches the
 * allocation. */
3755 Alignment *aln2conservation ( Alignment *A, int threshold,char *seq)
3757 int a, b, c, d, i, c1, c2;
3765 pos =vcalloc (A->len_aln, sizeof (int));
3766 eval=vcalloc (A->len_aln, sizeof (int));
3767 sim=aln2sim_mat (A, "idmat");
3768 if (seq)i=name_is_in_list (seq, A->name, A->nseq, 100);
/* The message contains a %s: the name is now supplied (it used to be missing, leaving
 * the conversion without an argument). */
3771 if ( i==-1) {HERE ("%s is an unknown:sequence [FATAL]", (seq)?seq:"(null)"); myexit (EXIT_FAILURE);}
3773 for (a=0; a<A->len_aln; a++)
/* Window of columns centred on a; positions outside the alignment are skipped. */
3777 for (c=0,e=a-w; e<=a+w; e++)
3779 if (e<0 || e==A->len_aln)continue;
3780 c1=toupper (A->seq_al[i][e]);
3781 for (b=0; b<A->nseq; b++)
3783 c2=toupper (A->seq_al[b][a]);
/* s = identity fraction between the reference and b, or its complement. */
3787 s=(double)((double)sim[i][b]/(double)(100));
3792 s=(double)(((double)100-(double)sim[i][b])/(double)(100));
3794 eval[a]+=(s==0)?0:log(s);
3797 pos[a]=(c*100)/A->nseq;
3798 if (!is_gap(c1)){tot+=pos[a]; tn++;}
3800 if (pos[a]>=threshold)A->seq_al[i][a]=toupper (A->seq_al[i][a]);
3801 else A->seq_al[i][a]=tolower (A->seq_al[i][a]);
3803 fprintf (stdout, ">%s %s [i=%d]\n%s\n", A->name[i],A->aln_comment[i],i, A->seq_al[i]);
/* Average conservation over the non-gap positions of the reference. */
3804 tot=(tn>0)?(float)tot/(float)tn:0;
3806 for (d=0,a=0; a<A->len_aln; a++)
3808 fprintf (stdout, "# %c %4d", A->seq_al[i][a],pos[a]);
3811 if ( !is_gap (A->seq_al[i][a]))
3813 fprintf (stdout, " LogOdd: %6.2f ", (tot==0 || pos[a]==0)?0:(float)log((float)pos[a]/tot));
3814 fprintf ( stdout, " Pos: %5d E-Val: %9.2f", ++d, eval[a]/(A->nseq));
3816 fprintf ( stdout, "\n");
3818 fprintf ( stdout, "#average conservation: %.2f", tot);
3819 myexit (EXIT_SUCCESS);
// aln2cons_seq_mat: consensus of the whole alignment. Forwards to sub_aln2cons_seq_mat
// with all A->nseq sequences and no index list (NULL), using the matrix named mat_name.
3821 char *aln2cons_seq_mat ( Alignment *A, char *mat_name)
3823 return sub_aln2cons_seq_mat (A, A->nseq, NULL, mat_name);
// sub_aln2cons_seq_mat2: variant of sub_aln2cons_seq_mat where the subset is given as
// ns sequence names (ls) rather than indices.
3825 char *sub_aln2cons_seq_mat2 ( Alignment *A,int ns, char **ls, char *mat_name)
// translate the names into indices within A->name, then delegate
3829 list=name_array2index_array(ls, ns, A->name, A->nseq);
3830 cons=sub_aln2cons_seq_mat ( A,ns, list, mat_name);
// sub_aln2cons_seq_mat: matrix-based consensus. For every column, 20 candidate residues
// are scored against the non-gap residues of the selected sequences with the
// substitution matrix mat_name, and the best-scoring candidate is retained.
// ls==NULL selects sequences 0..ns-1; otherwise ls holds ns indices, -1 entries being skipped.
3835 char *sub_aln2cons_seq_mat ( Alignment *A,int ns, int *ls, char *mat_name)
3840 int score=0, best_score=0, best_r=0;
3844 mat=read_matrice (mat_name);
// column count is taken from the first selected sequence
3845 len=strlen ( A->seq_al[(ls==NULL)?0:ls[0]]);
3846 seq=vcalloc (len+1, sizeof (char));
3847 for ( a=0; a<len; a++)
// b enumerates the 20 candidate residues
3849 for (b=0; b<20; b++)
// naa and score are reset for each candidate
3852 for ( naa=0,score=0,c=0; c<ns; c++)
3854 s=(ls==NULL)?c:ls[c];
3855 if ( ls && ls[c]==-1) continue;
3856 else if (is_gap(A->seq_al[s][a]))continue;
// the matrix is indexed by letter offset from 'A' (assumes upper-case r1/r2 - TODO confirm)
3861 score+=mat[r1-'A'][r2-'A'];
// a column without any residue yields a gap
3864 if (naa==0)best_r='-';
// the first candidate always initialises the best score; later ones must beat it strictly
3865 if ( b==0 || score>best_score){best_score=score; best_r=r1;}
// seq_list2in_file: writes into file the input expected by method M for the sequences
// designated by list. Depending on the first letter of M->seq_type, either the plain
// sequences or the content of their template files are written.
3873 int seq_list2in_file ( TC_method *M, Sequence *S, char *list, char *file)
// t is the lower-cased first character of the method sequence type
3881 t=tolower(M->seq_type[0]);
// plain-sequence case: delegate to the FASTA writer with the method output mode
3885 return seq_list2fasta_file ( S, list, file, M->out_mode);
3896 fp=vfopen ( file, "w");
3897 slist=string2num_list (list);
// only a single-letter seq_type is supported on this path
3900 if (strlen (M->seq_type) >1)
3902 add_warning( stderr, "\nERROR: Mixed seq_type not supported for external methods\n[FATAL:%s]", PROGRAM);
// sequence indices start at slot 2 of slist (presumably slots 0-1 hold counts - TODO confirm)
3905 for ( a=2; a<n; a++)
// select the template matching the type letter: p, r or g
3908 if (t=='p')T=(S->T[s])->P;
3909 else if (t=='r')T=(S->T[s])->R;
3910 else if (t=='g')T=(S->T[s])->G;
3914 fprintf ( fp, ">%s\n%s%s", S->name[s], S->seq[s], LINE_SEPARATOR);
// when a template file is available, its content is copied verbatim, char by char
3916 else if ( T && T->template_file && T->template_file[0])
3918 fp2=vfopen (T->template_file, "r");
3919 while ( (c=fgetc (fp2))!=EOF)
3921 fprintf ( fp, "%c", c);
3923 fprintf (fp, "%s", LINE_SEPARATOR);
// trailer line listing the names of all the target sequences
3928 fprintf (fp, "TARGET_SEQ_NAME: ");
3929 for (a=2; a<n; a++)fprintf ( fp, "%s ", (S->name[slist[a]]));
3930 fprintf ( fp, "%s", LINE_SEPARATOR);
3932 vfclose (fp); vfree (slist);
// seq_list2fasta_file: writes sequences of S to file in FASTA form, each header
// carrying two name fields. Two paths are visible: one writing every sequence of S,
// one writing the sequences whose indices are parsed from list.
// With outmode "aln" the first header field is the decoded name (decode_name ... CODE).
3940 int seq_list2fasta_file( Sequence *S, char *list, char *file, char *outmode)
3947 //out_mode: names can only be re-converted when out mode is aln
3949 /*Buf is used because cmalloced functions cannot go through strtok*/
3953 fp=vfopen ( file, "w");
// all-sequences path
3956 for ( a=0; a<S->nseq; a++)
3958 if (outmode && strm (outmode, "aln"))fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[a], CODE),S->name[a], S->seq[a]);
3959 else fprintf ( fp, ">%s %s\n%s\n", S->name[a],S->name[a], S->seq[a]);
// list path: buf receives a private copy of list
3971 buf=vcalloc ( strlen (list)+1, sizeof (char));
3972 sprintf ( buf, "%s", list);
// NOTE(review): strtok is applied to list itself, not to the buf copy made above - confirm which is intended
// first token is the number of entries
3975 n=atoi(strtok (list,SEPARATORS));
// list2[a][0] = sequence index, list2[a][1] = random key
3977 list2=declare_int (n, 2);
3979 for ( a=0; a<n; a++)
3981 list2[a][0]=atoi(strtok (NULL, SEPARATORS));
3982 list2[a][1]=rand()%max;
// sort_int on the random column is only triggered when HoT_4_TCOFFEE is set in the environment
3984 if ( atoigetenv ("HoT_4_TCOFFEE"))sort_int ( list2,2, 1, 0, n-1);
3985 for ( a=0; a< n; a++)
// NOTE(review): headers use S->name[a] while the residues come from S->seq[i] - confirm a versus i
3988 if (outmode && strm (outmode, "aln"))fprintf ( fp, ">%s %s\n%s\n", decode_name (S->name[i], CODE), S->name[a],S->seq[i]);
3989 else fprintf ( fp, ">%s %s\n%s\n", S->name[a], S->name[a],S->seq[i]);
// seq2struc: copies every residue of every sequence of S into the last field
// (n_fields-1) of ST->struc. Residue b of sequence a goes to position b+1.
3996 Structure * seq2struc ( Sequence *S, Structure *ST)
4000 for ( a=0; a< S->nseq; a++)
4001 for ( b=0; b< S->len[a]; b++)
4002 ST->struc[a][b+1][ST->n_fields-1]=S->seq[a][b];
// aln2struc: copies the non-gap residues of each aligned sequence of A into the last
// field (n_fields-1) of ST->struc, at position c (c restarts at 0 for each sequence;
// its increment is presumably tied to the non-gap test - TODO confirm).
4006 void aln2struc (Alignment *A, Structure *ST)
4010 for ( a=0; a< A->nseq; a++)
4011 for (c=0, b=0; b< A->len_aln; b++)
4013 if ( !is_gap (A->seq_al[a][b]))
4015 ST->struc[a][c][ST->n_fields-1]=A->seq_al[a][b];
// stack_aln: appends the rows of B after the rows of A. When one of the two is NULL
// the other one is returned untouched. A is reallocated to hold A->nseq+B->nseq rows
// of the longest sequence length found in either alignment.
4020 Alignment *stack_aln (Alignment *A, Alignment *B)
4023 int max_len=0, max_nseq=0;
4024 if ( B==NULL)return A;
4025 if ( A==NULL)return B;
4027 max_nseq=A->nseq+B->nseq;
// longest row over both alignments
4028 for (a=0; a< A->nseq; a++)max_len=MAX(strlen(A->seq_al[a]),max_len);
4029 for (a=0; a< B->nseq; a++)max_len=MAX(strlen(B->seq_al[a]),max_len);
4031 A=realloc_aln2 ( A,max_nseq,max_len+1);
// a walks the new rows of A, b the rows of B
4033 for (a=A->nseq,b=0; b< B->nseq; b++, a++)
4035 sprintf ( A->seq_comment[a] , "%s", B->seq_comment[b]);
4036 sprintf ( A->aln_comment[a] , "%s", B->aln_comment[b]);
4038 sprintf ( A->seq_al [a] , "%s", B->seq_al [b]);
4039 sprintf ( A->name [a] , "%s", B->name[b]);
4040 sprintf ( A->file [a], "%s" , B->file[b]);
4041 A->order[a][0]=B->order[b][0];
4042 A->order[a][1]=B->order[b][1];
4043 A->score_seq[a]=B->score_seq[b];
4044 A->len[a]=B->len[b];
// global fields: longest length, summed sequence count, summed score and finished counters
4047 A->len_aln=MAX(A->len_aln, B->len_aln);
4048 A->nseq=A->nseq+B->nseq;
4049 A->score_aln=A->score_aln+B->score_aln;
4051 A->finished=A->finished+B->finished;
// chseqIaln: stores a fragment of sequence seq_n of S (extract_char with start and len)
// as row seqIaln of alignment A, under the given name. A may be NULL on entry.
4055 Alignment *chseqIaln(char *name, int seq_n, int start,int len,Sequence *S, int seqIaln, Alignment *A)
4059 seq=extract_char ( S->seq[seq_n], start, len);
// size A so that row seqIaln exists and can hold the fragment plus terminator
4060 A=realloc_aln2 (A, (A==NULL)?(seqIaln+1):MAX(A->nseq,seqIaln+1), ((A==NULL)?(strlen (seq)):MAX(strlen (seq),A->len_aln))+1);
4063 sprintf ( A->seq_al[seqIaln], "%s",seq);
// order records the source sequence index and the start offset
4066 A->order[seqIaln][0]=seq_n;
4067 A->order[seqIaln][1]=start;
4068 sprintf ( A->name[seqIaln], "%s", name);
4069 A->nseq=MAX(A->nseq, seqIaln+1);
4070 A->len_aln=return_maxlen(A->seq_al, A->nseq);
// aln_gap2random_aa: replaces every gap of A with a character drawn with rand() from
// an alphabet chosen after the sequence type stored in A->S.
4076 Alignment * aln_gap2random_aa(Alignment *A)
// 20-letter alphabet for PROTEIN, 4-letter alphabet for DNA or RNA
4081 if (strm ( (A->S)->type, "PROTEIN"))
4082 sprintf ( alp, "acefghiklmnpqrstuvwy");
4083 else if ( strm ( (A->S)->type, "DNA") ||strm ( (A->S)->type, "RNA") )
4084 sprintf ( alp, "agct");
4088 for (a=0; a<A->nseq; a++)
4089 for ( b=0; b<A->len_aln; b++)
// l is presumably the alphabet length - TODO confirm
4090 if ( is_gap (A->seq_al[a][b]))A->seq_al[a][b]=alp[(int)rand()%(l)];
// make_random_aln: sizes A for nseq rows of len columns, then calls
// add_random_sequence2aln nseq times with the given alphabet.
4094 Alignment * make_random_aln(Alignment *A,int nseq, int len, char *alphabet)
4099 A=realloc_aln2(A, nseq, len+1);
// every existing row is tagged with the same pseudo file name
4103 for ( a=0; a< A->nseq; a++)sprintf ( A->file[a], "random alignment");
4104 for ( a=0; a< nseq; a++)
4105 A=add_random_sequence2aln(A,alphabet);
// add_random_sequence2aln: fills row A->nseq of A with A->len_aln characters drawn
// with rand() from alphabet, after making room for one more row.
4108 Alignment * add_random_sequence2aln( Alignment *A, char *alphabet)
4115 A=realloc_alignment2 (A, A->nseq+1, A->len_aln+1);
// n is presumably the alphabet length - TODO confirm
4117 for ( a=0; a< A->len_aln; a++)A->seq_al[A->nseq][a]=alphabet[rand()%n];
// a row without a name receives a random 10-character name
4118 if (! A->name[A->nseq][0])
4120 for ( a=0; a<10; a++)A->name[A->nseq][a]=alphabet[rand()%n];
4121 A->name[A->nseq][a]='\0';
// get_defined_residues: returns a copy of A->S where every residue is first set to
// UNDEFINED_RESIDUE, then restored from A->S at the positions referenced by
// A->seq_cache. Returns NULL when A or A->S is missing.
4128 Sequence *get_defined_residues( Alignment *A)
4133 if ( !A || !A->S) return NULL;
4135 S=duplicate_sequence (A->S);
// blank out the whole copy
4136 for ( a=0; a< S->nseq; a++)
4137 for ( b=0; b< S->len[a]; b++)S->seq[a][b]=UNDEFINED_RESIDUE;
4138 buf=vcalloc(A->len_aln+1,sizeof (char));
4139 for ( a=0; a< A->nseq; a++)
4141 sprintf ( buf, "%s",A->seq_al[a]);
// seq_cache is read with 1-based positions; r is a 1-based residue number in S
4146 for ( b=1; b<= l; b++)
4148 r=A->seq_cache[s][b];
// NOTE(review): r==0 passes the r>=0 test and indexes r-1 - confirm 0 cannot occur here
4150 if ( r>=0)S->seq[s][r-1]=(A->S)->seq[s][r-1];
// thread_defined_residues_on_aln: walks every column of every row of A and overwrites
// the aligned character with UNDEFINED_RESIDUE when the residue it maps to in S1
// (through A->seq_cache) is itself UNDEFINED_RESIDUE.
4156 Alignment *thread_defined_residues_on_aln ( Alignment *A, Sequence *S1)
4160 for ( a=0; a< A->nseq; a++)
4164 for (b=0;b< A->len_aln; b++)
4166 gap=is_gap(A->seq_al[a][b]);
// r2 is the 0-based position in S1 of residue r (seq_cache holds 1-based values)
4171 r2=A->seq_cache[s][r]-1;
4173 if (r2>=0 && S1->seq[s][r2]==UNDEFINED_RESIDUE)
4174 A->seq_al[a][b]=UNDEFINED_RESIDUE;
// trim_aln_borders: processes nseq pairs of strings (seq1[a], seq2[a]). Each pair is
// copied into work buffers, passed to str_overlap with the '*' marker, and the
// resulting buffer content is written back over the non-gap positions of the originals.
4181 int ** trim_aln_borders (char **seq1, char **seq2, int nseq)
// work buffers sized after the longest string of either array
4191 max=MAX(get_longest_string (seq1,-1, NULL, NULL),get_longest_string (seq2,-1, NULL, NULL))+1;
4192 buf1=vcalloc ( max, sizeof(char));
4193 buf2=vcalloc ( max, sizeof(char));
4195 for ( a=0; a< nseq; a++)
4197 sprintf ( buf1, "%s", seq1[a]);
4198 sprintf ( buf2, "%s", seq2[a]);
// write back only when str_overlap reports a non-zero result
4205 if (str_overlap ( buf1, buf2,'*')!=0)
4207 l1=strlen ( seq1[a]);
4208 l2=strlen ( seq2[a]);
// gaps keep their place; residues are replaced in order by the buffer content
4209 for ( b=0,c=0; c< l1; c++)
4210 if ( !is_gap(seq1[a][c]))seq1[a][c]=buf1[b++];
4212 for ( b=0,c=0; c< l2; c++)
4213 if ( !is_gap(seq2[a][c]))seq2[a][c]=buf2[b++];
// merge_seq: adds every sequence of IN to OUT through add_sequence. When OUT is NULL a
// duplicate of IN is returned. A duplicated name inside IN is fatal. Returns NULL as
// soon as add_sequence returns NULL.
4222 Sequence * merge_seq ( Sequence *IN, Sequence *OUT)
4226 if ( OUT==NULL)return duplicate_sequence (IN);
4229 if ( IN && check_list_for_dup( IN->name, IN->nseq))
4231 fprintf ( stderr, "\nERROR: %s is duplicated in file %s[FATAL]\n", check_list_for_dup( IN->name, IN->nseq), IN->file[0]);
4232 myexit (EXIT_FAILURE);
// NOTE(review): IN is tested against NULL above but dereferenced here without a test - confirm
4234 for ( a=0; a< IN->nseq; a++)
4235 if ((OUT=add_sequence ( IN, OUT, a))==NULL)return NULL;
// seq_name2removed_seq_name: for every sequence of S that is absent from NA, looks for
// the sequence of NA with the smallest diff value and appends the absent name to the
// seq_comment of that sequence.
4240 Alignment *seq_name2removed_seq_name(Sequence *S, Alignment *NA, float **diff)
4244 for (a=0; a< S->nseq; a++)
// sequences still present in NA are not reported
4246 if (name_is_in_list( S->name[a], NA->name, NA->nseq, 100)!=-1) continue;
4247 for ( min_diff=100, s=0, b=0; b< NA->nseq; b++)
// diff is indexed with positions in S, hence the reverse lookup of NA->name[b]
// NOTE(review): rb is used as an index without a -1 test in the statements below - confirm
4249 rb=name_is_in_list ( NA->name[b], S->name, S->nseq, 100);
4250 if ( diff[a][rb]<min_diff)
4253 min_diff=diff[a][rb];
// NOTE(review): strcat does not check the capacity of seq_comment - confirm it is large enough
4257 strcat ( NA->seq_comment[s], " ");
4258 strcat ( NA->seq_comment[s], S->name[a]);
// seq_name2index: in the fallback branch below, returns the result of name_is_in_list
// for name within S->name (S->nseq entries, comparison length MAXNAMES+1).
4266 int seq_name2index (char *name, Sequence *S)
4269 else return name_is_in_list ( name, S->name, S->nseq, MAXNAMES+1);
// seq_name2coor: splits a string of the form name<sep>start<sep>end in place. The two
// numeric fields, when present and non-empty, are converted with atoi into start[0]
// and end[0]. The string is returned unchanged when it holds no separator.
// NOTE(review): n1 and n2 are fixed 100-byte buffers filled without a length check - confirm inputs are bounded.
4271 char * seq_name2coor ( char *s, int *start, int *end, char sep)
4274 char n1[100], n2[100];
// skip the name part
4280 while ( s[a]!=sep && s[a]!='\0')a++;
4281 if ( s[a]=='\0')return s;
// first numeric field
4287 while ( s[a]!=sep && s[a]!='\0')n1[b++]=s[a++];
// only a start coordinate was given
4289 if ( s[a]=='\0'){n1[b]='\0';if ( n1[0])start[0]=atoi(n1);return s;}
// terminate both the input (at the separator) and n1
4290 else s[a++]=n1[b]='\0';
// second numeric field
4293 while ( s[a]!=sep && s[a]!='\0')n2[c++]=s[a++];
4297 if ( n1[0])start[0]=atoi(n1);
4298 if ( n2[0])end[0]=atoi(n2);
// extract_one_seq: writes one row of alignment S (optionally restricted to the 1-based
// range start..end) as a FASTA record into a temporary file and reads it back with
// get_fasta_sequence_num. n designates the row by name, by number, or with a leading '#'.
4304 Sequence *extract_one_seq(char *n,int start, int end, Alignment *S, int keep_name)
// '#' selects S->nseq; a known name yields its index+1; otherwise a non-zero number is taken as is
4313 if ( n[0]=='#')seq=S->nseq;
// empty statement on purpose: the assignment in the condition is the whole effect
4314 else if ( (seq=name_is_in_list (n, S->name, S->nseq, 100)+1)!=0);
4315 else if (is_number (n) && (seq=atoi(n))!=0) seq=atoi(n);
4318 fprintf ( stderr, "\nCould not find Sequence %s [FATAL]", n);
4319 myexit (EXIT_FAILURE);
4323 name=vtmpnam ( NULL);
4324 fp=vfopen ( name, "w");
// header: the coordinates are appended to the name unless keep_name is set
4325 if ( start && end &&!keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start, end);
4326 else if ( start && end==0 && !keep_name)fprintf (fp, ">%s_%d_%d\n",S->name[seq],start,(int)strlen ( S->seq_al[seq]));
4327 else fprintf (fp, ">%s\n", S->name[seq]);
// body: whole row, suffix from start, or the start..end slice
4329 if ( start==0 && end==0){fprintf (fp, "%s\n", S->seq_al[seq]);}
4330 else if (end==0){fprintf (fp, "%s\n", S->seq_al[seq]+start-1);}
4333 for ( a=start-1; a<end; a++){fprintf ( fp, "%c", S->seq_al[seq][a]);}
4334 fprintf ( fp, "\n");
4339 OUT_S=get_fasta_sequence_num (name, NULL);
// extract_sub_seq: for each sequence of S also named in COOR, reads "start end" from
// the matching COOR->seq_comment and shifts that 1-based slice to the front of the
// sequence, in place. S is finally reordered after COOR->name.
4346 Sequence * extract_sub_seq( Sequence *COOR, Sequence *S)
4351 for ( a=0; a< S->nseq; a++)
4353 if ( (s=name_is_in_list ( S->name[a], COOR->name, COOR->nseq, 100))!=-1)
// NOTE(review): the sscanf result is not checked - confirm the comment always holds two integers
4356 sscanf ( COOR->seq_comment[s], "%d %d", &start, &end);
4357 for (c=0,b=start-1; b< end; b++, c++)S->seq[a][c]=S->seq[a][b];
4359 sprintf ( S->seq_comment[a], "%s",COOR->seq_comment[s]);
4363 S=reorder_seq ( S, COOR->name, COOR->nseq);
// aln_column2string: returns a newly allocated string of A->nseq characters holding
// column p of the alignment, one character per sequence. An out-of-range p is fatal.
4369 char * aln_column2string (Alignment *A, int p)
4375 HERE ("ERROR: index (p=%d) loger than aln (l=%d) [FATAL]", p, A->len_aln);
4376 myexit (EXIT_FAILURE);
// one extra zeroed slot terminates the string
4380 s=vcalloc (A->nseq+1, sizeof (char));
4381 for (a=0; a< A->nseq; a++)s[a]=A->seq_al[a][p];
// fix_seq_aln: for each row of alignment A that is also named in S, walks the columns
// of the row and considers its non-gap positions. cache is allocated here (one slot
// per sequence of S) when the caller passes NULL.
4387 int **fix_seq_aln (Sequence *S, Alignment*A, int **cache)
4391 if (!cache)cache=vcalloc (S->nseq, sizeof (int*));
4393 for (s=0; s<A->nseq; s++)
// i must receive the index returned by name_is_in_list; the comparison with -1 is
// applied to the assigned value, so rows unknown to S are skipped
4395 if ((i=name_is_in_list (A->name[s], S->name, S->nseq, 100))==-1)continue;
// nr restarts at 0 for every row
4396 for (nr=0,b=0; b<A->len_aln; b++)
4398 if (!is_gap(A->seq_al[s][b]))
// fix_seq_seq: builds, for each sequence of S0, an index array mapping its residues
// onto the sequence of the same name in Sx. Slot 0 of each array holds the index of
// the matching sequence in Sx (result of name_is_in_list).
4405 int **fix_seq_seq (Sequence *S0, Sequence *Sx)
4407 //Expresses seq1 in terms of s2
4413 index=vcalloc ( S0->nseq, sizeof (int*));
4414 for (s0=0; s0<S0->nseq; s0++)
4417 index[s0]=vcalloc (l+1, sizeof (int));
4418 i=index[s0][0]=name_is_in_list (S0->name[s0], Sx->name, Sx->nseq, 100);
// identical sequences: handled by a direct loop over positions 1..l
4420 else if (strm (S0->seq[s0], Sx->seq[i]))
4422 for (r0=1; r0<=l; r0++)
// different sequences: pairwise alignment (blosum62mt for PROTEIN, idmat otherwise)
4433 Alignment *B=align_two_sequences (S0->seq[s0],Sx->seq[i],(strm(S0->type, "PROTEIN"))?"blosum62mt":"idmat",-4,-1, "myers_miller_pair_wise");
4434 for (c=0; c<B->len_aln; c++)
4437 int g0=is_gap(B->seq_al[0][c]);
4438 int g1=is_gap(B->seq_al[1][c]);
// only columns where both rows hold a residue produce a mapping
4441 if (!g0 && !g1)index[s0][nr0]=nr1;
// below 20 (aln2sim with idmat) the reconciliation is reported as unreliable
4443 if (aln2sim(B, "idmat")<20) add_warning (stderr,"Unreliable reconciliation for sequence %s. If it a PDB, check source file", S0->name[s0]);
4444 free_aln (B);B=NULL;
// fix_aln_seq_new: runs fix_seq_seq between a Sequence S (presumably derived from A -
// TODO confirm) and Sx, then releases S.
4449 int **fix_aln_seq_new (Alignment *A, Sequence *Sx)
4455 f=fix_seq_seq(S, Sx);
4456 free_sequence (S, S->nseq);
// fix_aln_seq: rebuilds A->seq_cache so that it maps the residues of each aligned
// sequence onto the sequence of the same name in S. A is reordered after S->name
// first. Returns A unchanged when S is NULL.
4459 Alignment * fix_aln_seq ( Alignment *A, Sequence *S)
4463 int g0, g1, nr0, nr1;
4468 /*This function establishes the correspondance between every (1..N+1) residue of each aligned sequence
4469 and its correspondance in S:
4470 A->seq_cache[a][b]=x means that residue b of aligned sequence a corresponds to residue x of the sequence with tye same index in S
4471 A->seq_cache[a][b]=0 means there is no correspondance.
4472 a is the index of the sequence
4473 Applying this function is needed for turning an alignment into a constraint list
4477 if ( S==NULL)return A;
4478 reorder_aln (A, S->name,S->nseq);
// any previous cache is discarded and a fresh one is declared, wide enough for both
// the alignment length and the longest sequence of S
4479 if (A->seq_cache)free_int (A->seq_cache, -1);
4480 A->seq_cache=declare_int ( S->nseq, MAX((A->len_aln+1), S->max_len+1));
// every slot up to len_aln starts at -1
4482 for (a=0; a< S->nseq; a++)
4483 for ( b=0; b< A->len_aln; b++)A->seq_cache[a][b]=-1;
4486 for ( a=0; a< S->nseq; a++)
4488 for (b=0; b< A->nseq; b++)
4490 if (strm ( S->name[a], A->name[b]))
// buf1 = aligned row, buf2 = sequence from S, both upper-cased before comparison
4495 buf1=vcalloc ( A->len_aln+1, sizeof (char));
4496 sprintf (buf1, "%s", A->seq_al[b]);
4498 upper_string (buf1);
4501 buf2=vcalloc (strlen(S->seq[a])+1, sizeof (char));
4502 sprintf (buf2, "%s",S->seq[a]);
4504 upper_string (buf2);
// identical strings: identity mapping, 1-based
4508 if ( strm (buf1,buf2))
4511 for ( c=0; c<S->len[a]; c++)A->seq_cache[a][c+1]=c+1;
// otherwise the two versions are aligned pairwise (row 0 = buf2, row 1 = buf1)
4516 B=align_two_sequences (buf2,buf1,"blosum62mt",-4,-1, "myers_miller_pair_wise");
4517 if ( getenv ("DEBUG_RECONCILIATION"))
4519 fprintf (stderr, "\n[DEBUG_RECONCILIATION:fix_aln_seq]\nReconciliation of %s\nA=Ref_sequence\nB=New_seq", S->name[a]);
4523 for (id=0, tot=0,nr0=0,nr1=0,c=0; c<B->len_aln; c++)
4525 g0=is_gap(B->seq_al[0][c]);
4526 g1=is_gap(B->seq_al[1][c]);
// id counts identical columns; the cache entry of residue nr1 receives residue nr0
4532 id+=(B->seq_al[0][c]==B->seq_al[1][c])?1:0;
4533 A->seq_cache[a][nr1]=nr0;
// 0 marks a residue without counterpart
4537 A->seq_cache[a][nr1]=0;
// less than 20 percent identity is treated as a name clash between different sequences
// NOTE(review): tot is a divisor here - confirm it cannot be 0
4540 if ( ((id*100)/tot)<20)
4543 fprintf ( stderr, "\nTwo different sequences have the same name: %s", S->name[a]);
4544 fprintf ( stderr, "\nIf %s is a PDBID, Make sure it identifies the right chain (A, B, 1, 2...)", S->name[a]);
4545 fprintf ( stderr, "\nChain number or index must be added to the PDB id (i.e. 1gowA)");
4546 fprintf ( stderr, "\nIf You want to use %s anyway, rename it with a non-PDB identifier such as seq_%s\n",S->name[a],S->name[a]);
4547 myexit (EXIT_FAILURE);
4550 free_sequence ( B->S, -1);
4557 vfree(buf1);vfree(buf2);
// add_prf2seq: adds a profile stored in file to S as one extra sequence. The profile is
// represented by its blosum62mt consensus, and the R template built from the file is
// attached to the last sequence of S. Files that are neither alignments nor sequences
// leave S unchanged.
4561 Sequence * add_prf2seq ( char *file, Sequence *S)
4566 if ( !is_aln (file)&& !is_seq (file))return S;
4573 R=fill_R_template(file,file, S);
4576 ((R->VR)->A)->expand=1;
// one-row array holding the consensus of the profile
4577 new_seq=declare_char (1,A->len_aln+1);
4578 sprintf ( new_seq[0], "%s",aln2cons_seq_mat(A, "blosum62mt"));
4580 NS=fill_sequence_struc(1, new_seq,A->file);
4581 S=add_sequence (NS, S, 0);
4582 (S->T[S->nseq-1])->R=R;
4584 free_sequence (NS, NS->nseq);
4585 free_char( new_seq, -1);
// prf_in_seq: returns 1 as soon as one sequence of S has an R template profile
// (seq2R_template_profile returns non-NULL).
4590 int prf_in_seq ( Sequence *S)
4597 for ( a=0; a< S->nseq; a++)
4598 if (seq2R_template_profile(S, a)) return 1;
// add_sequence: adds sequence i of IN to OUT. A name unknown to OUT is appended at the
// end; a known name with a different sequence (case-insensitive) is reconciled through
// build_consensus, and the program stops when no consensus can be built.
4602 Sequence * add_sequence ( Sequence *IN, Sequence *OUT, int i)
4610 OUT=duplicate_sequence (IN);
// a sequence already contained in one of the profiles of OUT is not added again
4613 for (a=0; a<OUT->nseq; a++)
4616 P=seq2R_template_profile (OUT, a);
4618 else if (name_is_in_list (IN->name[i], P->name, P->nseq, 100)!=-1) return OUT;
4621 /*Adds sequence i of IN at the end of OUT*/
4623 if ((s=name_is_in_list ( IN->name[i], OUT->name, OUT->nseq,STRING))==-1 )
4625 OUT=realloc_sequence (OUT, OUT->nseq+1, IN->len[i]);
4626 sprintf ( OUT->name[OUT->nseq],"%s",IN->name[i]);
4627 sprintf ( OUT->file[OUT->nseq],"%s",IN->file[i]);
4628 sprintf ( OUT->seq_comment[OUT->nseq],"%s",IN->seq_comment[i]);
4629 sprintf ( OUT->aln_comment[OUT->nseq],"%s",IN->aln_comment[i]);
4631 sprintf ( OUT->seq[OUT->nseq],"%s",IN->seq[i]);
4632 OUT->len[OUT->nseq]=IN->len[i];
// the template record is copied by value
4633 OUT->T[OUT->nseq][0]=IN->T[i][0];
// same name, different residues: report the discrepancy and try to reconcile
4637 else if ( s!=-1 && !case_insensitive_strcmp ( IN->seq[i], OUT->seq[s]))
4640 if ( getenv4debug("DEBUG_RECONCILIATION"))fprintf ( stderr,"[DEBUG_RECONCILIATION:add_sequence]\n%s\n%s\n", IN->seq[i], OUT->seq[s]);
4642 add_warning (stderr, "DISCREPANCY:%s in [%s] and [%s]\n", IN->name[i], IN->file[i], OUT->file[s]);
// cfasta_pair_wise is tried first, myers_miller_pair_wise is the fallback
4645 if (((buf=build_consensus(IN->seq[i], OUT->seq[s],"cfasta_pair_wise" ))!=NULL)||((buf=build_consensus(IN->seq[i], OUT->seq[s],"myers_miller_pair_wise" ))!=NULL))
4648 OUT->max_len=MAX(OUT->max_len, strlen(buf));
4649 OUT->min_len=MIN(OUT->min_len, strlen(buf));
4650 OUT->seq =realloc_char ( OUT->seq, -1, -1,OUT->nseq,OUT->max_len+1);
4652 sprintf ( OUT->seq[s],"%s",buf);
4653 OUT->len[s]=strlen (buf);
// no consensus: print the pairwise alignment of the two versions and stop
4659 fprintf ( stderr, "IMPOSSIBLE TO RECONCILIATE SOME SEQUENCES[FATAL:%s]\n", PROGRAM);
4660 print_aln ( align_two_sequences (IN->seq[i], OUT->seq[s], "idmat", 0, 0, "fasta_pair_wise"));
4661 myexit (EXIT_FAILURE);
// trim_seq: builds a new Sequence R holding the sequences of A whose name also appears
// in B, copied in the order of A.
4673 Sequence * trim_seq ( Sequence *A, Sequence *B)
4678 if (A->nseq>B->nseq)
// R is declared for at most MIN(A->nseq, B->nseq) sequences
4684 R=declare_sequence (MIN(A->min_len,B->min_len), MAX(A->max_len, B->max_len), MIN(A->nseq, B->nseq));
4687 for (a=0; a< A->nseq; a++)
4689 if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING+1)!=-1)
4691 sprintf ( R->name[R->nseq], "%s", A->name[a]);
4692 sprintf ( R->seq[R->nseq], "%s", A->seq[a]);
4693 sprintf ( R->file[R->nseq], "%s", A->file[a]);
4694 sprintf ( R->aln_comment[R->nseq], "%s", A->aln_comment[a]);
4695 sprintf ( R->seq_comment[R->nseq], "%s", A->seq_comment[a]);
4697 R->len[R->nseq]=A->len[a];
// trim_aln_seq: restricts alignments A and B to the sequences they share, in the order
// of A, carries their seq_cache rows along, and sets A->S and B->S to the merge of the
// two sequence sets.
4704 Sequence * trim_aln_seq ( Alignment *A, Alignment *B)
// name_list is static: the list of the previous call is released on entry
4707 static char **name_list;
4714 /*This function inputs two alignments A and B
4715 It removes sequences that are not common to both of them
4716 It rearange the sequences so that they are in the same order
4717 A decides on the order
4718 The Sequences (A->S) and (B->S) are treated the same way
4719 Sequences are also merged in order to detects discrepencies.
4720 A pointer to S is returned
4722 if (name_list)free_char (name_list, -1);
4723 name_list=declare_char (MAX(A->nseq, B->nseq), STRING+1);
// collect the names of A that are also in B
4725 for ( a=0; a< A->nseq; a++)
4727 if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1)
4729 sprintf ( name_list[n++], "%s", A->name[a]);
4735 reorder_aln ( A, name_list, n);
// work on copies of the caches; they replace the originals at the end
4736 if (A->seq_cache)cache_A=duplicate_int (A->seq_cache, -1, -1);
4737 if (B->seq_cache)cache_B=duplicate_int (B->seq_cache, -1, -1);
4738 reorder_aln ( B, name_list, n);
4739 for ( a=0; a< n; a++)
// exchange of cache rows between the original and the copy, indexed through order[a][0]
4743 p=A->seq_cache[A->order[a][0]];
4744 A->seq_cache[A->order[a][0]]=cache_A[a];
4749 p=B->seq_cache[B->order[a][0]];
4750 B->seq_cache[B->order[a][0]]=cache_B[a];
// after the exchange both alignments are numbered 0..n-1
4753 A->order[a][0]=B->order[a][0]=a;
4755 free_int(A->seq_cache, -1);
4756 free_int(B->seq_cache, -1);
4758 A->seq_cache=cache_A;
4759 B->seq_cache=cache_B;
// both alignments end up sharing the same merged Sequence
4766 A->S=B->S=merge_seq (SA, SB);
// trim_aln_seq_name: builds a Sequence S whose names are the names of A that also
// appear in B, in the order of A. Only the names are filled here.
4769 Sequence * trim_aln_seq_name ( Alignment *A, Alignment *B)
4774 /*This function inputs two alignments A and B
4775 It removes sequences that are not common to both of them
4776 It rearange the sequences so that they are in the same order
4777 A decides on the order
// S is declared with room for A->nseq+B->nseq entries of length 1
4779 S=declare_sequence ( 1, 1, A->nseq+B->nseq);
4781 for ( a=0; a< A->nseq; a++)
4783 if ( name_is_in_list ( A->name[a], B->name, B->nseq,STRING)!=-1)
4785 sprintf ( S->name[S->nseq++], "%s", A->name[a]);
// rm_name_tag: scans nseq names for template tags and returns a newly declared array
// (nseq rows of 100 chars) holding one template line per name in which a tag was found.
// Returns NULL when name is NULL.
4793 char ** rm_name_tag (char **name, int nseq, char *tag)
4798 char **template_list;
4799 if ( !name )return NULL;
// up to 10 tags of at most 3 characters
4801 tag_list=declare_char (10, 4);
// a caller-supplied tag becomes the only entry
4805 ntag=1; sprintf ( tag_list[0], "%s", tag);
// default tags
4810 sprintf ( tag_list[ntag++], "_S_");
4811 sprintf ( tag_list[ntag++], "_G_");
4813 template_list=declare_char (nseq, 100);
4814 for ( a=0; a<nseq ; a++)
4816 for ( b=0; b<ntag; b++)
4818 s=strstr(name[a], tag_list[b]);
// s+1 starts at the tag letter, s+3 right after the tag
// NOTE(review): the output row is 100 chars and sprintf is unbounded - confirm names are short enough
4823 sprintf ( template_list[a], ">%s _%s_ %s", name[a], s+1, s+3);
4829 free_char (tag_list, -1);
4830 return template_list;
// swap_header: for each sequence of S also named in H, rebuilds the name and the
// seq_comment of the sequence from the tokens of H->seq_comment (string2list).
4832 Sequence * swap_header ( Sequence *S, Sequence *H)
4836 for ( a=0; a< S->nseq; a++)
4838 if ( (n=name_is_in_list (S->name[a],H->name, H->nseq, 1000))!=-1)
// list[0] holds the token count as text; a NULL list or a count of 1 is skipped
4843 list=string2list (H->seq_comment[n]);
4844 if ( list==NULL || atoi(list[0])==1)continue;
// NOTE(review): this assigns '\0' to the pointer itself, so the vfree below receives
// a null pointer and the previous buffer is not released - confirm the intent
4845 S->seq_comment[a]='\0';
// new name = name in H followed by the first two tokens
4846 sprintf (S->name[a], "%s%s%s",H->name[n], list[1], list[2]);
4847 vfree ( S->seq_comment[a]);S->seq_comment[a]=vcalloc ( strlen (H->seq_comment[n])+1, sizeof (char));
// remaining tokens are concatenated, without separator, into the new comment
4848 for (b=3; b< atoi(list[0]); b++)S->seq_comment[a]=strcat (S->seq_comment[a], list[b]);
4849 free_char (list, -1);
// profile_seq2template_seq: for every sequence of S that carries an R template profile,
// runs seq2template_seq on the sequences of that profile. Returns NULL when one of
// these calls returns NULL.
4856 Sequence * profile_seq2template_seq ( Sequence *S, char *template_file, Fname *F)
4858 /*This function fetches potential templates associated with sequences within a profile*/
4862 for ( i=0; i< S->nseq; i++)
4865 if ( (A=seq2R_template_profile (S, i)))
4869 A->S=seq2template_seq (A->S, template_file, F);
4870 if (!A->S)return NULL;
// seq2template_type: for every sequence of Seq that has a template record, probes the
// eight template tags and writes a seq_type string made of one letter per template
// found. Sequences with an R profile are processed recursively on the profile content.
4877 Sequence * seq2template_type(Sequence *Seq)
4882 struct X_template *S=NULL;
4883 struct X_template *P=NULL;
4884 struct X_template *R=NULL;
4885 struct X_template *G=NULL;
4886 struct X_template *F=NULL;
4887 struct X_template *T=NULL;
4888 struct X_template *E=NULL;
4889 struct X_template *U=NULL;
4894 for (a=0; a< Seq->nseq; a++)
4896 if (!Seq->T[a])continue;
4897 //HERE ADD a Template
4898 P=seq_has_template (Seq, a, "_P_");
4899 S=seq_has_template (Seq, a, "_S_");
4900 R=seq_has_template (Seq, a, "_R_");
4901 G=seq_has_template (Seq, a, "_G_");
4902 F=seq_has_template (Seq, a, "_F_");
4903 T=seq_has_template (Seq, a, "_T_");
4904 E=seq_has_template (Seq, a, "_E_");
4905 U=seq_has_template (Seq, a, "_U_");
// one character per slot: the template letter when present, e otherwise;
// the third slot is 's' when an S template exists without a P template.
// NOTE(review): F is probed above but has no slot in this format - confirm
4908 sprintf ( (Seq->T[a])->seq_type, "%c%c%c%c%c%c%c%c", (P)?'P':e, (S)?'S':e, (S &&!P)?'s':e,(R)?'R':e, (G)?'G':e,(T)?'T':e,(E)?'E':e,(U)?'U':e);
// recurse into the sequences of the profile, when there is one
4910 if (R && (A=seq2R_template_profile (Seq,a)) && A->S)
4913 A->S=seq2template_type ( A->S);
// string_contains_template_tag: returns the first template tag found in the string,
// testing in the fixed order P, S, R, G, F, T, E, U. The returned pointer is a string
// literal, not a position inside the input.
4919 char * string_contains_template_tag (char *string_in)
4923 if ( strstr (string, "_P_"))return "_P_";
4924 if ( strstr (string, "_S_"))return "_S_";
4925 if ( strstr (string, "_R_"))return "_R_";
4926 if ( strstr (string, "_G_"))return "_G_";
4927 if ( strstr (string, "_F_"))return "_F_";
4928 if ( strstr (string, "_T_"))return "_T_";
4929 if ( strstr (string, "_E_"))return "_E_";
4930 if ( strstr (string, "_U_"))return "_U_";
4934 static int check_blast_is_installed (char *server);
// check_blast_is_installed: checks the BLAST client matching the server keyword.
// NCBI checks the web client, LOCAL checks the local program, both with INSTALL_OR_DIE.
4938 static int check_blast_is_installed (char *server)
// EBI: nothing to check here, the empty statement is deliberate
4940 if (strm (server, "EBI"));
4941 else if ( strm (server, "NCBI"))
4942 return check_program_is_installed (NCBIWEBBLAST_4_TCOFFEE,NULL, NULL,NCBIWEBBLAST_ADDRESS, INSTALL_OR_DIE);
4943 else if ( strm (server, "LOCAL"))
4944 return check_program_is_installed (NCBIBLAST_4_TCOFFEE,NULL, NULL,NCBIBLAST_ADDRESS, INSTALL_OR_DIE);
// vremove_seq_template_files: runs handle_seq_template_file on S in "remove" mode.
4949 Sequence * vremove_seq_template_files(Sequence *S)
4951 return handle_seq_template_file (S, "remove");
// display_seq_template_files: runs handle_seq_template_file on S in "display" mode.
4953 Sequence * display_seq_template_files(Sequence *S)
4955 return handle_seq_template_file (S, "display");
// handle_seq_template_file: applies handle_X_template_files with the given mode to the
// P, F, R, T and E templates of every sequence of S.
// NOTE(review): the S, G and U templates are not handled in the statements below - confirm.
4957 Sequence * handle_seq_template_file (Sequence *S, char *mode)
4962 for (a=0; a< S->nseq; a++)
4967 handle_X_template_files (T->P, mode);
4968 handle_X_template_files (T->F, mode);
4969 handle_X_template_files (T->R, mode);
4970 handle_X_template_files (T->T, mode);
4971 handle_X_template_files (T->E, mode);
// handle_X_template_files: acts on the files of one template T according to mode.
// "remove" deletes template_file and template_name with vremove; "display" reports
// template_name through display_output_filename when that file exists; any other
// mode is fatal.
4977 int handle_X_template_files ( X_template *T, char *mode)
4981 if ( strm (mode, "remove"))
4983 vremove (T->template_file);
4984 vremove (T->template_name);
4986 else if (strm (mode, "display"))
// label shown next to the file name
4989 sprintf ( buf, "Template %s", template_type2type_name (T->template_type));
4990 if (check_file_exists (T->template_name))display_output_filename ( stdout,buf,T->template_format,T->template_name, STORE);
4994 printf_exit (EXIT_FAILURE, stderr, "\nUnkonwn mode %s for template handling [FATAL:%s]", mode, PROGRAM);
4998 Sequence * seq2template_seq ( Sequence *S, char *template_list, Fname *F)
5000 /*Expected format for the template file:
5001 >seq_name _X_ Target_template
5003 G for genomes (Exoset)
5004 When alternative templates are given for a sequence, the first one superseeds all the others
5007 /*Fill the sequences*/
5014 char *pdb_db,*prot_db;
5017 int remove_template_file=0;
5020 remove_template_file=get_int_variable ("remove_template_file");
5021 server=get_string_variable ("blast_server");
5022 pdb_db=get_string_variable ("pdb_db");
5023 prot_db=get_string_variable ("prot_db");
5025 PmI=get_int_variable ("pdb_min_sim");
5026 PMI=get_int_variable ("pdb_max_sim");
5027 PmC=get_int_variable ("pdb_min_cov");
5029 BmI=get_int_variable ("prot_min_sim");
5030 BMI=get_int_variable ("prot_max_sim");
5031 BmC=get_int_variable ("prot_min_cov");
5033 //Set the type of the PDB structure
5034 if ((p=get_string_variable ("pdb_type")))
5036 sprintf ( pdb_type, "%s",p);
5040 sprintf (pdb_type, "dmn");
5043 if ( (template_list && template_list[0]=='\0') || strm ( template_list, "no_template"))
5047 else if ( strstr (template_list, "MODE_"))//pre_set mode
5049 return seq2template_seq ( S,template_list+strlen ("MODE_"),F);
5051 else if ( strm ( template_list, "SSP")|| strm ( template_list, "GOR"))
5054 /*use GOR to Predict the secondary structure*/
5055 check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE);
5056 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#ssp_template@seq#%s/%s@obs#%s/%s@cache#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir());
5057 S=seq2template_seq (S,buf, F);
5060 else if ( strm ( template_list, "PSISSP") || strm (template_list, "PSIGOR"))
5063 /*Computes a GOR consensus on a psi-blast output*/
5064 check_program_is_installed (GOR4_4_TCOFFEE,NULL, NULL,GOR4_ADDRESS, INSTALL_OR_DIE);
5065 check_blast_is_installed(server);
5067 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psissp_template@seq#%s/%s@obs#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_E_",get_mcoffee_4_tcoffee(), "New_KS.267.seq", get_mcoffee_4_tcoffee(), "New_KS.267.obs", get_cache_dir(), BmI,BMI,BmC,server);
5068 S=seq2template_seq (S,buf, F);
5071 else if ( strm ( template_list, "TM"))
5074 /*predict transmembrane structure*/
5075 check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE);
5076 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#tm_template@arch#%s/%s@psv#%s/%s@type#_T_",get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv");
5077 S=seq2template_seq (S,buf, F);
5080 else if ( strm ( template_list, "PSITM"))
5083 /*predict transmembrane structure*/
5084 check_program_is_installed (HMMTOP_4_TCOFFEE,NULL, NULL,HMMTOP_ADDRESS, INSTALL_OR_DIE);
5085 check_blast_is_installed(server);
5087 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psitm_template@database#%s@arch#%s/%s@psv#%s/%s@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_T_", prot_db, get_mcoffee_4_tcoffee(), "hmmtop.arch", get_mcoffee_4_tcoffee(), "hmmtop.psv",get_cache_dir(), BmI,BMI,BmC,server);
5088 S=seq2template_seq (S,buf, F);
5092 else if (strm ( template_list, "PSIBLAST"))
5095 check_blast_is_installed(server);
5096 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#psiprofile_template@database#%s@method#psiblast@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server);
5097 S=seq2template_seq (S,buf, F);
5101 else if (strm ( template_list, "BLAST") )
5103 check_blast_is_installed(server);
5104 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#profile_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_R_", prot_db,get_cache_dir(),BmI,BMI,BmC,server);
5105 S=seq2template_seq (S,buf, F);
5109 else if ( strm ( template_list, "EXPRESSO") || strm (template_list, "PDB"))
5111 check_blast_is_installed(server);
5115 for (i= 0; i < S->len[0]; ++i)
5117 isRNA = (isRNA || is_rna(S->seq[0][i]));
5122 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastn@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_@pdb_type#%s",pdb_db, get_cache_dir(),PmI,PMI,PmC, server,pdb_type);
5126 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#pdb_template@database#%s@method#blastp@cache#%s@minid#%d@maxid#%d@mincov#%d@server#%s@type#_P_@pdb_type#%s",pdb_db, get_cache_dir(),PmI,PMI,PmC, server,pdb_type);
5128 return seq2template_seq (S,buf, F);
5131 else if ( strm (template_list, "RCOFFEE") || strm (template_list, "RNA"))
5133 char *file_struc_clac = vtmpnam (NULL);
5134 FILE* struc_calc_f =vfopen(file_struc_clac,"w");
5137 for (i = 0; i< S->nseq; ++i)
5142 fprintf(struc_calc_f,"%s %s\n",S->name[i],S->T[i]->P->template_file);
5145 vfclose(struc_calc_f);
5148 // S = seq2template_seq (S,buf,F);
5149 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#calc_rna_template@pdbfile#%s@cache#%s@type#_F_", file_struc_clac,get_cache_dir());
5153 check_program_is_installed (RNAPLFOLD_4_TCOFFEE,NULL, NULL,RNAPLFOLD_ADDRESS, IS_FATAL);
5154 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#RNA_template@type#_F_");
5157 S = seq2template_seq (S,buf,F);
5158 sprintf ( buf, "SCRIPT_tc_generic_method.pl@mode#calc_rna_template@pdbfile#%s@cache#%s@type#_F_", file_struc_clac,get_cache_dir());
5161 // printf("IN T_\n");
5162 return seq2template_seq (S,buf,F);
5166 /*2: Templates from seqnames (SELF) or named like the sequences (SEQFILE)*/
5167 else if ( strstr (template_list, "SELF_") ||strstr (template_list, "SEQFILE_") )
5173 for (a=0; a< S->nseq; a++)
5176 if ( (p=strstr (template_list,"SELF_")))p=S->name[a];
5177 else if ( strstr (template_list, "SEQFILE_"))p=template_list;
5180 fprintf ( stderr, "\nUnkown mode for Template [FATAL:%s]\n", PROGRAM);
5181 myexit (EXIT_FAILURE);
5184 if ( strstr (template_list, "_P_") && !(S->T[a])->P)(S->T[a])->P =fill_P_template ( S->name[a], p,S);//PDB
5185 else if ( strstr (template_list, "_S_") && !(S->T[a])->S)(S->T[a])->S =fill_S_template ( S->name[a], p,S);//Sequence
5186 else if ( strstr (template_list, "_R_" )&& !(S->T[a])->R)(S->T[a])->R =fill_R_template ( S->name[a], p,S);//pRofile
5187 else if ( strstr (template_list, "_G_" )&& !(S->T[a])->G)(S->T[a])->G =fill_G_template ( S->name[a], p,S);//Genomic
5188 else if ( strstr (template_list, "_F_" )&& !(S->T[a])->F)(S->T[a])->F =fill_F_template ( S->name[a], p,S);//Fold
5189 else if ( strstr (template_list, "_T_" )&& !(S->T[a])->T)(S->T[a])->T =fill_T_template ( S->name[a], p,S);//Trans Membrane
5190 else if ( strstr (template_list, "_E_" )&& !(S->T[a])->E)(S->T[a])->E =fill_E_template ( S->name[a], p,S);//Secondary Structure
5191 else if ( strstr (template_list, "_U_" )&& !(S->T[a])->U)(S->T[a])->U =fill_U_template ( S->name[a], p,S);//unicode, list template
5197 /*2: Templates comes in a template_file*/
5198 else if ( template_list==NULL || format_is_fasta (template_list))
5203 T=(template_list!=NULL)?get_fasta_sequence (template_list, NULL):S;
5204 for (a=0; a< T->nseq; a++)
5208 if ((i=name_is_in_list(T->name[a], S->name, S->nseq, MAXNAMES))!=-1)
5211 if ( (p=strstr (T->seq_comment[a], " _P_ ")) && !(S->T[i])->P &&( (S->T[i])->P=fill_P_template (S->name[i],p,S)))ntemp++;
5212 else if ( (p=strstr (T->seq_comment[a], " _F_ ")) && !(S->T[i])->F &&( (S->T[i])->F=fill_F_template (S->name[i],p,S)))ntemp++;
5213 else if ( (p=strstr (T->seq_comment[a], " _S_ ")) && !(S->T[i])->S &&( (S->T[i])->S=fill_S_template (S->name[i],p,S)))ntemp++;
5215 else if ( (p=strstr (T->seq_comment[a], " _R_ ")) && !(S->T[i])->R &&( (S->T[i])->R=fill_R_template (S->name[i],p,S)))ntemp++;
5216 else if ( (p=strstr (T->seq_comment[a], " _G_ ")) && !(S->T[i])->G &&( (S->T[i])->G=fill_G_template (S->name[i],p,S)))ntemp++;
5217 else if ( (p=strstr (T->seq_comment[a], " _T_ ")) && !(S->T[i])->T &&( (S->T[i])->T=fill_T_template (S->name[i],p,S)))ntemp++;
5218 else if ( (p=strstr (T->seq_comment[a], " _E_ ")) && !(S->T[i])->E &&( (S->T[i])->E=fill_E_template (S->name[i],p,S)))ntemp++;
5219 else if ( (p=strstr (T->seq_comment[a], " _U_ ")) && !(S->T[i])->U &&( (S->T[i])->E=fill_U_template (S->name[i],p,S)))ntemp++;
5221 if (T!=S)strcat (S->seq_comment[i], T->seq_comment[a]);
5226 if (T!=S)free_sequence (T, -1);
5228 if ( remove_template_file==2)
5230 vremove (template_list);
5233 if (template_list)display_output_filename ( stdout, "Template_List","fasta_seq", template_list, STORE);
5237 /*3 Templates are generated with a script*/
5238 else if (strstr (template_list, "SCRIPT_") && get_string_variable ("multi_core") && strstr (get_string_variable ("multi_core"), "templates") && get_nproc()>1)
5240 char *tmp1,*command;
5242 char **temp_file,**seq_file;
5243 int * pid_list, pid, npid, submited;
5244 int nproc, max_nproc;
5248 static char *script;
5254 if (!script)script=vcalloc ( 1000, sizeof(char));
5258 command=vcalloc ( 1000, sizeof (char));
5259 tmp1=vtmpnam (NULL);
5261 A=seq2aln (S,NULL, 0);
5262 string_array_upper(A->seq_al, A->nseq);
5263 output_fasta_seq (tmp1, A);
5264 sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_"));
5266 if ((p=strstr (template_list, "@type#")))
5267 p+=strlen ("@type#");
5269 if (!F){F=parse_fname (S->file[0]);freeF=1;}
5270 sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp);
5271 while ( check_file_exists (outfile))
5273 sprintf (outfile, "%s%s_%s%d.%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp, ++num);
5275 if (freeF)free_fname(F);
5278 //max_nproc=2*nproc;
5279 max_nproc=20; //EBI recommended maximum
5280 script=substitute(script, "@", " -");
5281 script=substitute(script, "#", "=");
5283 temp_file=vcalloc ( A->nseq, sizeof (char*));
5284 seq_file =vcalloc (A->nseq, sizeof (char*));
5285 pid_list =vcalloc (MAX_N_PID, sizeof (int *));
5287 fprintf ( stderr, "\n\t------ Fetch Templates [Multi Core Mode %d CPUs]\n",get_nproc());
5288 for (npid=0, submited=0,i=0; i<S->nseq; i++)
5291 seq_file[i]=vtmpnam (NULL);
5292 temp_file[i]=vtmpnam (NULL);
5293 fp2=vfopen (seq_file[i], "w");
5294 fprintf ( fp2, ">%s\n%s\n", S->name[i], S->seq[i]);
5300 initiate_vtmpnam (NULL);
5301 if ( strstr (script, "tc_generic_method"))
5303 //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script,seq_file[i],temp_file[i],get_tmp_4_tcoffee());
5304 sprintf ( command, "%s -infile=%s -outfile=%s -tmpdir=%s",script,seq_file[i],temp_file[i],get_tmp_4_tcoffee());
5307 //sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script,seq_file[i],temp_file[i]);
5308 sprintf ( command, "%s -infile=%s -outfile=%s",script,seq_file[i],temp_file[i]);
5309 command=substitute(command, "@", " ");
5310 //my_system ( command);
5311 myexit (my_system(command));
5319 submited=vwait_npid(submited,max_nproc,nproc);
5323 submited=vwait_npid(submited,0,0);
5324 //Concatenate all the files
5326 for (i=0; i<npid; i++) file_cat (temp_file[i],outfile);
5328 //Free the process table
5334 if ( check_file_exists (outfile) && format_is_fasta(outfile))
5336 S=seq2template_seq (S, outfile, F);
5338 else if (strstr (command, "webblast.pl"))return S;
5342 add_warning (stderr, "Could not Run %s to find templates[%s](Forked mode)\n",command, PROGRAM);
5350 else if (strstr (template_list, "SCRIPT_"))
5353 char *tmp1,*command;
5356 static char *script;
5360 if (!script)script=vcalloc ( 1000, sizeof(char));
5364 command=vcalloc ( 1000, sizeof (char));
5365 tmp1=vtmpnam (NULL);
5367 A=seq2aln (S,NULL, 0);
5368 string_array_upper(A->seq_al, A->nseq);
5369 output_fasta_seq (tmp1, A);
5370 sprintf ( script, "%s", after_strstr (template_list, "SCRIPT_"));
5371 fprintf ( stderr, "\n");
5372 if ((p=strstr (template_list, "@type#")))
5373 p+=strlen ("@type#");
5376 sprintf (outfile, "%s%s_%s%d.template_list", F->path,F->name,template_type2short_type_name(p),ntemp);
5380 F=parse_fname (S->file[0]);
5381 sprintf (outfile, "%s%s_%s%d.template_list",F->path, F->name,template_type2short_type_name(p),ntemp);
5385 script=substitute(script, "@", " -");
5386 script=substitute(script, "#", "=");
5388 if ( strstr (script, "tc_generic_method"))
5390 sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s -tmpdir=%s",get_string_variable ("t_coffee"),script, tmp1,outfile,get_tmp_4_tcoffee());
5392 else sprintf ( command, "%s -other_pg %s -infile=%s -outfile=%s",get_string_variable("t_coffee"),script, tmp1, outfile);
5395 command=substitute(command, "@", " ");
5397 my_system ( command);
5401 if ( check_file_exists (outfile) && format_is_fasta(outfile))
5403 S=seq2template_seq (S, outfile, F);
5405 else if (strstr (command, "webblast.pl"))return S;
5409 add_warning (stderr, "Could not Run %s to find templates[%s](unforked mode)\n",command, PROGRAM);
// Dumps the templates attached to S into a template file.
// When file is NULL a temporary name is obtained from vtmpnam.
// NOTE(review): the return statement is not visible in this listing; presumably the file name is returned - confirm.
5420 char* seq2template_file (Sequence *S, char *file)
5425 if (file==NULL)file=vtmpnam (NULL);
// The templates of S itself are written with mode "w".
5427 seq2template_file2 (S, file, "w");
5429 for (i=0; i<S->nseq; i++)
// For every sequence that carries an _R_ profile, the templates of the profile's own sequence set are appended.
5431 if ( (A=seq2R_template_profile (S, i)))
// NOTE(review): the guard tests S (already dereferenced above) although A->S is what gets passed on - confirm the intent.
5435 if (S)seq2template_file2 (A->S, file, "a");
// Writes one ">name  type file ..." header line per sequence of S that owns at least one P, R or G template.
// file is opened through vfopen with the caller supplied mode; the function returns EXIT_SUCCESS.
5441 int seq2template_file2 (Sequence *S, char *file, char *mode)
5447 struct X_template *X;
5449 fp=vfopen ( file, mode);
5450 for ( i=0; i< S-> nseq; i++)
// Each present template contributes " <template_type> <template_file> " to buf1 (the _S_ case is disabled).
5457 if ( (X=(S->T[i])->P)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
5458 /*if ( (X=(S->T[i])->S)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}*/
5459 if ( (X=(S->T[i])->R)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
5460 if ( (X=(S->T[i])->G)){sprintf (buf2, " %s %s ", X->template_type, X->template_file);strcat (buf1, buf2);}
// Sequences without any of these templates produce no output line.
5461 if (buf1[0])fprintf ( fp, ">%s %s\n", S->name[i], buf1);
5466 return EXIT_SUCCESS;
// Counts, over all sequences of S, the templates whose kind is selected by type.
// Each test passes type together with one specific tag and the "_*_" wildcard to strm2
// (presumably true when type equals either string - confirm against strm2).
// Only the P, F, S, R and G slots are inspected in the visible lines.
5472 int seq2n_X_template ( Sequence *S, char *type)
5476 for (n=0,a=0; a< S->nseq; a++)
5478 if ( strm2 (type, "_P_","_*_") && (S->T[a])->P)n++;
5479 if ( strm2 (type, "_F_","_*_") && (S->T[a])->F)n++;
5480 if ( strm2 (type, "_S_","_*_") && (S->T[a])->S)n++;
5481 if ( strm2 (type, "_R_","_*_") && (S->T[a])->R)n++;
5482 if ( strm2 (type, "_G_","_*_") && (S->T[a])->G)n++;
// Generic constructor shared by the fill_?_template functions.
//   name  : sequence name, copied into X->seq_name
//   p     : template description string
//   token : template tag such as "_P_", copied into X->template_type
// NOTE(review): the copies below are unbounded sprintf/sscanf calls; the field sizes are not visible here.
5486 struct X_template *fill_X_template ( char *name, char *p, char *token)
5488 struct X_template *X;
5495 X=vcalloc (1, sizeof (X_template));
5496 sprintf ( X->seq_name, "%s", name);
// When p contains the tag, the template name is the first whitespace delimited word after it; otherwise p as a whole.
5497 if ( (k=strstr (p, token)))sscanf (k+strlen(token), "%s",X->template_name);
5498 else sprintf (X->template_name, "%s", p);
5501 /*Add a Structure HERE*/
5502 sprintf ( X->template_type, "%s", token);
// Allocate the zero-initialised payload that corresponds to the tag.
5503 if ( strm (token, "_P_"))X->VP=vcalloc (1, sizeof (P_template));
5504 if ( strm (token, "_F_"))X->VF=vcalloc (1, sizeof (F_template));
5506 if ( strm (token, "_S_"))X->VS=vcalloc (1, sizeof (S_template));
5507 if ( strm (token, "_R_"))X->VR=vcalloc (1, sizeof (R_template));
5508 if ( strm (token, "_G_"))X->VG=vcalloc (1, sizeof (G_template));
5509 if ( strm (token, "_T_"))X->VT=vcalloc (1, sizeof (T_template));
5510 if ( strm (token, "_E_"))X->VE=vcalloc (1, sizeof (E_template));
5511 if ( strm (token, "_U_"))X->VU=vcalloc (1, sizeof (U_template));
// Releases a template. The visible lines free the payload data of the S, R and G kinds.
// NOTE(review): the conditions guarding each call and the return value are not visible in this listing.
5516 struct X_template* free_X_template ( struct X_template *X)
5528 free_sequence ((X->VS)->S, -1);
5533 free_aln ((X->VR)->A);
5538 free_sequence ((X->VG)->S, -1);
// Prints the templates of sequence i of S on io by delegating to display_X_template,
// once per template slot (P, F, S, R, G, T, E in the visible lines; U is not shown).
5546 FILE * display_sequence_templates (Sequence *S,int i, FILE *io)
5550 io=display_X_template ( (S->T[i])->P, io);
5552 io=display_X_template ( (S->T[i])->F, io);
5554 io=display_X_template ( (S->T[i])->S, io);
5556 io=display_X_template ( (S->T[i])->R, io);
5557 io=display_X_template ( (S->T[i])->G, io);
5558 io=display_X_template ( (S->T[i])->T, io);
5559 io=display_X_template ( (S->T[i])->E, io);
// Prints a one line description (type label, template name, template file) of X on io.
// Templates of type "_S_" are not printed.
// NOTE(review): callers pass slots that may be NULL; the NULL guard is presumably on a line missing from this listing.
5564 FILE * display_X_template (struct X_template *X, FILE *io)
5568 if ( !strm (X->template_type, "_S_"))fprintf (io, "\n\t%s: Template=%s, File=%s",template_type2type_name (X->template_type), X->template_name,X->template_file);
// Maps a template tag to a short label (used elsewhere in this file to build template_list file names).
// A NULL type yields the empty string. The tag is searched with strstr, so it may be embedded in a longer string.
// NOTE(review): the fallback for an unrecognised tag is not visible in this listing.
5571 char *template_type2short_type_name (char *type)
5574 if (!type)return "";
5575 else if ( strstr (type, "_P_")) return "pdb";
5576 else if ( strstr (type, "_F_")) return "rfold";
5577 else if ( strstr (type, "_S_")) return "seq";
5578 else if ( strstr (type, "_R_")) return "prf";
5579 else if ( strstr (type, "_G_")) return "genome";
5580 else if ( strstr (type, "_E_")) return "ssp";
5581 else if ( strstr (type, "_T_")) return "tmp";
5582 else if ( strstr (type, "_U_")) return "unicode";
// Maps a template tag to the human readable label printed by display_X_template.
// Unlike template_type2short_type_name, no NULL test on type is visible here.
// NOTE(review): the "_S_" label is spelled "Sequeence" (runtime string, left untouched).
5585 char *template_type2type_name (char *type)
5588 if ( strstr (type, "_P_")) return "PDB struc";
5589 else if ( strstr (type, "_F_")) return "RNA Fold";
5590 else if ( strstr (type, "_S_")) return "Sequeence";
5591 else if ( strstr (type, "_R_")) return "Profile";
5592 else if ( strstr (type, "_G_")) return "Genomic";
5593 else if ( strstr (type, "_E_")) return "Protein Secondary Structure";
5594 else if ( strstr (type, "_T_")) return "Protein Trans Membrane Structure ";
5595 else if ( strstr (type, "_U_")) return "Unicode and strings";
// Builds the _F_ (fold) template of sequence name from the description p.
// The template is kept only when template_name designates an existing file, which then becomes template_file.
5599 struct X_template *fill_F_template ( char *name,char *p, Sequence *S)
5601 /*Fold template*/
5602 struct X_template *F;
5604 F=fill_X_template ( name, p, "_F_");
// NOTE(review): F is dereferenced here, before the !F test on the next line.
5605 sprintf (F->template_format , "TCOFFEE_LIBRARY");
5606 if (!F || !check_file_exists (F->template_name))
5608 fprintf ( stderr, "Could Not Fill _F_ (Fold) template for sequence |%s|", name);
5609 free_X_template (F);
5612 else if ( check_file_exists (F->template_name))
5614 sprintf ( F->template_file, "%s", F->template_name);
// Builds the _P_ (PDB structure) template of sequence name from the description p.
// Visible steps: resolve the structure file, repair it with fix_pdb_file, record the PDB id, then align the
// sequence with the structure sequence and reject the template when similarity or coverage is too low.
// NOTE(review): many branch headers are missing from this listing; the comments below only describe visible calls.
5622 struct X_template *fill_P_template ( char *name,char *p, Sequence *S)
5624 struct X_template *P;
5631 P=fill_X_template ( name, p, "_P_");
// NOTE(review): P is dereferenced here, before the !P test below.
5632 sprintf (P->template_format , "pdb");
// An existing file that is not a PDB file is rejected.
5634 if (!P ||(check_file_exists (P->template_name) && !is_pdb_file (P->template_name) ))
5636 //fprintf ( stderr, "Could Not Fill _P_ template for sequence |%s|", name);
5637 free_X_template (P);
// template_name is an existing file: keep the path as template_file and reduce template_name via path2filename.
5640 else if ( check_file_exists (P->template_name))
5642 sprintf ( P->template_file, "%s", P->template_name);
5643 buf=path2filename (P->template_name);
5644 if (P->template_name!=buf)
5646 sprintf ( P->template_name, "%s",buf );
// Otherwise the name is handed to is_pdb_struc and its result is stored as template_file.
5654 st=is_pdb_struc (P->template_name);
5657 if (st!=P->template_file)sprintf ( P->template_file, "%s", st);
5661 /*Make a first run to fix relaxed PDB files*/
5662 buf=fix_pdb_file (P->template_file);
5664 if ( buf!=P->template_file)
5667 sprintf ( P->template_file, "%s",buf);
5671 /*Check the PDB FILE EXISTS*/
5673 if (!is_pdb_file (P->template_file))
5676 if (p)add_warning(stderr, "_P_ Template | %s | Could Not Be Found\n",p);
5677 else if (name)add_warning(stderr, "_P_ Template | %s | Could Not Be Found\n",name);
5678 free_X_template (P);
5683 buf= get_pdb_id (P->template_file);
5684 if (buf!=(P->VP)->pdb_id)
5686 sprintf ((P->VP)->pdb_id, "%s", buf);
5691 /*Check the target sequence is similar enough*/
5693 PS=get_pdb_sequence (P->template_file);
5699 add_warning( stderr, "_P_ Template |%s| Could Not be Used for Sequence |%s|: Structure Not Found", P->template_name, name);
5700 free_X_template (P);P=NULL;
// Thresholds come from the "pdb_min_sim" and "pdb_min_cov" integer variables.
5704 int minsim=get_int_variable ("pdb_min_sim");
5705 int mincov=get_int_variable ("pdb_min_cov");
5708 i=name_is_in_list (name, S->name, S->nseq, 100);
5710 A=align_two_sequences (S->seq[i], PS->seq[0],"idmat",-3,0, "fasta_pair_wise");
5712 sprintf ( A->name[0], "seq");
5713 sprintf ( A->name[1], "pdb");
5714 cov=aln2coverage (A, 0);
5715 sim=aln2sim (A, "idmat");
5719 add_information( stderr, "_P_ Template %s Could Not be Used for Sequence %s: Similarity too low [%d, Min=%d]",P->template_name,name,sim,minsim);
5720 add_information( stderr, "If you want to include %s in anycase,add -pdb_min_sim=%d to the command line",name,sim);
// NOTE(review): no P=NULL is visible after this free, unlike the two other rejection paths - confirm it is on a missing line.
5722 free_X_template (P);
5725 else if ( cov<=mincov)
5727 add_information(stderr, "_P_ Template |%s| Could Not be Used for Sequence |%s|: Coverage too low [%d, Min=%d]",P->template_name,name, cov, mincov);
5728 add_information( stderr, "If you want to include this sequence in anycase add -pdb_min_cov=%d to the command line", cov);
5730 free_X_template (P);P=NULL;
5733 free_sequence (PS, -1);
// Builds the _S_ (sequence) template of sequence name.
// When strm (name, p) holds, the sequence itself is dumped with output_fasta_seqX and that file becomes template_file.
// The payload sequence is then read back from template_file with get_fasta_sequence.
// NOTE(review): no visible line sets template_file when strm (name, p) does not hold - confirm.
5739 struct X_template *fill_S_template ( char *name,char *p, Sequence *Seq)
5741 struct X_template *S;
5742 S=fill_X_template ( name, p, "_S_");
5743 if ( strm (name, p))sprintf ( S->template_file, "%s",output_fasta_seqX (NULL,"w",Seq,NULL, seq_name2index (name, Seq)));
5744 (S->VS)->S=get_fasta_sequence (S->template_file, NULL);
// Builds the _R_ (profile) template of sequence name from the description p.
// The alignment named by template_name is read, optionally trimmed against the target sequence,
// and finally converted with aln2profile.
// NOTE(review): the conditions selecting between the branches below are not visible in this listing.
5747 struct X_template *fill_R_template ( char *name,char *p, Sequence *S)
5749 /*Profile template*/
5750 struct X_template *R;
5753 R=fill_X_template ( name, p, "_R_");
5754 sprintf (R->template_format , "fasta_aln");
// The template is rejected when template_name is neither an alignment nor a sequence file.
5757 if (!is_aln(R->template_name) && !is_seq (R->template_name))
5760 add_information ( stderr, "_R_ Template %s Could Not Be Found\n",R->template_name);
5761 free_X_template (R);
5770 (R->VR)->A=main_read_aln (R->template_name, NULL);
5773 sprintf ( R->template_file, "%s", R->template_name);
// Build a one sequence alignment from the target and trim the profile with it.
5776 s=name_is_in_list(name, S->name, S->nseq, 100);
5779 S1=fill_sequence_struc (1, &S->seq[s], &S->name[s]);
5780 A1=seq2aln (S1,NULL, RM_GAP);
5782 (R->VR)->A=trim_aln_with_seq (A1, (R->VR)->A);
// The trimmed alignment is written to a temporary file that becomes template_file.
5784 sprintf ( R->template_file, "%s", vtmpnam (NULL));
5785 output_clustal_aln (R->template_file, (R->VR)->A);
5788 sprintf ( R->template_file, "%s", R->template_name);
5790 (R->VR)->A=aln2profile ((R->VR)->A);
5792 //free_data_in_aln ((R->VR)->A);
// Builds the _T_ (trans-membrane) template of sequence name from the description p.
// The template is rejected when template_name is neither an alignment nor a sequence file;
// otherwise its content is loaded with main_read_seq and template_name is copied to template_file.
5798 struct X_template *fill_T_template ( char *name,char *p, Sequence *S)
5800 /*Trans-membrane template*/
5801 struct X_template *T;
5803 T=fill_X_template ( name, p, "_T_");
5804 sprintf (T->template_format , "fasta_seq");
5806 if (!is_aln(T->template_name) && !is_seq (T->template_name))
5809 add_information ( stderr, "_T_ Template %s Could Not Be Found\n",T->template_name);
5810 free_X_template (T);
5816 (T->VT)->S=main_read_seq(T->template_name);
5817 sprintf ( T->template_file, "%s", T->template_name);
// Builds the _U_ (string list) template of sequence name from the description p.
// The template is rejected when the file template_name does not exist; otherwise template_name is copied to
// template_file. Loading the list into the payload is disabled (commented out) in the visible lines.
5822 struct X_template *fill_U_template ( char *name,char *p, Sequence *S)
5824 /*String list template*/
5825 struct X_template *U;
5827 U=fill_X_template ( name, p, "_U_");
5828 sprintf (U->template_format , "string list");
5830 if (!check_file_exists(U->template_name))
5832 add_information ( stderr, "_U_ Template %s Could Not Be Found\n",U->template_name);
5833 free_X_template (U);
5838 //(U->VU)->list=file2string(U->template_name);
5839 sprintf ( U->template_file, "%s", U->template_name);
// Builds the _E_ (secondary structure) template of sequence name from the description p.
// The template is rejected when template_name is neither an alignment nor a sequence file;
// otherwise its content is loaded with main_read_seq and template_name is copied to template_file.
5843 struct X_template *fill_E_template ( char *name,char *p, Sequence *S)
5845 /*Secondary structure template*/
5846 struct X_template *E;
5849 E=fill_X_template ( name, p, "_E_");
5850 sprintf (E->template_format , "fasta_seq");
5852 if (!is_aln(E->template_name) && !is_seq (E->template_name))
5855 add_information ( stderr, "_E_ Template %s Could Not Be Found\n",E->template_name);
5856 free_X_template (E);
5861 (E->VE)->S=main_read_seq (E->template_name);
5862 sprintf ( E->template_file, "%s", E->template_name);
// Builds the _G_ (genomic) template of sequence name from the description p.
// template_file is chosen from three sources: the sequence itself when strm (name, p) holds,
// the entry called name inside a SEQFILE_G_ file, or template_name as is.
5866 struct X_template *fill_G_template ( char *name,char *p, Sequence *S)
5868 struct X_template *G;
5869 G=fill_X_template ( name, p, "_G_");
5870 sprintf (G->template_format , "fasta_seq");
5872 /*1: Get the sequence from another file if needed*/
5873 if ( strm (name, p))sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",S,NULL, seq_name2index (name, S)));
// NOTE(review): the test looks for "SEQFILE_" while the file name is taken after "SEQFILE_G_".
5874 else if ( strstr (p, "SEQFILE_"))
5880 ST=main_read_seq (after_strstr ( p,"SEQFILE_G_"));
5882 i2=seq_name2index (name, ST);
5885 sprintf ( G->template_file, "%s",output_fasta_seqX (NULL,"w",ST,NULL, i2));
5886 sprintf ( G->template_name, "%s", name);
5888 free_sequence (ST, -1);
5890 else sprintf (G->template_file, "%s", G->template_name);
5893 /*2: Put the template in VG->S*/
5894 if (!is_seq (G->template_file))
5896 add_information ( stderr, "_G_ Template %s Could Not Be Found \n",p);
5898 free_X_template (G);
5903 (G->VG)->S=get_fasta_sequence (G->template_file, NULL);
// Returns, as a string, the field called value of the template of kind type attached to sequence n of S.
// Returns NULL when the sequence has no such template.
// "template_file", "template_name" and "seq_name" are answered for every kind; "pdb_id" only for _P_;
// "A" only for _R_, in which case the address of the profile is printed into a static 100 byte buffer.
5909 char *seq2T_value ( Sequence *S, int n, char *value, char *type)
5911 static char *rv_buf;
5914 if ( !rv_buf)rv_buf=vcalloc (100, sizeof(char));
5915 if (!(X=seq_has_template (S, n, type)))return NULL;
5918 if (strm (value, "template_file"))return X->template_file;
5919 else if ( strm (value, "template_name"))return X->template_name;
5920 else if ( strm (value, "seq_name"))return X->seq_name;
5921 else if (strm (type, "_P_"))
5923 if ( strm (value, "pdb_id"))return (X->VP)->pdb_id;
5925 else if ( strm (type, "_R_"))
5927 if ( strm (value, "A"))
// NOTE(review): the pointer is cast to long; this loses bits on platforms where long is narrower than a pointer.
5930 {sprintf ( rv_buf, "%ld", (long)(X->VR)->A);return rv_buf;}
// Returns the template_name of the _P_ template of sequence n, or NULL when S->T, S->T[n] or the P slot is missing.
// NOTE(review): despite the function name, template_name is returned rather than the pdb_id field - confirm the intent.
5938 char *seq2P_pdb_id (Sequence *S, int n)
5940 if (!S->T || !S->T[n] || !(S->T[n])->P ) return NULL;
5941 else return ((S->T[n])->P)->template_name;
// Returns the template_file of the _P_ template of sequence n (NULL when there is none), via seq2T_value.
5945 char *seq2P_template_file(Sequence *S, int n)
5948 return seq2T_value (S, n, "template_file", "_P_");
// Looks inside the _R_ profile of sequence n and returns the first _P_ template file owned by one of the
// profile's sequences. Returns NULL when sequence n has no profile.
// NOTE(review): the value returned when no profile sequence has a _P_ template is not visible in this listing.
5951 char *profile2P_template_file (Sequence *S, int n)
5957 if ( !(A=seq2R_template_profile (S, n)))return NULL;
5958 for (a=0; a<A->nseq; a++)
5960 if ((p=seq2P_template_file (A->S, a))!=NULL)return p;
// Returns the profile alignment stored in the _R_ template of sequence n.
// The first statement converts back, with atop, the address string produced by seq2T_value (..., "A", "_R_").
// NOTE(review): if that return is unconditional, the statements after it are unreachable - confirm against the full source.
5964 Alignment * seq2R_template_profile (Sequence *S, int n)
5968 return (Alignment *)atop(seq2T_value (S, n, "A", "_R_"));
5970 if (!(X=seq_has_template (S, n, "_R_")))return NULL;
5973 if (!(X->VR))return NULL;
5974 else return (X->VR)->A;
// Returns the first sequence string stored in the _E_ template of sequence n.
// NOTE(review): the value returned when there is no _E_ template is not visible in this listing.
5981 char * seq2E_template_string (Sequence *S, int n)
5983 struct X_template *T;
5985 if ( (T=seq_has_template (S, n, "_E_"))!=NULL)
5986 return ((T->VE)->S)->seq[0];
// Returns the list stored in the _U_ template of sequence n.
// NOTE(review): the value returned when there is no _U_ template is not visible in this listing.
5991 int* seq2U_template (Sequence *S, int n)
5993 struct X_template *T;
5995 if ( (T=seq_has_template (S, n, "_U_"))!=NULL)
5996 return (T->VU)->list;
// Returns the first sequence string stored in the _T_ template of sequence n.
// NOTE(review): the value returned when there is no _T_ template is not visible in this listing.
6000 char * seq2T_template_string (Sequence *S, int n)
6002 struct X_template *T;
6004 if ( (T=seq_has_template (S, n, "_T_"))!=NULL)
6005 return ((T->VT)->S)->seq[0];
// Returns the template slot of kind mode ("_P_", "_F_", ...) attached to sequence n of S.
// NULL is returned when S or mode is NULL, when n is outside [0, S->nseq[, or when S->T or S->T[n] is missing.
// NOTE(review): T is assigned on a line missing from this listing (presumably T=S->T[n]).
6010 struct X_template* seq_has_template ( Sequence *S, int n, char *mode)
6014 if ( !S || !mode) return NULL;
6015 else if ( n<0 || n>=S->nseq)return NULL;
6016 else if ( !(S->T)) return NULL;
6017 else if ( !(S->T[n]))return NULL;
6022 if ( strm (mode, "_P_"))return T->P;
6023 else if ( strm (mode, "_F_"))return T->F;
6024 else if ( strm (mode, "_S_"))return T->S;
6025 else if ( strm (mode, "_R_"))return T->R;
6026 else if ( strm (mode, "_T_"))return T->T;
6027 else if ( strm (mode, "_E_"))return T->E;
6028 else if ( strm (mode, "_U_"))return T->U;
6029 else if ( strm (mode, "_G_"))return T->G;
// Returns a newly declared array of n_out names picked in random order from the n_in names of in_name.
// Every input name receives a random key; the names are then taken in the order given by sort_int on that key.
// The caller owns the returned array (the callers in this file release it with free_char).
6033 char ** name2random_subset (char **in_name, int n_in, int n_out)
6043 out_name=declare_char (n_out,MAXNAMES+1 );
6044 list=declare_int (n_in, 2);
6046 for (a=0; a<n_in; a++)
6049 list[a][1]=rand ()%max;
6051 sort_int ( list,2, 1, 0, n_in-1);
// out_name only has n_out rows and list only n_in entries: copy no more than the smaller of the two.
// (The former bound was n_in, which wrote past out_name whenever n_out<n_in, as in aln2jacknife.)
6053 for ( a=0; a<MIN(n_in,n_out); a++)
6055 sprintf ( out_name[a], "%s", in_name[list[a][0]]);
6057 free_int (list, -1);
// Reorders the sequences of A in a random order: a random permutation of all the names is drawn with
// name2random_subset and applied with reorder_aln. The temporary name list is released afterwards.
6061 Alignment * aln2random_order (Alignment *A)
6066 name_list=name2random_subset (A->name, A->nseq, A->nseq);
6067 A=reorder_aln (A, name_list, A->nseq);
6068 free_char (name_list, -1);
// Random sub-sampling of A: keeps nseq randomly chosen sequences and len randomly chosen columns.
// A value of 0, or a value not smaller than the current size, leaves the corresponding dimension untouched.
6071 Alignment *aln2jacknife (Alignment *A, int nseq, int len)
6075 if (nseq!=0 && nseq<A->nseq)
6079 name=name2random_subset (A->name, A->nseq, nseq);
6080 A=reorder_aln (A, name, nseq);
6081 free_char (name, -1);
6084 if (len!=0 && len<A->len_aln)
// Each column gets a random key; after sorting, the first len entries designate the columns that are kept.
6089 l=declare_int (A->len_aln, 2);
6090 for (a=0; a< A->len_aln; a++)
6093 l[a][1]=rand()%(A->len_aln*1000);
6095 sort_int ( l,2, 1, 0, A->len_aln-1);
// Columns are read from a copy so that overwriting A does not disturb the source.
6096 B=copy_aln (A, NULL);
6097 for ( a=0; a< len; a++)
6099 for ( b=0; b<A->nseq; b++)
6101 A->seq_al[b][a]=B->seq_al[b][l[a][0]];
6104 for (b=0; b<A->nseq; b++)A->seq_al[b][len]='\0';
// Shuffles which aligned string belongs to which row of A, then shuffles the row order with aln2random_order.
// name_list, despite its name, temporarily holds the seq_al row pointers, not names.
6110 Alignment * aln2scramble_seq (Alignment *A)
6119 list=declare_int (A->nseq, 2);
6120 name_list=vcalloc (A->nseq, sizeof (char*));
6123 for (a=0; a<A->nseq; a++)
6126 list[a][1]=rand ()%max;
6128 sort_int ( list,2, 1, 0, A->nseq-1);
// Save the current row pointers, then hand them back in the shuffled order.
6130 for ( a=0; a< A->nseq; a++)
6131 name_list[a]=A->seq_al[a];
6132 for (a=0; a<A->nseq; a++)
6134 A->seq_al[a]=name_list[list[a][0]];
6137 free_int (list, -1);
6138 return aln2random_order (A);
// Reorders (and possibly subsets) the rows of A so that they follow the nseq names given in name.
// A NULL name list requests a random order. Rows are copied from a backup of A (BUF); rows past the last
// copied one get an empty name and an empty string. The nested alignment A->A, when present, is reordered too.
// NOTE(review): the handling of names not found in A (sn==-1) and the update of n are not visible in this listing.
6143 Alignment * reorder_aln ( Alignment *A, char **name, int nseq)
6150 if ( name==NULL)return aln2random_order(A);
6153 BUF=copy_aln ( A,NULL);
6154 for ( a=0; a<nseq; a++)
6156 sn =name_is_in_list ( name[a],BUF->name, A->nseq,STRING);
6165 SWAPP(A->order[n], BUF->order[sn], tpp_int);
6166 sprintf ( A->name[n], "%s", BUF->name[sn]);
6167 sprintf ( A->seq_al[n], "%s",BUF->seq_al[sn]);
6168 sprintf ( A->seq_comment[n], "%s", BUF->seq_comment[sn]);
6175 for ( a=n; a< A->nseq; a++)A->name[a][0]=A->seq_al[a][0]='\0';
6178 if ( A->A)A->A=reorder_aln(A->A, name, nseq);
// Reorders the sequences of A following column field of the index table order: row a of the result is the
// sequence whose index is order[a][field]. Works by building a name list and delegating to reorder_seq.
// A is returned untouched when A or order is NULL.
// NOTE(review): the name list has A->nseq rows of 100 chars while the loop runs to nseq - assumes nseq<=A->nseq.
6182 Sequence * reorder_seq_2 ( Sequence *A, int **order,int field, int nseq)
6187 if (!A || !order) return A;
6188 name=declare_char (A->nseq, 100);
6189 for (a=0; a<nseq; a++)
6190 sprintf ( name[a], "%s", A->name[order[a][field]]);
6191 A=reorder_seq (A, name,nseq);
6192 free_char (name, -1);
// Reorders the sequences of A so that row a holds the sequence called name[a].
// All fields are copied from a duplicate of A (nA), which is released at the end.
// Names that are not found in A are skipped, leaving row a as it was.
6195 Sequence * reorder_seq ( Sequence *A, char **name, int nseq)
6201 nA=duplicate_sequence (A);
6204 for ( a=0; a< nseq; a++)
6206 sn=name_is_in_list (name[a] ,nA->name, nA->nseq, 100);
6207 if (sn==-1)continue;
6209 if ( nA->file) sprintf ( A->file[a], "%s", nA->file[sn]);
6211 if ( nA->seq_comment)sprintf ( A->seq_comment[a], "%s", nA->seq_comment[sn]);
6212 if ( nA->aln_comment)sprintf ( A->aln_comment[a], "%s", nA->aln_comment[sn]);
6213 sprintf ( A->seq[a], "%s", nA->seq[sn]);
6214 A->len[a]=nA->len[sn];
6215 sprintf ( A->name[a], "%s", nA->name[sn]);
// The template record is copied by value.
6216 A->T[a][0]=nA->T[sn][0];
6219 free_sequence (nA, nA->nseq);
// Concatenates the sequences of S, taken in the order given by order, into a freshly allocated string.
// NOTE(review): what happens to the incoming conc and the return statement are not visible in this listing.
6224 char * concatenate_seq ( Sequence *S, char *conc, int *order)
// One extra byte is reserved for the terminating NUL written by strcat when every sequence is max_len long.
6229 conc=vcalloc ( S->nseq*S->max_len+1, sizeof (char));
6231 for ( a=0; a< S->nseq; a++)
6233 conc=strcat ( conc, S->seq[order[a]]);
// Transposes A into a new alignment B: column b of A becomes row b of B, so B has A->len_aln rows
// of A->nseq characters. Rows are called "<name>_NN" (zero padded below 10) when name is a non empty
// string, and by their 1-based rank otherwise.
6242 Alignment * rotate_aln ( Alignment *A, char *name)
6247 B=declare_aln2 (A->len_aln, A->nseq+1);
6248 for ( a=0; a< A->nseq; a++)
6249 for ( b=0; b< A->len_aln; b++)
6251 B->seq_al[b][a]=A->seq_al[a][b];
6253 for (a=0; a< A->len_aln; a++)
6254 if (name && name[0])sprintf ( B->name[a], "%s_%s%d", name, (a<9)?"0":"",a+1);
6256 sprintf ( B->name[a], "%d", a+1);
// Terminate every transposed row.
6259 for (a=0; a< A->len_aln; a++)B->seq_al[a][A->nseq]='\0';
// Reverses every aligned string of A in place (left to right becomes right to left).
// Each row is mirrored into a scratch buffer and copied back.
// NOTE(review): the release of buf is not visible in this listing.
6266 Alignment * invert_aln ( Alignment *A)
6271 for ( a=0; a< A->nseq; a++)
6273 l=strlen ( A->seq_al[a]);
6274 buf=vcalloc ( l+1,sizeof (char) );
6276 for ( c=l-1,b=0; b< l; b++, c--)
6278 buf[c]=A->seq_al[a][b];
6281 sprintf ( A->seq_al[a], "%s", buf);
// Replaces every nucleotide of s by its complement, preserving case (A<->T and G<->C pairs in the
// visible lines), and returns the result of invert_string on s, i.e. the reverse complement.
// NOTE(review): the first branch of the chain is not visible in this listing.
6286 char * complement_string (char *s)
6292 for ( b=0; b< l; b++)
6297 else if (r=='A')r='T';
6298 else if (r=='t')r='a';
6299 else if (r=='T')r='A';
6300 else if (r=='g')r='c';
6301 else if (r=='G')r='C';
6302 else if (r=='c')r='g';
6303 else if (r=='C')r='G';
6307 return invert_string (s);
// Applies complement_string to every aligned string of A.
6309 Alignment * complement_aln ( Alignment *A)
6314 for ( a=0; a< A->nseq; a++)
6316 A->seq_al[a]=complement_string (A->seq_al[a]);
// Extracts the region start..max_end of A with extract_aln, then truncates it with trunkate_local_aln.
6322 Alignment * extract_nol_local_aln(Alignment *A, int start, int max_end)
6324 A=extract_aln ( A, start, max_end);
6325 A=trunkate_local_aln (A);
// Extracts from A the block made of the columns listed in in_list and returns it (built by alnpos2block).
// in_list holds n entries, each either a single position "p" or a range "start-end".
// When in_list[0] names an existing file, the entries are read from that file instead, skipping those
// whose second field starts with '!'.
6329 Alignment * alnpos_list2block (Alignment *A, int n, char **in_list)
6334 int list_declared=0;
6337 if (check_file_exists (in_list[0]))
6342 mn=count_n_line_in_file (in_list[0]);
6343 list=declare_char (mn, 100);
6345 tmp_list=file2list (in_list[0], " ");
6350 if (tmp_list[a][1][0]!='!')
6352 sprintf (list[n++], "%s", tmp_list[a][1]);
6356 free_arrayN ((void **)tmp_list, 3);
// pos flags the columns to keep: one int per column of A.
6364 pos=vcalloc (A->len_aln, sizeof (int));
6368 if (strstr (list[a], "-"))
6371 x=sscanf (list[a], "%d-%d", &start, &end);
6372 if (x!=2 || !A || start<=0 || start>=end || end>A->len_aln+1)
6374 add_warning ( stderr, "Illegal coordinates in extract_pos_list [%s]", list[a]);
// Mark the range with a private counter: a is the index of the current list entry and must not be clobbered.
6378 {int col; for (col=start; col<end; col++)pos[col]=1;}
6384 if (p<1 || p>A->len_aln)
6386 add_warning ( stderr, "Illegal coordinates in extract_pos_list [%s]", list[a]);
6392 B=alnpos2block(A, pos, NULL);
6394 if ( list_declared)free_char (list, -1);
// Extracts the columns [start-end[ of A (1-based coordinates, as stated by the warning text) into B.
// Illegal coordinates trigger a warning. The selected columns are flagged in pos and copied by alnpos2block.
// NOTE(review): the statement flagging pos inside the loop is not visible in this listing.
6398 Alignment * aln2block (Alignment *A, int start, int end, Alignment *B)
6400 if ( !A || start<=0 || start>=end || end>A->len_aln+1)
6402 add_warning ( stderr, "Illegal coordinates in extract_block start=%d end=%d len=%d [Note : [start-end[, with [1...n] ** Block Ingored", start, end, A->len_aln);
6410 pos=vcalloc (A->len_aln, sizeof (int));
6411 for (p=start;p<end;p++)
6415 B=alnpos2block (A, pos, B);
// Copies into B the columns of A whose entry in pos is non zero, without modifying A.
// pos holds one flag per column of A (the callers in this file allocate A->len_aln entries).
6420 Alignment * alnpos2block (Alignment *A, int *pos, Alignment *B)
6423 //extract a subset of B without over-writing A
// The loop runs one step past the last column so that the terminating character of each row is copied too.
6428 for (a=0; a<=A->len_aln; a++)
// The bound is tested first: pos has no entry for a==A->len_aln and must not be read there.
6430 if ( a==A->len_aln || pos[a]!=0)
6432 for ( b=0; b<A->nseq; b++)
6433 B->seq_al[b][B->len_aln]=A->seq_al[b][a];
// The terminator does not count as a column.
6434 if ( a!=A->len_aln)B->len_aln++;
// Extracts the region start..end of A, the coordinates being expressed on the "cons" pseudo sequence.
6440 Alignment * extract_aln ( Alignment *A, int start, int end)
6442 return extract_aln2 ( A, start, end, "cons");
// Extracts the region in_start..in_end of A, the coordinates being expressed on sequence seq.
// The request is written as a single "<seq> <start> <end>" line into the file tmp and handed to extract_aln3.
// NOTE(review): the creation of tmp and the closing of fp are not visible in this listing.
6445 Alignment * extract_aln2 ( Alignment *A, int in_start, int in_end, char *seq)
6452 fp=vfopen (tmp, "w");
6453 fprintf ( fp, "%s %d %d\n", seq, in_start, in_end);
6455 return extract_aln3 (A,tmp);
// Extracts from an alignment the columns designated by a coordinate file.
// Each useful line of file reads "<name> <start> <end>" or "<name> <start>"; lines starting with '#' or '!'
// are skipped. name is a sequence of the alignment or "cons" (coordinates counted on every column).
// NOTE(review): the relation between B and A, and the use of offset, depend on lines missing from this listing.
6457 Alignment * extract_aln3 ( Alignment *B, char *file)
6461 int n, i, s, nline=0;
6465 char name[MAXNAMES];
6466 char line[VERY_LONG_STRING];
6475 modifies the incoming alignment
// First pass over the file: one entry per line, looked up by name in B.
6478 offset=vcalloc ( B->nseq+1, sizeof (int));
6479 fp=vfopen (file,"r");
6480 while ( (c=fgetc(fp))!=EOF)
6483 fgets ( line, VERY_LONG_STRING,fp);
6486 sscanf (line, "%s %d", name, &start);
6487 s=name_is_in_list (name,B->name,B->nseq,MAXNAMES);
// col flags the columns of A to keep.
6496 col=vcalloc ( A->len_aln, sizeof (int));
// Second pass over the file: parse and validate each coordinate line.
6498 fp=vfopen ( file, "r");
6499 while ( (c=fgetc(fp))!=EOF)
6502 if ( c=='#' || c=='!')fgets ( line, VERY_LONG_STRING,fp);
6506 fgets ( line, VERY_LONG_STRING,fp);
// The empty statement is deliberate: a successful three field parse needs no further work.
6508 if (sscanf (line, "%s %d %d", name, &start, &end)==3);
6509 else if (sscanf (line, "%s %d", name, &start)==2)
6515 add_warning ( stderr, "Wrong format in coordinate file (line=%d) ** Line Ignored", nline);
// An end of 0 stands for "up to the end of the alignment".
6518 if ( end==0)end=A->len_aln+1;
6520 s=name_is_in_list (name,A->name,A->nseq,MAXNAMES);
6523 if ( s==-1 && !strm (name, "cons"))
6525 add_warning ( stderr, "Seq %s does not belong to the alignment (line %d) ** Line ignored", name,nline);
6528 else if ( start>end)
6530 add_warning ( stderr, "Illegal coordinates [%s %d %d] (line %d) ** Line ignored", name,start, end,nline);
// Walk the columns; i tells whether the column counts as a position (always for "cons", non gap otherwise).
6541 for (n=0, a=0; done!=1 && a< A->len_aln; a++)
6543 i=(strm (name, "cons"))?1:!is_gap(A->seq_al[s][a]);
6546 if (n>=start && n<end)
6551 //if (n>=start && n<end && !(i==0 && n==end-1))
6555 //else if ( n>=end)a=A->len_aln;
6559 HERE ("Warning Missing positions in File %s",file );
6568 /*Extract [start-end[*/
// Compact the kept columns to the left of A, then terminate every row.
6569 for ( b=0,a=0; a< A->len_aln; a++)
6573 for (c=0; c< A->nseq; c++)A->seq_al[c][b]=A->seq_al[c][a];
6579 for (c=0; c< A->nseq; c++)A->seq_al[c][A->len_aln]='\0';
// Truncates A at the first column where a residue that was already met appears a second time.
// cache[seq][pos] counts how often each residue position has been seen while scanning column by column;
// on a repeat, the row is cut at that column. All rows are finally cut to the shortest one and padded with '-'.
// NOTE(review): how seq is derived, and the handling of non positive pos values, are not visible in this listing.
6585 Alignment * trunkate_local_aln ( Alignment *A)
// cache is sized from the largest values found in columns 0 and 1 of A->order.
6593 cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),1)+A->len_aln+1);
6594 pos=aln2pos_simple(A,A->nseq);
6596 for ( b=0; b<A->len_aln; b++)
6597 for ( a=0; a< A->nseq; a++)
6601 else if ( pos[a][b]>0)
6604 if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++;
6605 else if ( cache[seq][pos[a][b]]>=1)
6607 cache[seq][pos[a][b]]++;
6608 A->seq_al[a][b]='\0';
6613 A->len_aln=get_shortest_string ( A->seq_al, A->nseq, NULL, NULL);
6614 pad_string_array ( A->seq_al, A->nseq, A->len_aln, '-');
6617 free_int ( cache,-1);
// Scans A from column start in the given direction and returns the limit column of a non overlapping
// alignment. cache[seq][pos] counts how often each residue position has been seen; a repeat ends the scan.
// When no repeat is met the last scanned column (end-direction) is returned.
// NOTE(review): how seq is derived, and the value returned on a repeat, are not visible in this listing.
6623 int get_nol_aln_border ( Alignment *A, int start, int direction)
6630 /*This Function Returns the limit position for a non overlaping alignment*/
// Both size queries on A->order use sizeof (int*), matching the first query on this line and trunkate_local_aln.
6632 cache=declare_int (return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),0)+1,return_max_int (A->order,read_size_int ( A->order,sizeof (int*)),1)+A->len_aln+1);
6633 pos=aln2pos_simple(A,A->nseq);
6634 end=(direction==GO_RIGHT)?A->len_aln:-1;
6637 for ( b=start; b!=end;b+=direction)
6638 for ( a=0; a< A->nseq; a++)
6642 else if ( pos[a][b]>0)
6645 if (cache[seq][pos[a][b]]==0)cache[seq][pos[a][b]]++;
6646 else if ( cache[seq][pos[a][b]]>=1)
6648 cache[seq][pos[a][b]]++;
6649 free_int(cache, -1);
6655 free_int ( cache,-1);
6657 return end-direction;
// Extracts into out the part of in that lies in the defined stretch next to in_start.
// The stretch is walked in direction dir through aa_def until an UNDEFINED entry is met; its bounds are
// swapped if needed so that start<=end. out_start[0] receives the first extracted position when it was -1.
// NOTE(review): most of the body is missing from this listing; confirm the details against the full source.
6664 char * extract_defined_seq ( char *in, int in_of, int in_start, int *aa_def, int dir, int *out_start, char *out)
6671 if ( dir==GO_LEFT){start=in_start-1;}
6672 else if ( dir==GO_RIGHT){start=in_start+1;}
6675 while (aa_def[end]!=UNDEFINED)
6681 if (end<start)SWAP(end,start);
6685 for (b=0,d=0,c=in_of;b<l; b++)
6688 if ( c>=start && c<=end)
6690 if ( out_start[0]==-1)out_start[0]=c-!is_gap(in[b]);
// Writes replicates of A into files called "<name>.<rank>.rep" (name defaults to "replicate"), announces
// each of them on stdout, then terminates the program through myexit (EXIT_SUCCESS).
// NOTE(review): the number of replicates is presumably parsed from nn on a line missing from this listing.
// NOTE(review): fname is 100 bytes long and filled with an unbounded sprintf of name.
6700 Alignment * aln2N_replicate (Alignment *A,char *nn, char *name)
6705 fname=vcalloc (100, sizeof (char));
6708 if (!name){name=vcalloc (100, sizeof (char)); sprintf (name, "replicate");}
6714 sprintf (fname, "%s.%d.rep",name, a+1);
6715 fp=vfopen (fname, "w");
6717 vfclose(aln2replicate (A, fp));
6718 fprintf ( stdout, ">%s Alignment Replicate #%d\n",fname, a+1);
6720 myexit (EXIT_SUCCESS);
// Writes on fp, in fasta format, a replicate of A: every row is output with its columns taken in the
// order stored in p, an array of A->len_aln column indices.
// tot is the sum of the column weights when A->col_weight is set and A->len_aln otherwise.
// NOTE(review): the loop filling p (and its use of corr) is not visible in this listing.
6722 FILE *aln2replicate (Alignment *A, FILE *fp)
6728 if (A->col_weight)for (a=0; a<A->len_aln; a++)tot+=A->col_weight[a];
6729 else tot=A->len_aln;
6731 p=vcalloc (A->len_aln, sizeof (int));
6732 corr=(float)A->len_aln/tot;
6734 for (a=0; a<A->len_aln; a++)
6742 for (a=0; a<A->nseq; a++)
6744 fprintf ( fp, ">%s\n", A->name[a]);
6745 //for (b=0;b<A->len_aln; b++)fprintf ( stdout, "%d ", (int)p[b]);
6746 for (b=0;b<A->len_aln; b++)fprintf ( fp, "%c", A->seq_al[a][p[b]]);
6747 fprintf ( fp, "\n");
// Builds a concatenated alignment C from A, whose rows are expected to be called "<prefix>_<suffix>".
// C has one row per distinct suffix; for every sequence name of S used as prefix, the row "<S name>_<suffix>"
// of A is appended to the matching row of C, or a filler from generate_null when that row does not exist.
// With mode "voronoi", seq_weight2species_weight is applied first. C->col_weight receives, for every appended
// column, the weight (S->W)->SEQ_W of the S entry it comes from.
6754 Alignment * orthologous_concatenate_aln (Alignment *A, Sequence *S, char *mode)
6757 char **name, *cname;
6761 if (mode && strm (mode, "voronoi"))seq_weight2species_weight (A, S);
6764 cname=vcalloc ( 100, sizeof (char));
6765 name=declare_char (A->nseq, 100);
// Collect the distinct suffixes found in the names of A.
6766 for (a=0; a<A->nseq; a++)
6768 char *p=strstr (A->name[a], "_");
6771 fprintf ( stderr, "\nWARNING: Seq %s could not be included.", A->name[a]);
6774 if ( name_is_in_list (p, name,nname, 100)==-1)
6776 sprintf ( name[nname++], "%s", p);
// Rows are wide enough to hold one block of A->len_aln columns per sequence of S.
6780 C=declare_aln2 (nname, (A->len_aln*S->nseq)+1);
6781 free_char (C->name,-1); C->name=name;
6783 C->col_weight=vcalloc ( A->len_aln*S->nseq, sizeof(float));
6786 for (a=0; a<S->nseq; a++)
6788 for (b=0; b<C->nseq; b++)
6790 sprintf (cname, "%s_%s", S->name[a],C->name[b]);
6791 if ((i=name_is_in_list (cname, A->name, A->nseq, 100))==-1)
6793 char *s=generate_null (A->len_aln);
6794 strcat (C->seq_al[b], s);
6798 strcat (C->seq_al[b], A->seq_al[i]);
6800 for (c=C->len_aln, b=0;b<A->len_aln;b++, c++)
6802 C->col_weight[c]=(S->W)->SEQ_W[a];
6804 C->len_aln+=A->len_aln;
// Concatenates A1 and A2 side by side into a new alignment A.
// Sequences present in both give "<A1 row><spacer><A2 row>"; sequences present in only one of them are
// completed with a run of '-' as long as the other alignment.
// NOTE(review): the spacer is only inserted for shared sequences, and its length is counted neither in the
// declared row length nor in A->len_aln - confirm that callers pass a NULL or empty spacer.
6810 Alignment * concatenate_aln ( Alignment *A1, Alignment *A2, char *spacer)
6815 A=declare_aln2( A1->nseq+A2->nseq , A1->len_aln+A2->len_aln+1);
// Pass 1: every sequence of A1, joined with its A2 counterpart when there is one.
6816 for ( a=0; a< A1->nseq; a++)
6818 if ((i=name_is_in_list ( A1->name[a], A2->name, A2->nseq, 100))!=-1)
6820 sprintf ( A->name[A->nseq], "%s", A1->name[a]);
6821 sprintf (A->seq_al[A->nseq], "%s%s%s", A1->seq_al[a],(spacer)?spacer:"", A2->seq_al[i]);
6827 buf=generate_string (A2->len_aln, '-');
6828 sprintf ( A->name[A->nseq], "%s", A1->name[a]);
6829 sprintf (A->seq_al[A->nseq], "%s%s", A1->seq_al[a], buf);
// Pass 2: the sequences that only exist in A2.
6834 for ( a=0; a< A2->nseq; a++)
6836 if ((i=name_is_in_list ( A2->name[a], A1->name, A1->nseq, 100))==-1)
6839 buf=generate_string (A1->len_aln, '-');
6840 sprintf ( A->name[A->nseq], "%s", A2->name[a]);
6841 sprintf (A->seq_al[A->nseq], "%s%s", buf, A2->seq_al[a]);
6846 A->len_aln=A1->len_aln+A2->len_aln;
// Appends B to the right of A, row by row (row a of B after row a of A; names are not compared).
// Both alignments must have the same number of sequences, otherwise the program exits with EXIT_FAILURE.
// A is reallocated to hold the extra columns and its len_aln is updated.
6849 Alignment * aln_cat ( Alignment *A, Alignment *B)
6853 if ( A->nseq!=B->nseq)
6855 fprintf ( stderr, "\nERROR IN ALN CAT: DIFFERENT NSEQ\n");
6856 myexit(EXIT_FAILURE);
6859 A=realloc_alignment2(A, A->nseq,A->len_aln+B->len_aln+1);
6861 for ( a=0;a< A->nseq; a++)
6863 strcat ( A->seq_al[a], B->seq_al[a]);
6865 A->len_aln+=B->len_aln;
// Checks that the residues of every row of A spell, case insensitively, the matching sequence of S.
// On the first mismatch the message and the offending residues are reported on stderr, the alignment is
// dumped with residue numbers, and the program exits with EXIT_FAILURE.
// NOTE(review): how s and r are obtained for each row is not visible in this listing.
6868 int verify_aln ( Alignment *A, Sequence *S, char *message)
6873 for ( a=0;a< A->nseq; a++)
6877 for ( b=0, c=0; b< A->len_aln; b++)
6879 if ( !is_gap(A->seq_al[a][b]))
6881 if (tolower(A->seq_al[a][b])!=tolower(S->seq[s][c+r]))
6883 fprintf ( stderr, "\n%s\nResidue [%c %d, %c %d] line %d seq %d",message,A->seq_al[a][b], b,S->seq[s][c+r], c+r,a,s);
6884 output_Alignment_with_res_number(A, stderr);
6885 myexit(EXIT_FAILURE);
// Threads the multiple alignment M onto the pairwise alignment PW: row 0 of PW drives rows 0..s-1 of M and
// row 1 of PW drives row s of M. The new rows are assembled in a static scratch array of 500 rows of
// 100000 characters, then copied back into M, whose len_aln becomes PW->len_aln.
// NOTE(review): the gap handling inside both branches is on lines missing from this listing.
6895 Alignment *adjust_est_aln ( Alignment *PW, Alignment *M, int s)
6897 /*This function reajusts M, threading M onto PW
6901 seq 0 PW ----> 0->s-1 in M
6902 seq 1 PW ----> 1->s in M;
6906 static char **array;
6915 array=declare_char (500, 100000);
6918 for ( a=0; a< PW->len_aln; a++)
6920 if ( is_gap(PW->seq_al[0][a]))
6922 for ( b=0; b< s; b++)
6927 for ( b=0; b< s; b++)
6928 array[b][a]=M->seq_al[b][top_M];
6932 if ( is_gap(PW->seq_al[1][a]))
6939 array[s][a]=M->seq_al[s][bottom_M];
6944 M->len_aln=PW->len_aln;
6947 for (b=0; b<PW->len_aln; b++)
6948 M->seq_al[a][b]=array[a][b];
6949 M->seq_al[a][b]='\0';
// rename_seq_in_aln: rename sequences of A according to `list`.
// `list` is walked until an entry whose first string is empty; for each entry,
// if list[n][0] is a name of A, that name is overwritten with list[n][1].
// The same renaming is then applied to A->S through rename_seq_in_seq.
// NOTE(review): unlike rename_seq_in_seq, no NULL guard on A or list is visible
// here, and the sprintf into A->name[i] is unbounded -- confirm name capacity.
6959 Alignment * rename_seq_in_aln (Alignment *A, char ***list)
6967 while ( list[n][0][0])
6969 if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1)
6971 sprintf ( A->name[i], "%s", list[n][1]);
6976 A->S=rename_seq_in_seq (A->S, list);
// rename_seq_in_seq: rename sequences of the Sequence set A according to `list`.
// Returns A untouched when A or list is NULL. Otherwise `list` is walked until
// an entry whose first string is empty, and every list[n][0] found among
// A->name is replaced by list[n][1].
// NOTE(review): the sprintf into A->name[i] is unbounded -- confirm name capacity.
6979 Sequence * rename_seq_in_seq (Sequence *A, char ***list)
6982 if ( !A || !list)return A;
6985 while ( list[n][0][0])
6987 if ( (i=name_is_in_list (list[n][0], A->name, A->nseq, 100))!=-1)
6989 sprintf ( A->name[i], "%s", list[n][1]);
6995 /********************************************************************/
6997 /* FLOAT SIMILARITIES */
7001 /********************************************************************/
// get_seq_fsim: floating point similarity between two aligned strings.
// Only the first len=MIN(strlen(string1),strlen(string2)) positions are read;
// an empty string gives 0. Positions where either character is in `ignore`
// do not contribute. With a matrix, sim accumulates matrix[r1-'A'][r2-'A'];
// without one, is_in_same_group_aa with similarity_set decides the match.
// Result by MODE:
//   UNGAPED_POSITIONS -> sim*100/pos
//   ALIGNED_POSITIONS -> sim*100/len
//   AVERAGE_POSITIONS -> sim*200/(nr1+nr2)
// NOTE(review): no visible guard against pos==0 or nr1+nr2==0, and the
// r1-'A' index assumes upper-case letters -- confirm how r1/r2 are normalised.
7002 float get_seq_fsim ( char *string1, char *string2, char *ignore, char *similarity_set,int **matrix, int MODE )
7004 int len, a, r1, r2, nr1=0, nr2=0;
7008 len=MIN((strlen (string1)),(strlen (string2)));
7009 if ( len==0)return 0;
7011 for ( a=0; a< len; a++)
7019 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
7022 if ( matrix)sim+=matrix[r1-'A'][r2-'A'];
7023 else if (is_in_same_group_aa(r1,r2,0, NULL,similarity_set))
7029 if ( MODE==UNGAPED_POSITIONS)return ( sim*100)/pos;
7030 else if ( MODE==ALIGNED_POSITIONS)return (sim*100)/len;
7031 else if ( MODE==AVERAGE_POSITIONS)return (sim*200)/(nr1+nr2);
// get_seq_fsim2: float counterpart of get_seq_sim.
// in_mode is copied into a local `mode`; p is the result of searching `mode`
// for "_" and selects the formula below. When mode contains "idscore" the two
// strings are aligned and scored by idscore_pairseq (blosum62mt, loaded when
// `mat` is still NULL, gop -12, gep -1).
// Otherwise the strings are scanned over strlen(string1) positions, with
// is_in_set(.., ignore) deciding which positions count, and the result is:
//   p==NULL, "sim1", "sim" -> sim*MAXID/pos0            (0 when pos0==0)
//   "sim2"                 -> sim*MAXID/MIN(pos1,pos2)  (0 when either is 0)
//   "sim3"                 -> sim*MAXID/MAX(pos1,pos2)  (0 when either is 0)
//   "gap1"                 -> gap*MAXID/len1            (MAXID when len1==0)
//   "logid"                -> logid_score(pos0, sim)
// NOTE(review): logid_score is defined as logid_score(sim, len) and the
// fast_aln2sim_* callers pass (sim, len); here the call is (pos0, sim), which
// looks like swapped arguments -- confirm.
// NOTE(review): the scan length comes from string1 only; nothing visible checks
// that string2 is at least as long.
7038 float get_seq_fsim2 ( char *string1, char *string2, char *ignore, char *in_mode)
7046 float r=0, pos1, pos2, pos0, gap, sim;
7049 sprintf ( mode, "%s", in_mode);
7051 /*mode: <mat>__<sim_mode>
7052 mat: idscore to get the alignment done
7054 sim_mode: sim1->identities/matches
7055 sim2->identities/min len
7059 if ( (p=strstr (mode, "_"))!=NULL)
7066 if (strstr (mode, "idscore"))
7069 if (!mat) mat=read_matrice ("blosum62mt");
7070 return idscore_pairseq (string1, string2, -12, -1, mat,mode);
7074 len1=strlen (string1);
7075 for ( sim=pos1=pos2=pos0=gap=0,a=0; a< len1; a++)
7079 p1=1-is_in_set (r1, ignore);
7080 p2=1-is_in_set (r2, ignore);
7085 if (is_in_same_group_aa(r1,r2,0, NULL, mode))
7096 if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
7098 r=(pos0==0)?0:(sim*MAXID)/pos0;
7100 else if ( strm (p, "sim2"))
7102 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
7104 else if ( strm (p, "sim3"))
7106 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
7108 else if ( strm (p, "gap1"))
7110 r=(len1==0)?MAXID:(gap*MAXID)/len1;
7113 else if ( strm (p, "logid"))
7115 r=logid_score (pos0, sim);
7122 /********************************************************************/
7124 /* ALIGNMENT ANALYSES */
7128 /********************************************************************/
// dist_array2sim_array: convert a distance matrix into a similarity matrix in
// place by replacing every cell with max - cell.
// The dimensions are recovered with read_array_size: s1 from the row-pointer
// array and s2 from row 0 using sizeof(int) (see the 64-bit note below).
7129 int **dist_array2sim_array ( int **p, int max)
7132 s1=read_array_size ((void *)p, sizeof (void *));
7133 s2=read_array_size ((void*)p[0],sizeof (int));
7134 /* s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 BITS*/
7135 for ( a=0; a< s1; a++)
7136 for ( b=0; b< s2; b++)
7138 p[a][b]=max-p[a][b];
// sim_array2dist_array: convert a similarity matrix into a distance matrix in
// place by replacing every cell with max - cell (same arithmetic as
// dist_array2sim_array).
// The dimensions are recovered with read_array_size: s1 from the row-pointer
// array and s2 from row 0 using sizeof(int).
7143 int **sim_array2dist_array ( int **p, int max)
7146 s1=read_array_size ((void *)p, sizeof (void *));
7147 s2=read_array_size ((void*)p[0],sizeof (int));
7149 /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/
7150 for ( a=0; a< s1; a++)
7151 for ( b=0; b< s2; b++)
7153 p[a][b]=max-(int)p[a][b];
// normalize_array: rescale every cell of p in place to (cell*norm)/max using
// integer arithmetic. Dimensions come from read_array_size as in the two
// conversion helpers above.
// NOTE(review): max==0 would divide by zero and cell*norm can overflow int for
// large inputs; no guard is visible.
7158 int **normalize_array (int **p, int max, int norm)
7161 s1=read_array_size ((void *)p, sizeof (void *));
7162 s2=read_array_size ((void*)p[0],sizeof (int));
7164 /*s2=read_array_size ((void*)p[0],sizeof (void *)); OLD before 64 Bits stuff*/
7165 for ( a=0; a< s1; a++)
7166 for ( b=0; b< s2; b++)
7168 p[a][b]=(p[a][b]*norm)/max;
// aln2most_similar_sequence: return the index of the row of A with the largest
// coverage-weighted similarity to all rows.
// A single-sequence alignment returns 0. The pairwise similarities w come from
// get_sim_aln_array(A, mode). For each row a, buf receives a copy of the row
// and coverage=strlen(buf)*MAXID/A->len_aln (buf is presumably ungapped before
// the strlen -- TODO confirm); avg sums w[a][b]*coverage over every b and the
// row with the strictly largest avg wins (ties keep the earliest row).
// NOTE(review): avg is an int; w*coverage summed over nseq may overflow for
// large alignments. Release of buf and w is not visible here.
7173 int aln2most_similar_sequence ( Alignment *A, char *mode)
7177 int avg, best_avg=0, best_seq=0;
7183 else if ( A->nseq==1)return 0;
7186 buf=vcalloc ( A->len_aln+1, sizeof (char));
7187 w=get_sim_aln_array ( A, mode);
7189 for ( a=0; a< A->nseq; a++)
7191 sprintf ( buf, "%s", A->seq_al[a]);
7193 coverage=(strlen(buf)*MAXID)/A->len_aln;
7195 for ( avg=0,b=0; b< A->nseq; b++)avg+=w[a][b]*coverage;
7196 if ( avg>best_avg){best_avg=avg; best_seq=a;}
// aln2coverage: how much of ref_seq is covered by the other sequences.
// For every column where ref_seq has a residue, cov_pos is incremented once if
// at least one other row also has a residue there (the inner loop breaks on the
// first hit). npos is presumably the count of ref_seq residues.
// Returns MAXID*cov_pos/npos, or 0 when npos is 0.
// NOTE(review): the (int) cast binds to (npos==0) only, not to the whole
// conditional expression.
7205 int aln2coverage ( Alignment *A, int ref_seq)
7208 int cov_pos=0, npos=0;
7211 for ( a=0; a< A->len_aln; a++)
7213 if ( !is_gap ( A->seq_al[ref_seq][a]))
7216 for ( b=0; b< A->nseq; b++)
7218 if ( b!=ref_seq && !is_gap ( A->seq_al[b][a])){cov_pos++;break;}
7223 return (int) (npos==0)?0:(( MAXID*cov_pos)/npos);
// sub_aln2sim: average pairwise similarity (generic_get_seq_sim with `mode`).
// Returns -1 when A is NULL, or when ns is NULL and A has fewer than 2 rows.
// Two pair enumerations are present: every pair a<b of A (presumably used when
// ns is NULL), and every pair between group ls[0] (ns[0] rows) and group ls[1]
// (ns[1] rows). n counts the pairs; the result is avg/n, or 0 when n is 0.
// NOTE(review): the (int) cast binds to (n==0) only; the float quotient is
// converted by the function's int return type.
7227 int sub_aln2sim ( Alignment *A, int *ns, int **ls, char *mode)
7233 if (!A || (ns==NULL && A->nseq<2))return -1;
7236 for (a=0; a< A->nseq-1; a++)
7237 for ( b=a+1; b< A->nseq;b++, n++)
7238 avg+=generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode);
7242 for (a=0; a<ns[0]; a++)
7243 for (b=0; b< ns[1]; b++, n++)
7245 avg+=generic_get_seq_sim (A->seq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode);
7248 return (int)(n==0)?0:((float)avg/(float)n);
// sub_aln2max_sim: same pair enumeration as sub_aln2sim, but keeps the largest
// pairwise similarity (stored in the variable named avg) instead of the mean.
// Returns -1 when A is NULL, or when ns is NULL and A has fewer than 2 rows.
// NOTE(review): if MAX is the usual two-evaluation macro, generic_get_seq_sim
// may be called twice per pair -- confirm the macro definition.
7250 int sub_aln2max_sim ( Alignment *A, int *ns, int **ls, char *mode)
7256 if (!A || (ns==NULL && A->nseq<2))return -1;
7259 for (a=0; a< A->nseq-1; a++)
7260 for ( b=a+1; b< A->nseq;b++, n++)
7261 avg=MAX(avg,generic_get_seq_sim (A->seq_al[a], A->seq_al[b], NULL, mode));
7265 for (a=0; a<ns[0]; a++)
7266 for (b=0; b< ns[1]; b++, n++)
7268 avg=MAX(avg,generic_get_seq_sim (A->seq_al[ls[0][a]], A->seq_al[ls[1][b]], NULL, mode));
// aln2entropy: accumulate a column entropy term over a set of rows of A.
// ls is the working row list: a copy of in_ls, or 0..ns-1 (presumably when no
// list is supplied). One early exit frees ls/count and returns 0.
// For each column the gaps are counted over the selected rows; columns with
// ng>gap_threshold are skipped. For kept columns, count[] tallies the
// lower-cased residues, each non-zero count is presumably turned into a
// frequency (count/ns), and entropy accumulates count*log(count).
// NOTE(review): count has 26 slots indexed by r-'a'; a non-gap character
// outside a-z would index out of range -- confirm is_gap covers all others.
7275 double aln2entropy (Alignment *A, int *in_ls, int in_ns, float gap_threshold)
7277 int ns, a, s, col, r,ncol;
7283 ls=vcalloc ( A->nseq, sizeof (int));
7284 count=vcalloc ( 26, sizeof (double));
7290 for ( a=0; a< ns; a++)ls[a]=in_ls[a];
7295 for ( a=0; a< ns; a++)ls[a]=a;
7300 vfree(ls);vfree(count);return 0;
7302 for (ncol=0,col=0; col<A->len_aln; col++)
7304 for (ng=0,a=0; a< ns; a++)
7307 ng+=is_gap(A->seq_al[s][col]);
7310 if ( ng>gap_threshold)continue;
7314 for ( a=0; a<ns; a++)
7317 r=tolower(A->seq_al[s][col]);
7318 if (!is_gap(r))count[r-'a']++;
7320 for (a=0; a<26; a++)
7325 count[a]/=(double)ns;
7327 entropy+=count[a]*log(count[a]);
7333 vfree (ls); vfree(count);
// aln2sim: average pairwise similarity of A; delegates to
// sub_aln2sim(A, NULL, NULL, mode).
// NOTE(review): everything after the first return statement is unreachable.
// It is an older implementation based on get_sim_aln_array, kept in place.
7337 int aln2sim ( Alignment *A, char *mode)
7339 return sub_aln2sim ( A, NULL, NULL, mode);
7341 if ( !A || A->nseq<2) return -1;
7342 w=get_sim_aln_array ( A, mode);
7344 for (c=0, a=0; a< A->nseq-1; a++)
7345 for ( b=a+1; b< A->nseq; b++, c++)
7347 avg+=(float)w[a][b];
7350 return (int)((float)avg/(float)c);
// aln_is_aligned: heuristic test for whether A contains internal/leading gaps.
// Each row is scanned from the last column down to column 1; the function
// returns 1 as soon as a residue is found immediately after a gap. The value
// returned when no such position exists is presumably 0.
7354 int aln_is_aligned ( Alignment *A)
7359 for (a=0; a< A->nseq; a++)
7360 for ( b=A->len_aln-1; b>0; b--)
7362 if (!is_gap(A->seq_al[a][b]) && is_gap(A->seq_al[a][b-1]))return 1;
// seq2aln2sim_old: align seq1 and seq2 with align_two_sequences (pam250mt,
// gap opening -10, gap extension -1, algorithm mode_aln) and measure the
// similarity of the result with aln2sim(A, mode_id).
7368 int seq2aln2sim_old ( char *seq1, char *seq2, char *mode_aln, char *mode_id)
7373 A=align_two_sequences (seq1, seq2, "pam250mt", -10, -1, mode_aln);
7374 sim=aln2sim (A, mode_id);
// seq2aln2sim: align seq1 and seq2 with blosum62mt and measure their
// similarity with aln2sim(A, mode_id).
// The gap opening penalty is get_avg_matrix_mm(m, AA_ALPHABET)*10 computed on
// the blosum62mt matrix m; the gap extension penalty is -1.
7378 int seq2aln2sim ( char *seq1, char *seq2, char *mode_aln, char *mode_id)
7387 m=read_matrice ("blosum62mt");
7388 gop=get_avg_matrix_mm(m, AA_ALPHABET)*10;
7392 A=align_two_sequences (seq1, seq2, "blosum62mt",gop,-1, mode_aln);
7393 sim=aln2sim (A, mode_id);
// get_cdna_seq_winsim: windowed cDNA similarity placeholder.
// A length mismatch between the two strings is fatal (fatal_exit).
// A single global value x is computed with get_cdna_seq_sim and the loop over
// len1 presumably stores it at every position of w; a warning states that the
// windowed version is not implemented for cDNA.
// NOTE(review): get_cdna_seq_sim reads mode+4, but "" is passed here, so that
// pointer is past the end of the literal -- confirm and pass a real mode.
7397 int* get_cdna_seq_winsim ( int *cache, char *string1, char *string2, char *ignore, char *mode,int *w )
7403 len1=strlen (string1);
7404 len2=strlen (string2);
7408 fatal_exit( stderr,EXIT_FAILURE, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM);
7411 x=get_cdna_seq_sim(cache, string1, string2, ignore, "");
7412 for ( a=0; a< len1; a++)
7415 add_warning (stderr, "\nWARNING: winsim not implemented for cDNA");
// get_cdna_seq_sim: similarity of two aligned cDNA strings at codon level.
// A length mismatch is reported on stderr. The scan is driven by cache[]:
// positions with cache[a]==0 are skipped one nucleotide at a time; positions
// with cache[a]==1 are translated from both strings with translate_dna_codon
// and compared with is_in_same_group_aa, using mode+4 as the group name
// (i.e. the text after a 4-character prefix of mode), unless either
// translated residue is in `ignore`.
// Returns sim*MAXID/pos.
// NOTE(review): no visible guard against pos==0; mode must be at least 4
// characters long for mode+4 to be valid.
7419 int get_cdna_seq_sim ( int *cache, char *string1, char *string2, char *ignore, char *mode)
7428 len1=strlen (string1);
7429 len2=strlen (string2);
7435 fprintf ( stderr, "\nTHE TWO cDNAs DO NOT HAVE THE SAME LENGTH [FATAL:get_cdna_seq_sim:%s", PROGRAM);
7439 for ( a=0; a< len1;)
7442 if ( cache[a]==0){a++;continue;}
7443 else if ( cache[a]==1)
7446 r1=translate_dna_codon (string1+a, 'x');
7447 r2=translate_dna_codon (string2+a, 'x');
7451 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
7454 if (is_in_same_group_aa(r1,r2,0, NULL,mode+4))
7466 return (int) (sim*MAXID)/pos;
// get_seq_winsim: per-position similarity of two aligned strings, written to w.
// Returns 0 (a null pointer) when the strings differ in length.
// When window is 0, or the window (2*window+1) is not smaller than the
// sequence, every w[a] receives the global get_seq_sim value.
// Otherwise, for each position a the range [left,right) with
// left=MAX(0,a-window) and right=MIN(len1,left+len) is scanned, positions
// containing a character from `ignore` are skipped, and w[a]=sim*MAXID/len.
// `window` and `len` are set by statements not restated here (len is
// presumably the window width -- TODO confirm).
7470 int* get_seq_winsim ( char *string1, char *string2, char *ignore, char *mode, int*w)
7472 int len1, len2, len;
7479 len1=strlen (string1);
7480 len2=strlen (string2);
7484 if ( len1!=len2)return 0;
7485 if (window==0 || (window*2+1)>=len1)
7487 sim=get_seq_sim (string1, string2, ignore, "");
7488 for (a=0; a<len1; a++)w[a]=sim;
7493 for ( a=0; a< len1; a++)
7496 left =MAX(0, a-window);
7497 right=MIN(len1, left+len);
7498 for (sim=0,b=left; b<right; b++)
7502 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
7507 w[a]=(sim*MAXID)/len;
// get_seq_sim: integer similarity of two aligned strings under `in_mode`.
// `mode` is a 100-byte buffer allocated on first use; leading '_' characters
// of in_mode are skipped before it is copied. p is the result of searching
// `mode` for "_" and selects the formula below. When mode contains "idscore"
// the strings are aligned and scored by idscore_pairseq (blosum62mt loaded
// when `mat` is still NULL, gop -12, gep -1).
// Otherwise the strings are scanned over strlen(string1) positions and:
//   mode contains "cov"     -> pos0*MAXID/(pos0+gap)      (0 when empty)
//   p==NULL, "sim1", "sim"  -> sim*MAXID/pos0             (0 when pos0==0)
//   "sim2"                  -> sim*MAXID/MIN(pos1,pos2)
//   "sim3"                  -> sim*MAXID/MAX(pos1,pos2)
//   "gap1"                  -> gap*MAXID/len1             (MAXID when len1==0)
//   "logid"                 -> logid_score(pos0, sim)
//   mode contains "sim"     -> sim*MAXID/pos0 (fallback)
// NOTE(review): logid_score is defined as logid_score(sim, len); the call
// here passes (pos0, sim), which looks like swapped arguments -- confirm.
// NOTE(review): in_mode is copied into the 100-byte buffer with an unbounded
// sprintf.
7513 int get_seq_sim ( char *string1, char *string2, char *ignore, char *in_mode)
7517 int pos1, pos2, pos0,gap=0, sim;
7523 if (!mode)mode=vcalloc (100, sizeof (char));
7527 while (in_mode[0]=='_')in_mode++;
7528 sprintf ( mode, "%s", in_mode);
7531 /*mode: <mat>__<sim_mode>
7532 mat: idscore to get the alignment done
7534 sim_mode: sim1->identities/matches
7535 sim2->identities/min len
7539 if ( (p=strstr (mode, "_"))!=NULL)
7546 if (strstr (mode, "idscore"))
7549 if (!mat) mat=read_matrice ("blosum62mt");
7550 return idscore_pairseq (string1, string2, -12, -1, mat,mode);
7553 len1=strlen (string1);
7554 for ( sim=pos1=pos2=pos0=0,a=0; a< len1; a++)
7558 p1=1-is_in_set (r1, ignore);
7559 p2=1-is_in_set (r2, ignore);
7565 if (is_in_same_group_aa(r1,r2,0, NULL, mode))
7576 if ( strstr (mode, "cov"))
7578 r=(pos0+gap==0)?0:(pos0*MAXID)/(pos0+gap);
7580 else if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
7582 r=(pos0==0)?0:(sim*MAXID)/pos0;
7584 else if ( strm (p, "sim2"))
7586 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
7588 else if ( strm (p, "sim3"))
7590 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
7592 else if ( strm (p, "gap1"))
7594 r=(len1==0)?MAXID:(gap*MAXID)/len1;
7597 else if ( strm (p, "logid"))
7599 r=logid_score (pos0, sim);
7601 else if ( strstr (mode, "sim"))
7603 r=(pos0==0)?0:(sim*MAXID)/pos0;
// get_seq_sim_2: similarity of two aligned strings using an explicit list of
// residue groups (gr, ng entries) passed to is_in_same_group_aa.
// Returns 0 when the strings differ in length. Positions where either
// character is in `ignore` are skipped. The result is sim*MAXID/pos.
// NOTE(review): no visible guard against pos==0.
7610 int get_seq_sim_2 ( char *string1, char *string2, char *ignore, char **gr, int ng)
7620 len1=strlen (string1);
7621 len2=strlen (string2);
7623 if ( len1!=len2)return 0;
7625 for ( a=0; a< len1; a++)
7629 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
7632 if (is_in_same_group_aa(r1,r2,ng, gr, NULL))
7642 return (int) (sim*MAXID)/pos;
// get_seq_sim_3: raw substitution score of two aligned strings.
// Returns 0 when the strings differ in length. For every position where
// neither character is in `ignore`, mat[r1-'A'][r2-'A'] is added to sim.
// NOTE(review): the r-'A' index assumes r1/r2 are upper-case letters --
// confirm how they are normalised before indexing.
7646 int get_seq_sim_3 ( char *string1, char *string2, char *ignore, int **mat)
7656 len1=strlen (string1);
7657 len2=strlen (string2);
7659 if ( len1!=len2)return 0;
7661 for ( a=0; a< len1; a++)
7665 if ( !is_in_set (r1, ignore) && !is_in_set (r2, ignore))
7667 sim+=mat[r1-'A'][r2-'A'];
// get_aln_col_weight: one weight per column of A.
// Each column is copied into `col` (one int per row) and
// weight[a]=find_group_aa_distribution(col, nseq, 0, NULL, NULL, mode)*MAXID/nseq.
// `weight` holds A->len_aln ints allocated here; ownership presumably passes
// to the caller.
7673 int * get_aln_col_weight ( Alignment *A, char *mode)
7679 col=vcalloc ( A->nseq, sizeof (int));
7680 weight=vcalloc (A->len_aln, sizeof (int));
7682 for (a=0; a< A->len_aln; a++)
7684 for ( b=0; b< A->nseq; b++)
7685 col[b]=A->seq_al[b][a];
7686 weight[a]=(find_group_aa_distribution (col, A->nseq,0,NULL,NULL, mode )*MAXID)/A->nseq;
// analyse_aln_column: ClustalW-style conservation symbol for column `col` of B.
// The three residue group sets ("idmat", "clustalw_col", "clustalw_dot") are
// kept in static variables (presumably built only on the first call). The
// per-group counters are allocated and freed on every call.
// Scan: the first gap found in the column sets r=' ' and stops the scan;
// otherwise each lower-cased residue increments the counter of every group
// that contains it.
// Result, tested in this order while r is still ' ' and the last character
// read is not a gap:
//   '*' when one idmat group contains all B->nseq residues
//   ':' when one clustalw_col group does (skipped for DNA/RNA)
//   '.' when one clustalw_dot group does (skipped for DNA/RNA)
7693 int analyse_aln_column ( Alignment *B, int col)
7699 static int ng_cw_star;
7700 static char **cw_star;
7703 static int ng_cw_col;
7704 static char **cw_col;
7707 static int ng_cw_dot;
7708 static char **cw_dot;
// The sequence type is needed for the DNA/RNA test below.
7716 if ( !B->S || !(B->S)->type)B= get_aln_type (B);
7718 if ( !mat)mat=vcalloc ( STRING, sizeof (char));
7722 cw_star=make_group_aa ( &ng_cw_star, strcpy ( mat,"idmat"));
7723 cw_col=make_group_aa ( &ng_cw_col, strcpy (mat,"clustalw_col"));
7724 cw_dot=make_group_aa ( &ng_cw_dot, strcpy (mat, "clustalw_dot"));
7727 cw_star_count=vcalloc (ng_cw_star, sizeof (int));
7728 cw_col_count=vcalloc ( ng_cw_col, sizeof (int));
7729 cw_dot_count=vcalloc (ng_cw_dot, sizeof (int));
7731 for ( a=0; a< B->nseq; a++)
7733 c=tolower (B->seq_al[a][col]);
7734 if (is_gap(c)){r=' ';break;}
7736 for ( b=0; b< ng_cw_star; b++)
7737 cw_star_count[b]+=is_in_set (c, cw_star[b]);
7738 for ( b=0; b< ng_cw_col; b++)
7739 cw_col_count[b]+=is_in_set (c, cw_col[b]);
7740 for ( b=0; b< ng_cw_dot; b++)
7741 cw_dot_count[b]+=is_in_set (c, cw_dot[b]);
7748 if ( !is_gap(c) && r==' ')
7749 for ( b=0; b< ng_cw_star; b++)if ( cw_star_count[b]==B->nseq){r='*'; break;}
7750 if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA")))
7751 for ( b=0; b< ng_cw_col ; b++)if ( cw_col_count [b]==B->nseq){r=':'; break;}
7752 if ( !is_gap(c) && r==' ' && !(strm((B->S)->type, "DNA")||strm ((B->S)->type,"RNA")))
7753 for ( b=0; b< ng_cw_dot ; b++)if ( cw_dot_count [b]==B->nseq){r='.'; break;}
7757 vfree(cw_star_count);
7758 vfree(cw_col_count);
7759 vfree(cw_dot_count);
7765 int ** get_cov_aln_array ( Alignment *A, char *mode)
7770 w=declare_int ( A->nseq, A->nseq);
7773 for ( a=0; a< A->nseq-1; a++)
7776 for ( t=0,b=a+1; b< A->nseq; b++)
7778 for ( c=0; c< A->len_aln; c++)
7780 t+=(!is_gap(A->seq_al[a][c]) &&!is_gap(A->seq_al[b][c]));
7782 w[a][b]=w[b][a]=(t*100)/A->len_aln;
7788 int ** get_cov_master_aln_array ( Alignment *A,int n, char *mode)
7793 w=declare_int ( A->nseq, A->nseq);
7796 for (b=0; b< A->nseq; b++)
7799 for (t=0, c=0; c< A->len_aln; c++)
7801 t+=(!is_gap(A->seq_al[n][c]) &&!is_gap(A->seq_al[n][c]));
7803 w[n][b]=w[b][n]=(t*100)/A->len_aln;
// get_sim_master_aln_array: similarity of every row of A against master row n.
// Fills w[n][a] and w[a][n] for each row a: with mode "cdna" the value comes
// from get_cdna_seq_sim (using A->cdna_cache[0]), otherwise from get_seq_sim.
// Both calls ignore the characters of GAP_LIST. Other cells of the
// nseq x nseq matrix are left as declare_int returned them.
7808 int ** get_sim_master_aln_array ( Alignment *A,int n, char *mode)
7813 w=declare_int ( A->nseq, A->nseq);
7816 for ( a=0; a< A->nseq; a++)
7818 if ( strm (mode, "cdna"))
7819 w[n][a]=w[a][n]=get_cdna_seq_sim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[n],GAP_LIST, mode);
7821 w[n][a]=w[a][n]=get_seq_sim ( A->seq_al[n], A->seq_al[a],GAP_LIST, mode);
// get_dist_aln_array: pairwise distance matrix of A, obtained by computing the
// similarity matrix (get_sim_aln_array) and converting it in place with
// sim_array2dist_array(w, MAXID), i.e. distance = MAXID - similarity.
7825 int ** get_dist_aln_array ( Alignment *A, char *mode)
7830 w=get_sim_aln_array ( A, mode);
7831 return sim_array2dist_array(w,MAXID);
// seq2filter: keep the sequences that have at least one partner whose pairwise
// identity lies strictly between min and max.
// Works on an ungapped duplicate of Sin. Every pair (a,b), a<b, is scored with
// idscore_pairseq (gop -10, gep -2, mode "sim"); when min<sim<max both members
// are flagged in keep[]. Sequences are then written in FASTA form to a
// temporary file (presumably only the flagged ones, counted in n) which is
// read back with main_read_seq.
// Returns NULL when n is 0.
// NOTE(review): the matrix name "blossum62mt" differs from the "blosum62mt"
// spelling used by the other callers of read_matrice in this file -- likely
// a typo.
// NOTE(review): the early `return NULL` precedes the frees of M, keep and S,
// so those are not released on that path.
// NOTE(review): the per-pair fprintf to stderr looks like leftover debugging.
7833 Sequence * seq2filter (Sequence *Sin, int min, int max)
7843 S=duplicate_sequence (Sin);
7844 for (a=0; a<S->nseq; a++)ungap(S->seq[a]);
7845 keep=vcalloc (S->nseq, sizeof (int));
7846 M=read_matrice ("blossum62mt");
7847 for (a=0; a<S->nseq; a++)
7849 output_completion ( stderr, a, S->nseq, 100, "Distance Matrix Computation: ");
7850 for ( b=a+1; b<S->nseq; b++)
7853 sim=idscore_pairseq(S->seq[a], S->seq[b],-10, -2,M, "sim");
7854 if ( sim>min && sim<max)keep[a]=keep[b]=1;
7855 fprintf ( stderr, "\nSim %d Min %d Max %d", sim, min, max);
7859 tmpfile=vtmpnam (NULL);
7860 fp=vfopen (tmpfile, "w");
7861 for (n=0,a=0; a< S->nseq; a++)
7864 fprintf ( fp, ">%s %s\n%s", S->name[a], S->seq_comment[a], S->seq[a]);
7868 if (n==0) return NULL;
7869 Sout=main_read_seq(tmpfile);
7870 free_int (M, -1); vfree (keep); free_sequence (S, -1);
// grep_seq: select sequences of S by matching `string` (via perl_strstr)
// against one field of each sequence.
//   field : "NAME", "COMMENT" or "SEQ" -- which text is searched
//   mode  : "KEEP" retains matching sequences, "REMOVE" retains the others
// Any other field or mode value reports the expected usage and calls
// myexit(EXIT_FAILURE). An empty comment never matches.
// Retained sequences are written in FASTA form to the temporary file `tmp`
// and the result is read back with main_read_aln.
// Returns NULL when n is 0 (n presumably counts the retained sequences).
7874 Alignment * grep_seq (Alignment *S,char *field, char *mode, char *string)
7882 fp=vfopen (tmp, "w");
7884 if ( !strm(mode, "KEEP") && ! strm (mode, "REMOVE"))
7886 add_warning ( stderr, "\nERROR: +grep <field> <KEEP|REMOVE> <string> [FATAL: %s]", PROGRAM);
7887 myexit (EXIT_FAILURE);
7889 else if ( !strm(field, "SEQ") && ! strm (field, "COMMENT") && ! strm(field, "NAME"))
7891 add_warning ( stderr, "\nERROR: +grep <NAME|COMMENT|SEQ> <mode> <string> [FATAL: %s]", PROGRAM);
7892 myexit (EXIT_FAILURE);
7896 for (n=0, a=0; a< S->nseq; a++)
7900 if (strm(field, "NAME") && perl_strstr (S->name[a], string))found=1;
7901 else if (strm(field, "COMMENT") && S->seq_comment[a][0] && perl_strstr (S->seq_comment[a], string) )found=1;
7902 else if (strm(field, "SEQ") && perl_strstr (S->seq_al[a], string))found=1;
7904 if ( (strm (mode, "KEEP") && found) || (strm (mode, "REMOVE") && !found))
7907 fprintf (fp, ">%s", S->name[a]);
7908 if (S->seq_comment[a][0])fprintf (fp, " %s", S->seq_comment[a]);
7909 fprintf (fp, "\n%s\n", S->seq_al[a]);
7916 if ( n==0) return NULL;
7918 return main_read_aln (tmp, NULL);
// modify_seq: apply substitute(text, string1, string2) to one field of every
// sequence of S ("NAME", "COMMENT" or "SEQ"), write all sequences in FASTA
// form to the temporary file `tmp`, and reload the result with main_read_aln.
// NOTE(review): the "COMMENT" branch rewrites S->seq_comment[a], but the
// header line is written from S->aln_comment[a]; grep_seq writes
// seq_comment. The edited comment may therefore never reach the output --
// confirm which field is intended.
7921 Alignment * modify_seq (Alignment *S, char *field, char *string1, char *string2)
7928 fp=vfopen (tmp, "w");
7929 for ( a=0; a< S->nseq; a++)
7931 if (strm(field, "NAME"))S->name[a]=substitute ( S->name[a], string1, string2);
7932 else if (strm(field, "COMMENT"))S->seq_comment[a]=substitute ( S->seq_comment[a], string1, string2);
7933 else if (strm(field, "SEQ"))S->seq_al[a]=substitute ( S->seq_al[a], string1, string2);
7934 fprintf (fp, ">%s", S->name[a]);
7935 if (S->aln_comment[a][0])fprintf (fp, " %s", S->aln_comment[a]);
7936 fprintf (fp, "\n%s\n", S->seq_al[a]);
7940 S=main_read_aln (tmp, NULL);
// seq2sim_mat: pairwise similarity matrix of S; thin wrapper around
// seq2comp_mat with comparison mode "sim".
7944 int ** seq2sim_mat (Sequence *S, char *mode)
7946 return seq2comp_mat ( S,mode, "sim");
// seq2cov_mat: pairwise coverage matrix of S; thin wrapper around
// seq2comp_mat with comparison mode "cov".
7948 int ** seq2cov_mat (Sequence *S, char *mode)
7950 return seq2comp_mat ( S,mode, "cov");
// seq2comp_mat: pairwise comparison matrix of the sequences of S.
//   mode      : substitution matrix name given to read_matrice
//   comp_mode : comparison mode given to idscore_pairseq ("sim" and "cov" are
//               the values used by the wrappers in this file)
// A cache file <cache dir><file name of S->file[0]>.<mode>.<comp_mode>_file is
// tried first: when it exists, is a distance matrix file and loads through
// input_similarities, its content is used. Otherwise every pair a<b is scored
// with idscore_pairseq (gop -12, gep -1), stored symmetrically, and the
// matrix is written to the cache file with output_similarities.
// NOTE(review): the matrix address is passed to output_similarities as text
// ("_memory_%ld" with a cast to long int); on platforms where long is
// narrower than a pointer this truncates -- confirm.
7953 int ** seq2comp_mat (Sequence *S, char *mode, char *comp_mode)
7962 /*Use pre_computed value if available in the current dir*/
7964 name=path2filename(S->file[0]);
7965 sprintf ( file, "%s%s.%s.%s_file", get_cache_dir(),name, mode, comp_mode);
7966 A=seq2aln(S,NULL, RM_GAP);
7967 if ( check_file_exists (file) && is_distance_matrix_file (file) && (sim=input_similarities(file, A, NULL))!=NULL)
7969 display_input_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK);
7970 fprintf ( stderr, "\n");
7977 M=read_matrice (mode);
7978 sim=declare_int ( S->nseq, S->nseq);
7979 for ( a=0; a< S->nseq; a++)
7985 for ( a=0; a<S->nseq-1; a++)
7988 output_completion4halfmat ( stderr, a, S->nseq, 100, "Similarity Matrix Computation: ");
7989 for ( b=a+1; b< S->nseq; b++)
7991 sim[a][b]=sim[b][a]=idscore_pairseq(S->seq[a], S->seq[b],-12, -1,M, comp_mode);
7995 sprintf ( mode2, "_memory_%ld", (long int)sim);
7996 output_similarities( file, A, mode2);
7997 display_output_filename (stderr, "SIMILARITY_MATRIX", "SIMILARITY_MATRIX_FORMAT_01", file, CHECK);
7998 fprintf ( stderr, "\n");
// fast_aln2sim_list: pairwise scores between two groups of rows of A,
// returned as a list rather than a square matrix.
// When no groups are supplied (presumably ns==NULL) ns/ls are built here so
// that both groups contain every row; they are released at the end when
// free_ns is set. simm is declared with ns[0]*ns[1]+1 rows of 3 ints
// (presumably row index, row index, score -- TODO confirm).
// The formula is chosen by a substring of mode:
//   sim1 -> sim*MAXID/pos0            sim2 -> sim*MAXID/MIN(pos1,pos2)
//   sim3 -> sim*MAXID/MAX(pos1,pos2)  gap1 -> (len-gap)*MAXID/len
//   cov1 -> pos0*MAXID/len            logid -> logid_score(sim, len)
// Each ratio is 0 when its denominator is 0. Columns with p3==0 are skipped.
8004 int ** fast_aln2sim_list (Alignment *A, char *mode, int *ns, int **ls)
8007 int p1, p2, p3, r1, r2;
8008 int gap,pos0,pos1,pos2,len,sim;
8009 int a, b, c, m, s=0,s1, s2, n;
8015 ns=vcalloc (2, sizeof (int));
8016 ns[0]=ns[1]=A->nseq;
8017 ls=declare_int (2, A->nseq);
8018 for ( a=0; a< 2; a++)
8019 for (b=0; b<A->nseq; b++)
8024 simm=declare_int (ns[0]*ns[1]+1, 3);
8026 if (strstr (mode, "sim1"))m=0;
8027 else if (strstr (mode, "sim2"))m=1;
8028 else if (strstr (mode, "sim3"))m=2;
8029 else if (strstr (mode, "gap1"))m=3;
8030 else if (strstr (mode, "cov1"))m=4;
8031 else if (strstr (mode, "logid"))m=5;
8036 for (n=0,a=0; a<ns[0]; a++)
8039 for ( b=0; b<ns[1]; b++, n++)
8042 gap=pos0=pos1=pos2=len=sim=0;
8044 for ( c=0; c< A->len_aln; c++)
8046 r1=tolower (A->seq_al[s1][c]);
8047 r2=tolower (A->seq_al[s2][c]);
8051 if ( p3==0)continue;
8060 if (m==0)s=(pos0==0)?0:(sim*MAXID)/pos0; //sim1
8061 else if (m==1) s=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2
8062 else if (m==2) s=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3
8063 else if (m==3) s=(len==0) ?0:((len-gap)*MAXID)/len;//gap1
8064 else if (m==4) s=(len==0) ?0:((pos0)*MAXID)/len; //cov
8067 s=logid_score ( sim, len);
8075 if ( free_ns) {vfree(ns); free_int (ls, -1);}
// fast_aln2sim_mat: symmetric nseq x nseq score matrix computed directly from
// the columns of A for every pair of rows a<b.
// The formula is chosen by a substring of mode:
//   sim1 -> sim*MAXID/pos0            sim2 -> sim*MAXID/MIN(pos1,pos2)
//   sim3 -> sim*MAXID/MAX(pos1,pos2)  gap1 -> (len-gap)*MAXID/len
//   cov1 -> pos0*MAXID/len            logid -> logid_score(sim, len)
// Each ratio is 0 when its denominator is 0. Columns with p3==0 are skipped.
8080 int ** fast_aln2sim_mat (Alignment *A, char *mode)
8083 int p1, p2, p3, r1, r2;
8084 int gap,pos0,pos1,pos2,len,sim;
8087 simm=declare_int (A->nseq, A->nseq);
8091 if (strstr (mode, "sim1"))m=0;
8092 else if (strstr (mode, "sim2"))m=1;
8093 else if (strstr (mode, "sim3"))m=2;
8094 else if (strstr (mode, "gap1"))m=3;
8095 else if (strstr (mode, "cov1"))m=4;
8096 else if (strstr (mode, "logid"))m=5;
8101 for ( a=0; a< A->nseq-1; a++)
8104 for ( b=a+1; b< A->nseq; b++)
8106 gap=pos0=pos1=pos2=len=sim=0;
8108 for ( c=0; c< A->len_aln; c++)
8110 r1=tolower (A->seq_al[a][c]);
8111 r2=tolower (A->seq_al[b][c]);
8115 if ( p3==0)continue;
8124 if (m==0)simm[a][b]=simm[b][a]=(pos0==0)?0:(sim*MAXID)/pos0; //sim1
8125 else if (m==1) simm[a][b]=simm[b][a]=(MIN(pos1,pos2)==0)?0:(sim*MAXID)/MIN(pos1,pos2);//sim2
8126 else if (m==2) simm[a][b]=simm[b][a]=(MAX(pos1,pos2)==0)?0:(sim*MAXID)/MAX(pos1,pos2);//sim3
8127 else if (m==3) simm[a][b]=simm[b][a]=(len==0) ?0:((len-gap)*MAXID)/len;//gap1
8128 else if (m==4) simm[a][b]=simm[b][a]=(len==0) ?0:((pos0)*MAXID)/len; //cov
8132 //Inspired from Muscle +mafft 5
8133 simm[a][b]=simm[b][a]=logid_score ( sim, len);
// logid_score: log-scaled identity score.
//   sim : number of matching positions
//   len : number of compared positions
// With len==0 the result is 0.33*MAXID. Otherwise the identity fraction
// sim/len is mapped to 1.0 when it exceeds 0.9 and to -log10(1-fraction)
// below that; the mapped value is multiplied by MAXID.
// Parameter order is (sim, len); some callers in this file pass (pos0, sim).
8139 int logid_score ( int sim, int len)
8143 if ( len==0)return (int)(0.33*(float)MAXID);
8145 score=(float)sim/(float)len;
8146 if (score>0.9) score=1.0;
8147 else score=-log10 (1.0-score);
8149 score=(score*MAXID);
// aln2sim_mat: pairwise similarity matrix of A. Modes containing "idmat" use
// the column-based fast_aln2sim_mat; every other mode goes through
// get_sim_aln_array.
8152 int ** aln2sim_mat (Alignment *A, char*mode)
8156 if ( strstr (mode, "idmat"))return fast_aln2sim_mat(A, mode);
8157 return get_sim_aln_array(A, mode);
// aln2cov: symmetric pairwise coverage matrix of A.
// For each pair of rows a<b the columns are classified from the gap status of
// the two characters r1 and r2 (presumably the characters of rows a and b):
// pos0 counts columns where neither is a gap, gap counts columns where exactly
// one is a gap, and columns where both are gaps are ignored.
// cov[a][b]=cov[b][a]=pos0*100/(gap+pos0), or 0 when that sum is 0.
8159 int ** aln2cov (Alignment *A)
8162 int r1, r2, gr1, gr2, pos0, gap;
8164 cov=declare_int (A->nseq, A->nseq);
8166 for (a=0; a< A->nseq-1; a++)
8169 for ( b=a+1; b<A->nseq; b++)
8171 for (gap=0,pos0=0,c=0;c<A->len_aln; c++)
8175 gr1=is_gap(r1); gr2=is_gap(r2);
8176 if ( gr1+gr2==0)pos0++;
8177 else if ( gr1+gr2<2)gap++;
8179 cov[a][b]=cov[b][a]=((gap+pos0)==0)?0:((pos0*100)/(gap+pos0));
// get_raw_sim_aln_array: raw pairwise scores of A rescaled to the range 0..100.
// For modes containing "sar" no matrix is loaded (M=NULL); otherwise M is read
// with read_matrice(mode). For every pair a<=b the score is either the sum of
// M[r1-'A'][r2-'A'] over columns where neither character is a gap, or, for
// mode "sarmat2", get_sar_sim2 of the two rows; an unknown mode is fatal.
// min and max track the smallest and largest w[a][b]; the final loop rescales
// with (value-min)*100/(max-min), or 0 when max==min.
// NOTE(review): the scores are accumulated in w[a][b] (a<=b) while the
// rescaling reads and writes w[b][a]; unless the cells are mirrored by a
// statement not restated here, the rescaling works on the wrong triangle --
// confirm.
// NOTE(review): the r-'A' index assumes upper-case letters.
8184 int ** get_raw_sim_aln_array (Alignment *A, char *mode)
8188 int a, b, c, r1, r2, set, max, min;
8190 w=declare_int (A->nseq, A->nseq);
8191 if (strstr(mode, "sar"))M=NULL;
8192 else M=read_matrice (mode);
8196 for ( set=0,a=0; a< A->nseq; a++)
8197 for (b=a; b<A->nseq; b++)
8201 for (c=0; c<A->len_aln; c++)
8206 if ( !is_gap(r1) && !is_gap(r2))
8207 w[a][b]+=M[r1-'A'][r2-'A'];
8210 else if ( strm (mode, "sarmat2"))
8212 w[a][b]=get_sar_sim2 (A->seq_al[a], A->seq_al[b]);
8216 HERE ("ERROR: %s is an unknown mode of raw_sim\n", mode); myexit (EXIT_FAILURE);
8220 if (!set){min=max=w[a][b];set=1;}
8221 min=MIN(min,w[a][b]);
8222 max=MAX(max,w[a][b]);
8224 for (a=0; a<A->nseq; a++)
8225 for (b=a; b<A->nseq; b++)
8227 w[b][a]=((max-min)==0)?0:((w[b][a]-min)*100)/(max-min);
// get_sim_aln_array: symmetric nseq x nseq similarity matrix of A.
// Every pair a<b is scored with generic_get_seq_sim under `mode`; the cDNA
// cache A->cdna_cache[0] is forwarded when A->cdna_cache is set, else NULL.
// Diagonal cells are left as declare_int returned them.
8233 int ** get_sim_aln_array ( Alignment *A, char *mode)
8239 w=declare_int ( A->nseq, A->nseq);
8241 for ( a=0; a< A->nseq-1; a++)
8243 for ( b=a+1; b< A->nseq; b++)
8246 w[a][b]=w[b][a]=generic_get_seq_sim ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode);
// generic_get_seq_sim: dispatch a pairwise similarity request on `mode`.
//   mode equals "cdna"      -> get_cdna_seq_sim (uses cache, ignores GAP_LIST)
//   mode starts with "ktup" -> ktup_comparison, k read with atoi(mode+4)
//   mode contains "sarmat2" -> get_sar_sim2
//   mode contains "sarmat"  -> get_sar_sim, converted to int
//   anything else           -> get_seq_sim (ignores GAP_LIST)
// "sarmat2" is tested before "sarmat" because the former contains the latter.
8251 int generic_get_seq_sim ( char *seq1, char *seq2, int*cache, char *mode)
8255 if ( strm (mode, "cdna"))
8256 return get_cdna_seq_sim ( cache, seq1, seq2,GAP_LIST, mode);
8257 else if ( strnm (mode, "ktup",4))
8258 return ktup_comparison (seq1, seq2,atoi(mode+4));
8259 else if ( strstr (mode, "sarmat2"))
8262 return get_sar_sim2 (seq1, seq2);
8264 else if ( strstr (mode, "sarmat"))
8265 return (int) get_sar_sim (seq1,seq2);
8268 return get_seq_sim ( seq1,seq2,GAP_LIST, mode);
// get_winsim_aln_array: fill w[a][b] with the per-position (windowed)
// similarity of rows a and b for every ordered pair, including a==b.
// Mode "cdna" uses get_cdna_seq_winsim with A->cdna_cache[0]; every other mode
// uses get_seq_winsim. Each call receives the existing w[a][b] as its output
// buffer and its return value is stored back into w[a][b].
// NOTE(review): get_seq_winsim returns 0 when the two strings differ in
// length, which would overwrite w[a][b] with a null pointer.
8271 int *** get_winsim_aln_array ( Alignment *A,char *mode, int ***w)
8274 for ( a=0; a< A->nseq; a++)
8275 for ( b=0; b< A->nseq; b++)
8277 if ( strm (mode, "cdna"))
8278 w[a][b]=get_cdna_seq_winsim ( A->cdna_cache[0], A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]);
8280 w[a][b]=get_seq_winsim ( A->seq_al[a], A->seq_al[b],GAP_LIST, mode, w[a][b]);
// seq2profile: return the R-template profile of sequence i of S.
// If seq2R_template_profile already yields a profile, that one is used
// (presumably returned directly). Otherwise the sequence is written alone, in
// FASTA form, to the file `tmp`, installed as the R template of S->T[i] with
// fill_R_template, and the profile is looked up again.
8285 Alignment * seq2profile (Sequence *S, int i)
8289 if ((A=seq2R_template_profile (S, i)))
8298 fp=vfopen ( tmp, "w");
8299 fprintf (fp, ">%s\n%s\n", S->name[i], S->seq[i]);
8302 (S->T[i])->R=fill_R_template (S->name[i], tmp, S);
8304 return seq2R_template_profile (S, i);
// remove_seq_from_aln: drop from A, in place, every row whose name equals
// `seq`. Surviving rows are copied down to index n (name, sequence, both
// comment strings when present, and the two order[] fields); n is presumably
// incremented per kept row and used as the new nseq.
// NOTE(review): when n==a each sprintf copies a string onto itself;
// overlapping source and destination is undefined for sprintf -- confirm
// whether a guard exists in the statements not restated here.
8307 Alignment* remove_seq_from_aln (Alignment *A, char *seq)
8310 for (n=0,a=0; a<A->nseq; a++)
8312 if ( strm (seq, A->name[a]))continue;
8316 sprintf (A->name[n], "%s",A->name[a]);
8317 sprintf (A->seq_al[n], "%s",A->seq_al[a]);
8318 if (A->seq_comment[a])sprintf (A->seq_comment[n], "%s", A->seq_comment[a]);
8319 if (A->aln_comment[a])sprintf (A->aln_comment[n], "%s", A->aln_comment[a]);
8320 A->order[n][0]=A->order[a][0];
8321 A->order[n][1]=A->order[a][1];
// aln2sub_aln_file: write groups of sequences of A to separate FASTA files.
// The groups come either from the n strings of `string`, joined with spaces
// and parsed by string2list into a single group, or from the group file
// string[0] read with read_group; a missing file is fatal.
// For each group, list[a][0] holds the entry count (read with atoi) and
// list[a][1] the output file name; the remaining entries are sequence names
// that are looked up in A and written as ">name\nsequence\n".
// NOTE(review): when n==3 the sequence is ungapped inside A itself (ungap on
// A->seq_al[i]), so the caller's alignment is modified.
// NOTE(review): the lookup result i is used without a visible check for -1.
// NOTE(review): `list` is sized with sizeof(char***) per element; the element
// type is presumably char** -- same size in practice, but confirm.
8330 Alignment* aln2sub_aln_file (Alignment *A, int n, char **string)
8335 list=vcalloc (A->nseq, sizeof (char***));
8342 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
8343 buf=vcalloc ( 2*n+l+1, sizeof (char));
8344 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
8345 list[0]=string2list (buf);
8348 else if ( file_exists (NULL,string[0]))
8350 list=read_group (string[0]);
8355 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
8356 myexit (EXIT_FAILURE);
8365 n=atoi (list[a][0]);
8366 fp=vfopen (list[a][1], "w");
8369 i=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
8370 if (n==3)ungap (A->seq_al[i]);
8371 fprintf (fp, ">%s\n%s\n", A->name[i], A->seq_al[i]);
8374 free_char (list[a], -1);
// remove_empty_sequence: return a copy of S without residue-free sequences.
// Each sequence is copied into the scratch buffer c (S->max_len+1 chars) for
// inspection (presumably ungapped there before its length is tested). A
// warning naming the sequence is issued for every removed entry. The
// filtered set is duplicated into NS and the original S is released with
// free_sequence, so the caller must not use S afterwards.
8380 Sequence *remove_empty_sequence (Sequence *S)
8386 c=vcalloc ( S->max_len+1, sizeof (char));
8388 for (a=0, b=0; a< S->nseq; a++)
8390 sprintf ( c, "%s",S->seq[a]);
8394 //vfree (S->seq[a]);
8396 add_warning ( stderr, "WARNING: Sequence %s does not contain any residue: automatically removed from the set [WARNING:%s]",S->name[a], PROGRAM);
8399 NS=duplicate_sequence (S);
8400 free_sequence (S, S->nseq);
// aln2sub_seq: pick one representative sequence per group and return them as
// an unaligned (gap-free) alignment.
// Groups are parsed exactly as in aln2sub_aln_file (joined strings, or the
// group file string[0]; a missing file is fatal). For each group the file
// list[a][1] is read as an alignment B, the representative is the row chosen
// by aln2most_similar_sequence(B, "idmat"), it is extracted with
// extract_one_seq and appended to S; the group file is then deleted with
// vremove. The result is seq2aln(S, NULL, RM_GAP).
// The files list[a][1] are presumably produced beforehand by
// aln2sub_aln_file -- TODO confirm.
8404 Alignment* aln2sub_seq (Alignment *A, int n, char **string)
8410 list=vcalloc (A->nseq, sizeof (char***));
8417 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
8418 buf=vcalloc ( 2*n+l+1, sizeof (char));
8419 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
8420 list[0]=string2list (buf);
8423 else if ( file_exists (NULL,string[0]))
8425 list=read_group (string[0]);
8430 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
8431 myexit (EXIT_FAILURE);
8444 B=main_read_aln (list[a][1], NULL);
8445 t=aln2most_similar_sequence(B, "idmat");
8446 subS=extract_one_seq(B->name[t],0,0,B,KEEP_NAME);
8447 S=add_sequence (subS,S,0);
8448 free_aln (B);free_sequence (subS, -1);
8449 vremove (list[a][1]);
// aln2collapsed_aln: replace each group of sequences of A by one consensus row.
// Groups come from the n strings of `string` (joined with spaces, one group)
// or from the file string[0], whose format is described in the original
// comment below; a missing file is fatal.
// Steps visible in this function:
//   - mark in collapsed[] every sequence that belongs to some group;
//   - add a single-member group ("3", name, name) for every unmarked sequence;
//   - extract one row per group (the first member, list[a][2]) into B;
//   - rename each row of B to the group name list[a][1];
//   - overwrite each row of B with the consensus of the group members computed
//     by sub_aln2cons_seq_mat2 with "blosum62mt".
// NOTE(review): `list` holds A->nseq group slots, but list[ns++] is written
// without a visible bound check.
// NOTE(review): when reading the file, fgetc consumes the first character of
// each line before fgets reads the rest, and fgets is given l-1 rather than
// the buffer size l.
8456 Alignment * aln2collapsed_aln (Alignment * A, int n, char **string)
8463 int a, b,c, ns, m, l;
8466 list=vcalloc (A->nseq, sizeof (char***));
8471 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
8472 buf=vcalloc ( 2*n+l+1, sizeof (char));
8473 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
8475 list[0]=string2list (buf);ns=1;
8478 else if ( file_exists (NULL,string[0]))
8480 /*Format: Fasta like, the name fo the group followed with the name of the sequences
8481 ><Group name> <First Seq> <second seq> ....
8482 Groups must NOT be overlaping
8484 l=measure_longest_line_in_file (string[0])+1;
8485 buf=vcalloc (l, sizeof (char));
8487 fp=vfopen (string[0], "r");
8488 while ((c=fgetc(fp))!=EOF)
8490 buf=fgets (buf,l-1, fp);
8491 if ( c=='>')list[ns++]=string2list (buf);
8497 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
8498 myexit (EXIT_FAILURE);
8501 vfree (buf); buf=NULL;
8503 /*Identify lost sequences*/
8504 collapsed=vcalloc (A->nseq, sizeof (int));
8505 for ( a=0; a< ns; a++)
8507 m=atoi (list[a][0]);
8508 for (b=2; b<m ; b++)
8510 c=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
8511 if ( c>=0)collapsed[c]=1;
8514 for ( a=0; a< A->nseq; a++)
8516 if ( collapsed[a]==0)
8518 list[ns]=declare_char (3, MAXNAMES);
8519 sprintf ( list[ns][0], "3");
8520 sprintf ( list[ns][1], "%s", A->name[a]);
8521 sprintf ( list[ns][2], "%s", A->name[a]);
8531 list2=declare_char (A->nseq, 100);
8532 /*1 Collapse the alignment*/
8533 for ( a=0; a< ns; a++)
8535 sprintf ( list2[a], "%s", list[a][2]);
8537 B=extract_sub_aln2 ( A, ns, list2);
8538 /*2 Rename the sequences*/
8539 for ( a=0; a< ns; a++)
8541 sprintf ( B->name[a], "%s", list[a][1]);
8543 /*replace sequence with consensus*/
8545 for ( a=0; a< ns; a++)
8547 m=atoi (list[a][0]);
8548 for (c=0, b=2; b<m;c++, b++)
8550 sprintf ( list2[c], "%s", list[a][b]);
8552 buf=sub_aln2cons_seq_mat2 ( A,m-2,list2, "blosum62mt");
8553 sprintf (B->seq_al[a], "%s", buf);
// aln2profile: build the profile A->P of alignment A.
// A profile is declared for AA_ALPHABET and A->len_aln+1 columns; its three
// count tables are freed and replaced by:
//   count  = aln2count_mat(A)
//   count2 = aln2count_mat2(A)
//   count3 = aln2count_mat2(B), where row 0 of B holds the consensus of A
//            computed with "blosum62mt" (B is created by statements not
//            restated here).
8561 Alignment * aln2profile (Alignment * A)
8568 A->P=declare_profile (AA_ALPHABET,A->len_aln+1);
8571 free_int ((A->P)->count, -1);
8572 free_int ((A->P)->count2, -1);
8573 free_int ((A->P)->count3, -1);
8574 (A->P)->count=aln2count_mat (A);
8575 (A->P)->count2=aln2count_mat2 (A);
8577 cons=aln2cons_seq_mat (A, "blosum62mt");
8579 sprintf (B->seq_al[0], "%s", cons);
8581 (A->P)->count3=aln2count_mat2 (B);
// aln2count_mat2: column count table for A; calls sub_aln2count_mat2 with
// ns=0 and ls=NULL.
8591 int** aln2count_mat2 ( Alignment *A)
8593 return sub_aln2count_mat2 (A, 0, NULL);
// sub_aln2nseq_prf: count the rows contributed by a subset of A.
// For each selected row s, seq2R_template_profile is queried on
// A->order[s][0] when A->S is set; a non-NULL result R is handled separately
// from plain rows (the counting statements are not restated here).
// When no list is supplied (presumably ls==NULL) a list covering all rows is
// allocated here and released at the end through free_ls.
// NOTE(review): the default list is allocated with n entries but filled for
// A->nseq entries; this is safe only if n==A->nseq at that point -- confirm.
8596 int sub_aln2nseq_prf ( Alignment *A, int ns, int *ls)
8609 ls=vcalloc (n, sizeof (int));
8610 for ( a=0; a<A->nseq; a++)ls[a]=a;
8618 for (c=0,a=0; a<ns; a++)
8621 if ( A->S && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL)
8631 if ( free_ls) vfree (ls);
// sub_aln2count_mat2: column count table for a subset of rows of A.
// A flat array p of row pointers is assembled: for each selected row s, if
// A->S is set and an R-template profile R exists for A->order[s][0], p is
// regrown and the rows of R are appended; A->seq_al[s] is appended by the
// other p[c++] statement (presumably the else branch). The table is then
// computed by sub_aln2count_mat3(p, c).
// When no list is supplied (presumably ls==NULL) a list covering all rows is
// allocated here and released at the end through free_ls.
// NOTE(review): the default list is allocated with n entries but filled for
// A->nseq entries -- confirm n==A->nseq at that point.
// NOTE(review): release of p is not visible in this function.
8635 int** sub_aln2count_mat2 ( Alignment *A, int ns, int *ls)
8647 p=vcalloc ( n, sizeof (char*));
8648 ls=vcalloc (n, sizeof (int));
8649 for ( a=0; a<A->nseq; a++)ls[a]=a;
8655 p=vcalloc (n, sizeof (char*));
8658 for (c=0,a=0; a<ns; a++)
8661 if ( A->S && (R=seq2R_template_profile (A->S, A->order[s][0]))!=NULL)
8664 p=vrealloc (p, n*sizeof (char*));
8665 for (b=0; b<R->nseq; b++)
8667 p[c++]=R->seq_al[b];
8676 p[c++]=A->seq_al[s];
8679 count=sub_aln2count_mat3 (p,c);
8681 if ( free_ls) vfree (ls);
// sub_aln2count_mat3: per-column symbol counts for ns aligned strings.
// The layout of each row of `count` is described in the original comment
// below. count is declared with len+2 rows and 100+ns+2 ints per row, and
// count[len][0] is set to END_ARRAY as a terminator.
// `used` maps a lower-cased character to its slot in the current column and
// is cleared again after each column through the loop over b=3..count[a][1].
// Frequencies are count*100/us, where us starts at ns for each column
// (presumably decremented for gaps -- TODO confirm).
// NOTE(review): the "multi channeling" statements and the five-column dump
// refer to A, which is not a parameter of this function, so they are
// presumably inside a disabled region -- confirm.
// NOTE(review): us==0 (an all-gap column) would divide by zero unless guarded
// by a statement not restated here.
8684 int** sub_aln2count_mat3 (char **al, int ns)
8695 /*count[x][0]=n symbols in column
8696 count[x][1]=total_size of line
8697 count[x][2]=Gap frequency
8699 count[x][n]=symbol n
8700 count[x][n+1]=N occurence symbol n;
8701 count[x][n+2]=N frequence symbol n*100;
8703 special multi-channeling
8704 count[x][count[x][1]]=Nseq
8705 count[x][count[x][1]+s]=residue col x, sequence s
8709 for (a=0; a< 1000; a++)used[a]=0;
8712 count=declare_int (len+2,100+ns+2);
8713 count[len][0]=END_ARRAY;
8719 for (a=0; a<len; a++)
8721 for (us=ns, b=0; b<ns; b++)
8723 r=tolower (al[b][a]);
8728 count[a][used[r]*3+1]++;
8732 used[r]=++count[a][0];
8733 count[a][used[r]*3]=r;
8734 count[a][used[r]*3+1]++;
8737 count[a][1]=count[a][0]*3+2;
8738 /*count[a][2]=(A->nseq-us)*100/A->nseq;*/
8741 for (b=3; b<count[a][1]; b+=3)
8743 count[a][b+2]=(count[a][b+1]*100)/us;
8744 used[count[a][b]]=0;
8748 /*Option for multi channeling*/
8751 count[a][count[a][1]]=A->nseq;
8752 for (b=1; b<=A->nseq; b++)
8753 count [a][count[a][1]+b]=(is_gap(A->seq_al[b-1][a]))?0:A->seq_al[b-1][a];
8758 for (a=0; a< 5; a++)
8760 fprintf ( stderr, "\n");
8761 for ( b=3; b< count[a][1]; b+=3)
8763 fprintf ( stderr, "[%c %d]", count[a][b], count[a][b+1]);
8765 fprintf ( stderr, "\n");
8766 for ( b=0; b<ns; b++)
8768 fprintf ( stderr, "%c", al[b][a]);
8771 HERE ("End of Display");
// aln2count_mat: residue counts per column of A.
// freq_mat has alp_size+2 rows and A->len_aln columns, with
// alp_size=sizeof(AA_ALPHABET):
//   freq_mat[x-'a'][col]      occurrences of lower-cased residue x
//   freq_mat[alp_size][col]   number of gaps in the column
//   freq_mat[alp_size+1][col] number of residues in the column
// NOTE(review): rows are indexed by x-'a' (0..25 for letters), so the table
// is only large enough if alp_size+2 > 25; sizeof also counts the trailing
// NUL if AA_ALPHABET is a string literal -- confirm the intended row count.
8776 int** aln2count_mat ( Alignment *A)
8778 function documentation: start
8780 int output_freq_mat ( char *outfile, Aligmnent *A)
8782 This function counts the number of residues in each column of an alignment (Prot/NA)
8783 It outputs these values in the following format
8785 This format can be piped into:
8786 The routine used for computing the p-value gmat-inf-gc-v2c
8788 function documentation: end
8795 alp_size=sizeof (AA_ALPHABET);
8796 freq_mat=declare_int (alp_size+2, A->len_aln);
8799 for ( a=0; a<A->len_aln; a++)
8801 for ( b=0; b< A->nseq; b++)
8803 if ( is_gap ( A->seq_al[b][a]))freq_mat[alp_size][a]++;
8806 x=tolower(A->seq_al[b][a]);
8807 freq_mat[x-'a'][a]++;
8808 freq_mat[alp_size+1][a]++;
/*
 * aln2random_seq: builds one new sequence of length A->len_aln from the column
 * counts of alignment A, then decides where gaps go with a two-state
 * (IAA / IGAP) dynamic programme.
 *
 * pn1 and pn3 are described in the embedded notes below; gn is subtracted from
 * AL to form GAP_TRANSITION, the score applied when switching between the IAA
 * and IGAP states.
 *
 * NOTE(review): many lines (braces, declarations, the return) are missing from
 * this view; the comments describe only the statements that are visible.
 */
8816 char *aln2random_seq (Alignment *A, int pn1, int pn2, int pn3, int gn)
8822 Given the frequencies in A ( read as total counts of each Residue in
8823 freq[A->nseq][A->len_aln], and pn1, pn2 and pn3:
8825 1-Generate a new amino-acid at each position
8826 2-Insert Gaps, using a HMM.
8829 pn3=Weight of the noise induced with sub mat.
8831 pn1=% noise type 1 ( Varies with entropi)
8832 n1=Ratio noise type 1
8835 t1=Noise 1 expressed in Nseq
8837 ncat=number of non 0 cat for a given position
8838 ICi initial count for residue i
8844 Ci= ICi*(T-(t1+t2))/T +(t1)/al+(t2)/al
8850 float T, tot_t1, tot_t2,tot_t3, n1, n2, n3;
8856 double *t1, *t2,*t3;
8860 /*Viterbi Parameters */
8863 int AL=0; /*Allowed Transition*/
8864 int F=-100000; /*Forbiden Transition*/
8869 int state,best_state=0, score, best_score=0;
/* seq receives the generated sequence; it is allocated with one byte more than the alignment length. */
8879 seq=vcalloc ( A->len_aln+1, sizeof (char));
/* count keeps the raw column counts; freq is a second copy that is rescaled and modified below. */
8880 count=aln2count_mat(A);
8881 freq=aln2count_mat(A);
/* Rows 0..25 are rescaled by T over (nseq minus row 26); row 26 itself is rescaled by T over nseq. */
/* NOTE(review): row 26 is presumably the gap row produced by aln2count_mat - confirm. */
8889 for ( a=0; a< A->len_aln; a++)
8891 for ( b=0; b<26; b++)
8892 freq[b][a]=freq[b][a]*((T)/(A->nseq-freq[26][a]));
8893 freq[26][a]= (freq[26][a]*T)/A->nseq;
8897 init_freq=vcalloc ( 26, sizeof (double));
/* NOTE(review): blur_freq is reassigned from compute_matrix_p further down, so this allocation is not used by any visible statement. */
8898 blur_freq=vcalloc ( 26, sizeof (double));
8900 tot_t1=tot_t2=tot_t3=0;
8902 t1=vcalloc ( 27, sizeof (double));
8903 t2=vcalloc ( 27, sizeof (double));
8904 t3=vcalloc ( 27, sizeof (double));
/* Per-column mutation of the frequencies with the three noise terms t1, t2 and t3. */
8905 for (a=0; a< A->len_aln; a++)
8908 /*Compute Frequencies*/
8909 for (tot=0, b=0; b<26; b++)
8913 init_freq[b]=freq[b][a];
8917 /*Count the number of different amino acids*/
8918 for ( ncat=0, b=0; b<=26; b++)
8920 ncat+=(freq[b][a]!=0)?1:0;
8922 /*Blurr the distribution using */
8923 blur_freq=compute_matrix_p (init_freq,tot);
8926 /*compute noise 1: biased with blurred content * enthropy--> keeps prosite motifs*/
/* NOTE(review): if ncat is an int, 1/ncat is an integer division (0 for ncat>1, division by zero for ncat==0) - confirm intent. */
8927 tot_t1=T*n1*(1-1/ncat);
8928 for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t1[b]=blur_freq[b]*(1-1/ncat)*n1;}
8930 /*Compute noise 2: completely random*/
8932 for ( b=0; b< 26; b++)if ( is_aa(b+'A')){t2[b]=tot_t2/21;}
8934 /*compute noise 3: biased with the sole content(pam250mt)*/
8936 for ( b=0; b<26; b++)if ( is_aa(b+'A')){t3[b]=blur_freq[b]*n3;}
/* The original count is scaled down by the total noise and the three noise terms are added back. */
8938 for ( b=0; b<26; b++)
8941 freq[b][a]=freq[b][a]*(T-(tot_t1+tot_t2+(tot_t3)))/T+t1[b]+t2[b]+t3[b];
8944 /*end of the loop that mutates position a*/
8951 /*1-Generate the amino acids of the new sequence new*/
8956 for ( a=0; a< A->len_aln; a++)
/* T is reused here as the sum of the 26 letter frequencies of the column. */
8959 for (T=0,b=0; b<26; b++)T+=freq[b][a];
8961 for (c=0,b=0; b<26; b++)
8971 if ( c!=-1)seq[a]='-';
8976 /*2 Generate the gaps in the new sequence*/
/* Tables of the dynamic programme: one row per position (plus two spare rows), one column per state. */
8984 transitions=declare_int ( nstate, nstate);
8985 score_tab=declare_int ( A->len_aln+2, nstate );
8986 state_tab=declare_int ( A->len_aln+2, nstate );
/* Every transition starts as forbidden (F); the four IGAP/IAA transitions are then opened. */
8990 for (a=0; a<nstate;a++)
8991 for (b=0; b<nstate;b++)
8992 {transitions[a][b]=F;}
8994 GAP_TRANSITION=AL-gn;
8996 transitions[IGAP ][IGAP ]=AL;
8997 transitions[IAA][IAA]=AL;
8998 transitions[IAA ][IGAP]=GAP_TRANSITION;
8999 transitions[IGAP][IAA ]=GAP_TRANSITION;
9002 for ( p=1; p<=A->len_aln; p++){for (state=0; state< nstate; state++){score_tab[p][state]=F;state_tab[p][state]=-1;} }
/* Forward pass: for each position and each state, keep the best-scoring predecessor state. */
9004 for (p=1; p<= A->len_aln; p++)
9006 for (max=0,a=0; a<26; a++)max=MAX(max, freq[a][p-1]);
9007 max=(max*(A->nseq-count[26][p-1]))/A->nseq;
9009 for (state=0; state< nstate; state++)
9014 if ( state==IGAP) e=gf-50;
9015 else if ( state==IAA ) e=max-50;
9016 for (p_state=0; p_state<nstate; p_state++)
9018 score=(score_tab[p-1][p_state]==F)?F:(e+transitions[p_state][state]+score_tab[p-1][p_state]);
9019 if(p_state==0 || score>best_score){ best_score=score;best_state=p_state;}
9021 score_tab[p][state]=best_score;
9022 state_tab[p][state]=best_state;
/* Pick the best final state, then trace back through state_tab. */
9026 for (state=0; state<nstate; state++)
9028 if (state==0 || score_tab[p-1][state]>best_score){best_score=score_tab[p-1][state]; best_state=state;}
9031 for (p=A->len_aln; p>0;)
9033 if ( best_state==IGAP)
9037 else if ( best_state==IAA)
9041 best_state=state_tab[p][best_state];
/* NOTE(review): freq is the only buffer released by a visible statement. */
9046 free_int (freq, -1);
9050 /********************************************************************/
9052 /* Weighting functions */
9056 /********************************************************************/
/*
 * master_trimseq: trims the sequence set S (optionally using alignment A)
 * relative to one or more master sequences, then reports the result on stderr.
 *
 * mode is a string of tokens separated by underscores:
 *   seq / aln   use pairwise or multiple alignment (use_aln)
 *   s           statistics,  t  table of distances
 *   U<n>, L<n>  upper and lower similarity bounds
 *   n<n>        number of sequences,  N<n>  the same as a percentage
 *               (stored negated, converted to a count further down)
 *   B / T       trim direction,  W<name> weight mode,  M<name> method
 * A second series of tokens separated by colons designates the master
 * sequences (seq_list entries set to 2), by index or by name.
 * When no bound and no sequence count is given, upper_sim defaults to 50.
 *
 * The selection itself is delegated to seq2subseq3.
 *
 * NOTE(review): this view of the file is missing lines (braces, declarations,
 * the return); the comments describe only the visible statements.
 */
9057 Alignment * master_trimseq( Alignment *A, Sequence *S,char *mode)
9062 int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0;
9063 float f_upper_sim, f_lower_sim;
9064 char weight_mode[1000];
9067 int trim_direction=TOP;
9077 (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
9082 seq_list=vcalloc ( S->nseq, sizeof (int));
9083 for ( a=0; a< A->nseq; a++)
9089 use_aln=aln_is_aligned(A);
9097 sprintf (weight_mode, "pwsim");
9098 sprintf ( method, "clustering2");
9103 upper_sim=lower_sim=min_nseq;
9104 sprintf (weight_mode, "pwsim");
9105 sprintf ( method, "clustering2");
9109 U or % (deprecated) Upper bound for pairwise similarity
9110 L or m (depercated) Lower bound for pairwise similarity
9111 n max number of sequences
9112 N max number of sequences as a fraction of thet total
9114 T print Table of distances
/* NOTE(review): strtok only advances if mode is reset to NULL inside the loop; that reset is not visible in this view. */
9119 while ( (p=strtok(mode, "_")))
9122 if (strm (p, "seq"))use_aln=0;
9123 else if ( strm(p,"aln"))use_aln=1;
9124 else if (p[0]=='s')statistics=1;
9125 else if (p[0]=='t')table=1;
9126 else if (p[0]=='U')upper_sim=atoi(p+1);
9127 else if (p[0]=='L')lower_sim=atoi(p+1);
9128 else if (p[0]=='n')min_nseq=atoi(p+1);
9129 else if (p[0]=='N')min_nseq=atoi(p+1)*-1;
9130 else if (p[0]=='B')trim_direction=BOTTOM;
9131 else if (p[0]=='T')trim_direction=TOP;
9132 else if (p[0]=='W')sprintf (weight_mode, "%s", p+1);
9133 else if (p[0]=='M')sprintf (method, "%s", p+1);
/* Master sequences: either a 1-based index or a sequence name. */
9137 while ((p=strtok(NULL, ":")))
/* NOTE(review): the index read by atoi is not range-checked by any visible statement. */
9142 seq_list[atoi(p+1)-1]=2;
9144 else if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
9153 if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50;
9159 fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM);
9163 else if ( min_nseq> S->nseq)
9167 else if ( min_nseq<0)
9171 add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n");
/* A negative min_nseq encodes a percentage of S->nseq; turn it into a count. */
9175 min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1);
9179 NA=seq2subseq3 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list );
9183 fprintf ( stderr, "\nSIMILARITY MATRIX\n");
9184 for ( a=0; a< A->nseq-1; a++)
9185 for ( b=a+1; b< A->nseq; b++)
9187 fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]);
/* Bounds above 100 are displayed divided by 100, as seq2subseq3 interprets them. */
9192 f_upper_sim=(upper_sim>100)?((float)upper_sim/(float)100):upper_sim;
/* Fix: the lower bound is now selected on lower_sim itself (it used to test upper_sim), matching trimseq and seq2subseq3. */
9193 f_lower_sim=(lower_sim>100)?((float)lower_sim/(float)100):lower_sim;
9195 fprintf ( stderr, "\nTRIM Informations:\n");
9196 fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
9197 fprintf ( stderr, "\tcluster_mode..: %s\n" ,method);
9198 fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode);
9199 fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim);
9200 fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim);
9201 fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq);
9202 fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq);
9203 fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top");
/*
 * sim_filter: keeps the sequences of A whose similarity and coverage with a
 * reference sequence (index s, looked up from the seq argument by name) lie
 * within bounds read from in_mode.
 *
 * Bounds read from the mode string (wrapped in underscores first):
 *   _I  maxsim (default 100)    _i  minsim (default 0)
 *   _C  maxcov (default 100)    _c  mincov (default 0)
 * When the mode contains _seq_, similarity and coverage come from
 * idscore_pairseq with the blosum62mt matrix; otherwise sim comes from
 * aln2sim_mat with idmat.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the return);
 * the comments describe only the visible statements.
 */
9209 Alignment *sim_filter (Alignment *A, char *in_mode, char *seq)
9214 int maxnseq, nseq_ratio, nc;
9220 int direction=1;//remove the higher than
9221 int coverage=0; //remove based on coverage
9223 int maxsim, minsim, maxcov, mincov;
9225 if ( !field) field=vcalloc (1000, sizeof (char));
/* The mode is wrapped in underscores so that every option can be searched as _X. */
9227 mode=vcalloc ( strlen (in_mode)+10, sizeof (char));
9228 sprintf ( mode, "_%s_", in_mode);
9230 strget_param ( mode, "_I", "100", "%d", &maxsim);
9231 strget_param ( mode, "_i", "0", "%d", &minsim);
9232 strget_param ( mode, "_C", "100", "%d", &maxcov);
9233 strget_param ( mode, "_c", "0", "%d", &mincov);
9239 keep=vcalloc ( A->nseq, sizeof (int));
9240 list=vcalloc ( A->nseq, sizeof (int));
9248 else s=name_is_in_list (seq, A->name, A->nseq, 100);
9252 if ( s==-1)printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is not a valid sequence", seq);
9258 if ( strstr (mode, "_seq_"))
9263 M=read_matrice ("blosum62mt");
/* NOTE(review): seq is used as a table of strings from here on; presumably a local declaration that is not visible shadows the parameter - confirm. */
9264 seq=declare_char (A->nseq, A->len_aln+1);
9265 for (a=0; a<A->nseq; a++)
9267 sprintf ( seq[a], "%s", A->seq_al[a]);
9271 sim=declare_int (A->nseq, A->nseq);
9272 cov=declare_int (A->nseq, A->nseq);
/* Only the row and column of the reference sequence s are filled. */
9274 for (a=0; a<A->nseq; a++)
9278 sim[s][a]=sim[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"sim");
9279 cov[s][a]=cov[a][s]=idscore_pairseq(seq[s], seq[a],-12, -1,M,"cov");
9283 free_char (seq, -1);
9288 sim=aln2sim_mat (A, "idmat");
9292 for (a=0; a< A->nseq; a++)
/* A sequence outside any of the four bounds is flagged with -1. */
9297 if ( sim[s][a]>maxsim || sim[s][a]<minsim|| cov[s][a]<mincov||cov[s][a]>maxcov)keep[a]=-1;
9302 for ( n=0, a=0; a< A->nseq; a++)
9310 R=extract_sub_aln (A, n, list);
/* NOTE(review): keep and mode are not released by any visible statement. */
9311 free_int (sim, -1); free_int (cov, -1);vfree (list);
9317 static int find_worst_seq ( int **sim, int n, int *keep, int max, int direction);
/*
 * simple_trimseq: removes sequences from A one at a time (find_worst_seq)
 * until the similarity and sequence-count criteria read from in_mode are met,
 * then extracts the remaining ones with extract_sub_aln.
 *
 * Options read from the mode string (wrapped in underscores first):
 *   _%  maxsim       _n  maxnseq      _N  nseq_ratio (percentage of A->nseq)
 *   _F  nc (number of edge columns that must contain a residue)
 *   _O  outlayers    _K  KeepN        _f  field (default NAME)
 *   _P_ print        _min  direction=-1   _cov  coverage=1   _T  reorder by tree
 * Sequences named in alignment K, matched by seq_list in the selected field,
 * or among the first KeepN are forced to stay (keep set to 1).
 *
 * NOTE(review): this view is missing lines (braces, declarations, the return);
 * the comments describe only the visible statements.
 */
9318 Alignment *simple_trimseq (Alignment *A, Alignment *K, char *in_mode, char *seq_list, int **sim)
9322 int maxnseq, maxsim, nseq_ratio, nc;
9328 int direction=1;//remove the higher than
9329 int coverage=0; //remove based on coverage
9335 if ( !field) field=vcalloc (1000, sizeof (char));
9337 mode=vcalloc ( strlen (in_mode)+10, sizeof (char));
9338 sprintf ( mode, "_%s_", in_mode);
9340 strget_param ( mode, "_%%", "0", "%d", &maxsim);
9341 strget_param ( mode, "_n", "0", "%d", &maxnseq);
9342 strget_param ( mode, "_N", "0", "%d", &nseq_ratio);
9343 strget_param ( mode, "_F", "0", "%d", &nc);
9344 strget_param ( mode, "_O", "0", "%d", &outlayers);
9345 strget_param ( mode, "_K", "0", "%d", &KeepN);
9347 strget_param ( mode, "_f", "NAME", "%s", field);
9349 if ( strstr (mode, "_P_"))Print=1;
9351 if ( strstr (mode, "_min"))direction=-1;
9354 if ( strstr (mode, "_cov"))coverage=1;
9360 maxnseq=(A->nseq*nseq_ratio)/100;
9373 keep=vcalloc ( A->nseq, sizeof (int));
9374 list=vcalloc ( A->nseq, sizeof (int));
9379 /*Remove Sequences that do not have at least one residue in the first and last nc columns*/
9382 int left, right, full_n,x, y;
9387 full_list=vcalloc ( A->nseq, sizeof (int));
9389 for (x=0; x< A->nseq; x++)
9391 for ( left=0,y=0; y<MIN(A->len_aln,nc); y++)
9392 if (!is_gap(A->seq_al[x][y]))left=1;
9394 for ( right=0,y=MAX(0,(A->len_aln-nc)); y<A->len_aln; y++)
9395 if (!is_gap(A->seq_al[x][y]))right=1;
/* NOTE(review): the initialisation of full_n is not visible in this view - confirm it is zeroed before this loop. */
9397 if ( left && right)full_list[full_n++]=x;
9399 F=extract_sub_aln (A, full_n, full_list);
9405 /*Reorder the sequences according to the tree order: hopefully better phylogenetic coverage after trim*/
9406 if (strstr (mode, "_T"))
9411 if (!sim)sim=sim_array2dist_array ( NULL, MAXID);
9412 T=int_dist2nj_tree (sim, A->name, A->nseq, NULL);
9413 O=tree2seq (T[3][0], NULL);
9414 A=reorder_aln (A, O->name, O->nseq);
9417 free_sequence (O, -1);
/* NOTE(review): in both pairs below the else branch recomputes sim even when the caller supplied one; the enclosing conditions are not visible - confirm. */
9422 if ( strstr (mode, "seq_") && !sim)sim=seq2comp_mat (aln2seq(A), "blosum62mt", "sim");
9423 else sim=aln2sim_mat (A, "idmat");
9428 if ( strstr (mode, "seq_") && !sim)sim=seq2comp_mat (aln2seq(A), "blosum62mt", "cov");
9429 else sim=aln2cov (A);
/* Forced keeps: sequences of K, then sequences matched by seq_list, then the first KeepN. */
9434 if ( K && K->nseq>0)
9436 for ( a=0; a< K->nseq; a++)
9437 if ( (k=name_is_in_list (K->name[a], A->name, A->nseq, MAXNAMES+1))!=-1)
9445 for ( a=0; a< A->nseq; a++)
9447 if (strstr (field, "NAME") && perl_strstr (A->name[a], seq_list)){keep[a]=1;}
9448 else if (strstr (field, "COMMENT") && A->seq_comment && perl_strstr(A->seq_comment[a], seq_list)){keep[a]=1;}
9449 else if (strstr (field, "SEQ") && perl_strstr((A->S)->seq[a], seq_list)){keep[a]=1;}
/* NOTE(review): KeepN is not compared with A->nseq by any visible statement. */
9453 for (a=0; a<KeepN; a++)keep[a]=1;
9457 for ( a=0; a< A->nseq; a++)
9458 if ( keep[a]) fprintf ( stderr, "\nFORCED KEEP %s", A->name[a]);
/* Main loop: drop the worst sequence while one exists and too many sequences remain. */
9464 while ( (s=find_worst_seq (sim, A->nseq, keep, maxsim, direction))!=-1 && new_nseq>maxnseq)
/* The removed sequence is neutralised in the matrix with -1. */
9466 for ( a=0; a< A->nseq; a++)sim[a][s]=sim[s][a]=-1;
/* Outlayer removal: average similarity of each remaining sequence with the others. */
9475 tot_avg=vcalloc ( A->nseq, sizeof (int));
9477 for (a=0; a<A->nseq; a++)
9479 if ( keep[a]==-1)tot_avg[a]=-1;
9482 for (nn=0, b=0; b< A->nseq; b++)
9484 if (a==b || keep[b]==-1)continue;
9487 tot_avg[a]+=sim[a][b];
9491 tot_avg[a]=(nn==0)?-1:(tot_avg[a])/nn;
9494 for ( a=0; a<A->nseq; a++)
9496 if (tot_avg[a]!=-1 && tot_avg[a]<outlayers)
9498 fprintf ( stderr, "\nREMOVED OUTLAYER: %3d %% avg similarity with remaining sequences [Seq %s]", tot_avg[a],A->name[a]);
9505 for ( n=0, a=0; a< A->nseq; a++)
9513 R=extract_sub_aln (A, n, list);
9514 free_int (sim, -1); vfree (list);
/*
 * find_worst_seq: picks the sequence to remove next from an n x n similarity
 * matrix; -1 is returned when the candidate is a protected sequence (keep[r]
 * is non-zero).
 *
 * sc[a][0] holds the sequence index and sc[a][1] a score accumulated over the
 * pairs involving a; pairs add to a sequence only when keep is not 1 for it.
 * With direction==-1 both max and each similarity are mirrored as 100 minus
 * the value. After sort_int_inv, sc[0] is taken as the candidate when its
 * score is positive.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the use of
 * max, the final return); only visible statements are described.
 */
9519 int find_worst_seq ( int **sim, int n, int *keep,int max,int direction)
/* NOTE(review): sc is not released by any visible statement. */
9525 sc=declare_int (n, 2);
9526 if (direction==-1)max=100-max;
9528 for ( a=0; a< n; a++) sc[a][0]=a;
9529 for ( a=0; a< n-1; a++)
9531 for ( b=a+1; b<n; b++)
/* Negative entries mark removed sequences (see the caller) and are not mirrored here. */
9534 if (sim[a][b]>=0)si=(direction==-1)?100-sim[a][b]:sim[a][b];
9538 if ( keep[a]!=1)sc[a][1]+=si;
9539 if ( keep[b]!=1)sc[b][1]+=si;
9544 sort_int_inv ( sc, 2, 1, 0, n-1);
9545 if ( sc[0][1]>0)r=sc[0][0];
9549 if (r!=-1 && keep && keep[r])return -1;
/*
 * find_worst_seq_old: earlier variant of find_worst_seq. It accumulates, for
 * every sequence whose keep flag is not 1, the similarities of the pairs it
 * belongs to, sorts the scores with sort_int_inv and takes the first entry as
 * the candidate (score > 0 when direction==1, score >= 0 when direction==-1).
 * -1 is returned when the candidate is protected by keep.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the first
 * direction test, the final return); only visible statements are described.
 */
9553 int find_worst_seq_old ( int **sim, int n, int *keep,int max,int direction)
/* NOTE(review): sc is not released by any visible statement. */
9558 sc=declare_int (n, 2);
9560 for ( a=0; a< n; a++) sc[a][0]=a;
9561 for ( a=0; a< n-1; a++)
9563 for ( b=a+1; b<n; b++)
9569 if ( keep[a]!=1)sc[a][1]+=sim[a][b];
9570 if ( keep[b]!=1)sc[b][1]+=sim[a][b];
9573 else if ( direction == -1)
/* In this branch only pairs below max, and not already neutralised (negative), are counted. */
9575 if ( sim[a][b]<max && sim[a][b]>=0)
9577 if ( keep[a]!=1)sc[a][1]+=sim[a][b];
9578 if ( keep[b]!=1)sc[b][1]+=sim[a][b];
9584 if ( direction ==1) //remove max
9586 sort_int_inv ( sc, 2, 1, 0, n-1);
9587 if ( sc[0][1]>0)r=sc[0][0];
9591 else if ( direction ==-1)//remove min
/* NOTE(review): the same inverse sort is used for both directions - confirm this is intended for the remove-min case. */
9593 sort_int_inv ( sc, 2, 1, 0, n-1);
9594 if ( sc[0][1]>=0)r=sc[0][0];
/* Debug trace left in the code. */
9596 HERE ("** %d %d\n", r,sc[0][1]);
9599 if (r!=-1 && keep && keep[r])return -1;
/*
 * trimseq: reduces the sequence set S (optionally using alignment A) according
 * to similarity bounds and a target number of sequences, then reports the
 * result on stderr. The selection itself is delegated to seq2subseq2.
 *
 * mode is a string of tokens separated by underscores:
 *   seq / aln   use pairwise or multiple alignment (use_aln)
 *   s  statistics,  t  table of distances,  p  print names of removed neighbours
 *   U<n>, L<n>  upper and lower similarity bounds
 *   n<n>        number of sequences,  N<n>  the same as a percentage
 *               (stored negated, converted to a count further down)
 *   B / T       trim direction,  W<name> weight mode,  M<name> method
 * A second series of tokens separated by colons names sequences looked up in A.
 * When no bound and no sequence count is given, upper_sim defaults to 50.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the return);
 * the comments describe only the visible statements.
 */
9604 Alignment * trimseq( Alignment *A, Sequence *S,char *mode)
9609 int use_aln=0, upper_sim=0, min_nseq=0, lower_sim=0;
9610 char weight_mode[1000];
9613 int trim_direction=TOP;
9618 float f_lower_sim, f_upper_sim;
9624 (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
9629 seq_list=vcalloc ( S->nseq, sizeof (int));
9630 for ( a=0; a< A->nseq; a++)
9636 use_aln=aln_is_aligned(A);
9645 sprintf (weight_mode, "pwsim_fragment");
9646 sprintf ( method, "clustering2");
9651 upper_sim=lower_sim=min_nseq;
9652 sprintf (weight_mode, "pwsim_fragment");
9653 sprintf ( method, "clustering2");
9657 U or % (deprecated) Upper bound for pairwise similarity
9658 L or m (depercated) Lower bound for pairwise similarity
9659 n max number of sequences
9660 N max number of sequences as a fraction of thet total
9662 T print Table of distances
/* NOTE(review): strtok only advances if mode is reset to NULL inside the loop; that reset is not visible in this view. */
9667 while ( (p=strtok(mode, "_")))
9670 if (strm (p, "seq"))use_aln=0;
9671 else if ( strm(p,"aln"))use_aln=1;
9672 else if (p[0]=='s')statistics=1;
9673 else if (p[0]=='t')table=1;
9674 else if (p[0]=='p')print_name=1;
9675 else if (p[0]=='U')upper_sim=atoi(p+1);
9676 else if (p[0]=='L')lower_sim=atoi(p+1);
9677 else if (p[0]=='n')min_nseq=atoi(p+1);
9678 else if (p[0]=='N')min_nseq=atoi(p+1)*-1;
9679 else if (p[0]=='B')trim_direction=BOTTOM;
9680 else if (p[0]=='T')trim_direction=TOP;
9681 else if (p[0]=='W')sprintf (weight_mode, "%s", p+1);
9682 else if (p[0]=='M')sprintf (method, "%s", p+1);
9686 while ((p=strtok(NULL, ":")))
9689 if ( (a=name_is_in_list (p, A->name, A->nseq, 100))!=-1)
9697 if ( !upper_sim && !min_nseq && !lower_sim)upper_sim=50;
9703 fprintf ( stderr, "\ntrimseq requires a set of sequences[FATAL:%s]\n", PROGRAM);
9707 else if ( min_nseq> S->nseq)
9711 else if ( min_nseq<0)
9715 add_warning ( stderr, "\nWARNING: trimseq: Nseq(N) max_val=100%% [Automatic reset]\n");
/* A negative min_nseq encodes a percentage of S->nseq; turn it into a count. */
9719 min_nseq=(int)((float)S->nseq*((float)min_nseq/100)*-1);
9723 NA=seq2subseq2 (A, S,use_aln,lower_sim,upper_sim,min_nseq,trim_direction, weight_mode,&sim_weight, seq_list );
9727 fprintf ( stderr, "\nSIMILARITY MATRIX\n");
9728 for ( a=0; a< A->nseq-1; a++)
9729 for ( b=a+1; b< A->nseq; b++)
9731 fprintf ( stderr, "%15s Vs %15s : %3.2f %% id\n", A->name[a], A->name[b], 100-sim_weight[a][b]);
9735 NA=seq_name2removed_seq_name(S, NA,sim_weight);
9739 fprintf ( stderr, "\nList of sequences with their closest removed neighbors\n");
9740 for ( a=0; a< NA->nseq; a++)fprintf ( stderr, "\n%s: %s\n", NA->name[a], NA->seq_comment[a]);
/* Bounds above 100 are displayed divided by 100, as seq2subseq2 interprets them. */
9745 f_lower_sim=(lower_sim>100)?(float)lower_sim/100:lower_sim;
9746 f_upper_sim=(upper_sim>100)?(float)upper_sim/100:upper_sim;
9748 fprintf ( stderr, "\nTRIM seq Informations:\n");
9749 fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
9750 fprintf ( stderr, "\tcluster_mode..: %s\n" ,method);
9751 fprintf ( stderr, "\tsim_mode......: %s\n" ,weight_mode);
9752 fprintf ( stderr, "\tlower_id_bound: %.2f%%\n" ,(f_lower_sim==0)?-1:f_lower_sim);
9753 fprintf ( stderr, "\tupper_id_bound: %.2f%%\n",(f_upper_sim==0)?-1:f_upper_sim);
9754 fprintf ( stderr, "\tnseq_kept.....: %d (out of %d)\n" ,NA->nseq, S->nseq);
9755 fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NA->nseq*100)/S->nseq);
9756 fprintf ( stderr, "\tTrim_direction: From %s \n" ,(trim_direction==BOTTOM)?"Bottom":"Top");
/*
 * tc_trimseq: trims a sequence set by running the external t_coffee program
 * (looked up with get_string_variable) with its -trim option on a temporary
 * Clustal file, then reading the trimmed sequences back.
 *
 * mode is a string of tokens separated by the hash character:
 *   %<n> or S<n>   similarity threshold (sim)
 *   n<n> or N<n>   target number of sequences (nseq)
 *   other tokens   a sequence name, looked up in A from the second character on
 * At least one of sim and nseq must be non-zero, otherwise the program exits.
 * With a target nseq, the threshold passed as -split_score_thres is searched by
 * bisection between bottom and top until the sequence count matches or the
 * midpoint stops moving.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the return,
 * the initialisation of top, bottom and keep_list); only visible statements
 * are described.
 */
9762 Alignment * tc_trimseq( Alignment *A, Sequence *S,char *mode)
9766 char *trimfile, *alnfile;
9768 int a, nseq=0, sim=0;
9770 char command[100000];
9771 char keep_list[10000];
9773 int top, bottom, middle, pmiddle;
9777 seq_list=vcalloc ( S->nseq, sizeof (int));
9778 for ( a=0; a< A->nseq; a++)
9783 trimfile=vtmpnam (NULL);
9784 alnfile=vtmpnam (NULL);
/* An unaligned input is first aligned quickly so that it can be written as an alignment. */
9785 if ( !aln_is_aligned (A))
9787 fprintf ( stderr, "\ntrimTC: computation of an Approximate MSA [");
9788 A=compute_tcoffee_aln_quick ( A, NULL);
9789 fprintf ( stderr, "DONE]\n");
9791 output_clustal_aln (alnfile, A);
/* NOTE(review): strtok only advances if mode is reset to NULL inside the loop; that reset is not visible in this view. */
9794 while ( (p=strtok(mode, "#")))
9799 if (p[0]=='%' || p[0]=='S')sim=(p[1]=='%')?atoi(p+2):atoi(p+1);
9800 else if (p[0]=='n' || p[0]=='N')nseq=atoi(p+1);
9803 if ( (a=name_is_in_list (p+1, A->name, A->nseq, 100))!=-1)
9810 if ( nseq ==0 && sim ==0)
9812 fprintf ( stderr, "\nERROR: trimTC\nIndicate the maximum number of sequences Nnseq\nOR the maximum average similarity of the chosen sequencesSx\nEX: +trimTC S20 OR +trimTC N5");
9813 fprintf ( stderr, "\n[FATAL:%s]", PROGRAM);
9814 myexit (EXIT_FAILURE);
/* NOTE(review): keep_list is appended to with strcat, without a length check; its initialisation to an empty string is not visible here - confirm. */
9817 for ( a=0; a<A->nseq; a++)if (seq_list[a]==2){strcat ( keep_list, A->name[a]);strcat ( keep_list," ");}
/* Similarity mode: a single run with the requested threshold. */
9821 sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,sim);
9822 if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
9823 my_system ( command);
9824 TS=read_sequences (trimfile);
/* Sequence-count mode: bisection on the threshold, starting at 50. */
9826 else if ( nseq && A->nseq>nseq)
9830 pmiddle=0;middle=50;
9832 sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0",get_string_variable("t_coffee"), alnfile, trimfile,middle);
9833 if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
9834 my_system ( command);
9836 TS=read_sequences (trimfile);
9837 fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t",middle, TS->nseq);
/* Too many sequences lowers the upper bound, too few raises the lower bound. */
9839 if ( TS->nseq>nseq)top=middle;
9840 else if ( TS->nseq<nseq)bottom=middle;
9842 middle=(top-bottom)/2+bottom;
9844 while (TS->nseq!=nseq && pmiddle!=middle)
9847 sprintf ( command , "%s -infile %s -trim -trimfile=%s -split_score_thres %d -convert -iterate 0 ",get_string_variable("t_coffee"), alnfile, trimfile,middle);
9848 if ( keep_list[0]){strcat ( command, " -seq_to_keep ");strcat ( command, keep_list);}
9849 my_system ( command);
9850 free_sequence (TS, -1);
9851 TS=read_sequences (trimfile);
9852 fprintf ( stderr, "\n\tTrimTC: Sim %d Nseq %d\t", middle, TS->nseq);
9854 if ( TS->nseq>nseq)top=middle;
9855 else if ( TS->nseq<nseq)bottom=middle;
9857 middle=(top-bottom)/2+bottom;
9864 NA=seq2aln (TS, NULL, 1);
9866 fprintf ( stderr, "\n");
/*
 * seq2subseq3: selects sequences according to their similarity with the master
 * sequences (those whose seq_list entry is 2) and builds a new alignment from
 * the ones that remain.
 *
 * int_lower_sim / int_upper_sim: bounds; a value above 100 is divided by 100.
 * sim_weight: output, sim_weight[0] receives the matrix returned by get_weight;
 *             similarity is read from it as 100 minus the stored value.
 * seq_list:   per-sequence flags, read here (2 marks a master, 1 a candidate).
 *
 * NOTE(review): this view is missing lines (braces, declarations, the action
 * taken on a sequence that is out of bounds, the return); only visible
 * statements are described.
 */
9871 Alignment* seq2subseq3( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list)
9880 float sim, lower_sim, upper_sim;
9882 lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim;
9883 upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim;
9885 sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode);
9887 name=declare_char (S->nseq, (MAXNAMES+1));
9888 seq= declare_char (S->nseq, S->max_len+1);
9891 Remove every sequence that is more than upper_sim and less than lower_sim similar to the master sequences
9892 the master sequence(s) are those for which seq_list[x]==2
9901 for (a=0; a< A->nseq; a++)
9903 if ( seq_list[a]==2)
9906 for ( b=0; b< A->nseq;b++)
9908 sim=100-sim_weight[0][a][b];
9909 if (seq_list[b]==1 && (sim>upper_sim || sim<lower_sim))
9919 /*Prepare the new sequence List*/
9921 for (b=0, a=0; a<S->nseq; a++)
9925 sprintf ( name[b], "%s", S->name[a]);
/* NOTE(review): seq rows hold S->max_len+1 bytes; when use_aln is set an aligned row of A is copied in - confirm it cannot be longer. */
9926 sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] );
9932 NS=fill_sequence_struc (new_nseq,seq,name);
9933 NA=seq2aln(NS,NULL,1);
/* The new alignment is resized to the geometry of A and the aligned rows are copied over. */
9937 NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1);
9939 for (b=0, a=0; a<S->nseq; a++)
9943 sprintf ( NA->seq_al[b] , "%s",A->seq_al[a]);
9948 NA->len_aln=A->len_aln;
/*
 * seq2subseq2: trims a sequence set in three visible stages driven by
 * extreme_seq, then builds a new alignment from the sequences whose seq_list
 * entry is still set.
 *   1. while lower_sim is set and the BOTTOM extreme is below it: drop it
 *   2. while upper_sim is set and the TOP extreme is above it: drop it
 *   3. while more than min_nseq sequences remain: drop the extreme found in
 *      trim_direction
 * A sequence is dropped by clearing its seq_list entry, and only when that
 * entry is 1. Each stage also stops when extreme_seq reports index -1.
 *
 * int_lower_sim / int_upper_sim: bounds; a value above 100 is divided by 100.
 * sim_weight: output, sim_weight[0] receives the matrix returned by get_weight.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the updates
 * of new_nseq, the return); only visible statements are described.
 */
9955 Alignment* seq2subseq2( Alignment *A, Sequence *S,int use_aln, int int_lower_sim,int int_upper_sim, int min_nseq, int trim_direction, char *weight_mode, float ***sim_weight, int *seq_list)
9964 float lower_sim, upper_sim;
9966 lower_sim=(int_lower_sim>100)?(float)int_lower_sim/100:int_lower_sim;
9967 upper_sim=(int_upper_sim>100)?(float)int_upper_sim/100:int_upper_sim;
9970 sim_weight[0]=get_weight ((use_aln)?A:NULL, S, weight_mode);
9972 name=declare_char (S->nseq, (MAXNAMES+1));
9973 seq= declare_char (S->nseq, S->max_len+1);
9977 2 REMOVE CLOSELY RELATED SEQUENCES
9978 3 IF STILL TOO MANY SEQUENCES:
9979 REMOVE THE MOST CLOSELY RELATED ONES
9983 /*1 Remove outlayers*/
9988 /*1 Remove outlayers*/
9989 while ( lower_sim && (extreme_seq(BOTTOM,A,sim_weight[0],seq_list, &seq_index) <lower_sim) && ((new_nseq)>min_nseq) && seq_index!=-1)
9992 if ( seq_list[seq_index]==1)
9994 seq_list[seq_index]=0;
9998 /*2 Remove close relative*/
10001 while ( upper_sim && (extreme_seq(TOP, A,sim_weight[0],seq_list, &seq_index)>upper_sim) && ((new_nseq)>min_nseq)&& seq_index!=-1)
10004 if ( seq_list[seq_index]==1)
10006 seq_list[seq_index]=0;
10012 /*Remove extra sequences*/
10014 while ( min_nseq>0 && new_nseq>min_nseq && seq_index!=-1)
10017 extreme_seq(trim_direction, A,sim_weight[0],seq_list, &seq_index);
10019 if ( seq_index==-1)break;
10020 if ( seq_list[seq_index]==1)
10022 seq_list[seq_index]=0;
10028 /*Prepare the new sequence List*/
10030 for (b=0, a=0; a<S->nseq; a++)
10034 sprintf ( name[b], "%s", S->name[a]);
/* NOTE(review): seq rows hold S->max_len+1 bytes; when use_aln is set an aligned row of A is copied in - confirm it cannot be longer. */
10035 sprintf ( seq[b] , "%s",(use_aln)?A->seq_al[a]: S->seq[a] );
10041 NS=fill_sequence_struc (new_nseq,seq,name);
10042 NA=seq2aln(NS,NULL,1);
/* The new alignment is resized to the geometry of A and the aligned rows are copied over. */
10046 NA=realloc_aln2 ( NA,A->max_n_seq,A->len_aln+1);
10048 for (b=0, a=0; a<S->nseq; a++)
10052 sprintf ( NA->seq_al[b],"%s",A->seq_al[a]);
10057 NA->len_aln=A->len_aln;
/*
 * extreme_seq: scans the candidate sequences (seq_list entry equal to 1) and,
 * for each, the similarity (100 minus sim_weight) to the other sequences whose
 * seq_list entry is non-zero. It tracks one TOP and one BOTTOM extreme.
 *
 * direction: BOTTOM or TOP, selects which extreme is reported.
 * seq_index: output, seq_index[0] receives the index of the selected sequence;
 *            it is preset to -1 before the scan.
 * Returns bottom_sim for BOTTOM and top_sim for TOP.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the update
 * of best_sim, the initial values of top_sim and bottom_sim, the fall-through
 * return); only visible statements are described. No comment is added inside
 * the body because the closing delimiter of the embedded comment is not
 * visible in this view.
 */
10065 float extreme_seq (int direction, Alignment *A,float **sim_weight,int *seq_list, int *seq_index)
10068 /*find the closest relative of each sequence
10070 Direction= BOTTOM: the sequence whose closest relative is the most distant
10071 Direction= TOP: the sequence whose closest relative is the closest
10072 weight: different sequences=100
10073 similar sequences =0
10077 float top_sim,bottom_sim, best_sim, sim;
10078 int top_seq, bottom_seq;
10080 bottom_seq=top_seq=seq_index[0]=-1;
10084 for (a=0; a< A->nseq; a++)
10086 if (seq_list[a]!=1)continue;
10088 for ( best_sim=0, b=0; b< A->nseq; b++)
10090 if ( a==b || !seq_list[b])continue;
10092 sim=100-sim_weight[a][b];
10099 if ( best_sim>top_sim)
10105 if ( best_sim<bottom_sim)
10108 bottom_sim=best_sim;
10112 if ( direction==BOTTOM ){seq_index[0]= bottom_seq; return bottom_sim;}
10113 else if ( direction==TOP){seq_index[0]= top_seq; return top_sim;}
/*
 * seq2subseq1: greedy selection of a subset of S. A first sequence is chosen,
 * then sequences are added one at a time while two conditions hold:
 *   condition1: the set score (chosen) is at most percent, or percent is 0
 *   condition2: the number of chosen sequences is at most max_nseq, or
 *               max_nseq is 0
 * Progress and a summary are printed on stderr; the result is built with
 * fill_sequence_struc and seq2aln.
 *
 * Weights: sim_weight comes from get_weight; seq_weight[a][0] is 100 minus the
 * average of sim_weight over the other sequences; pw_weight combines both.
 *
 * NOTE(review): this view is missing lines (braces, declarations, several
 * assignments of best_seq and nchosen, the return); only visible statements
 * are described.
 */
10124 Alignment* seq2subseq1( Alignment *A, Sequence *S,int use_aln, int percent,int max_nseq, int ms,char *weight_mode)
10126 float **pw_weight,**sim_weight, **seq_weight;
10128 float sum, chosen,last_chosen, last_nchosen,nchosen;
10129 int condition1, condition2;
10132 char **name, **seq;
10133 float score, best_score;
10135 int *seq_list, *used_seq_list;
10139 (trim)_<seq or aln>_%<percentage of tot weight to keep>_n<number of seq to keep>_w<weight mode>
10142 sim_weight=get_weight ((use_aln)?A:NULL, S, weight_mode);
10143 pw_weight=declare_float (S->nseq, S->nseq);
10144 seq_weight=declare_float ( S->nseq, 2);
/* Per-sequence weight: 100 minus the mean of sim_weight over the other sequences. */
10147 for (best_score=0,a=0; a<S->nseq; a++)
10149 for ( b=0; b<S->nseq; b++)
10151 if ( a==b)continue;
10152 seq_weight[a][0]+=sim_weight[a][b];
/* NOTE(review): the divisor is zero when S holds a single sequence. */
10154 seq_weight[a][0]=seq_weight[a][0]/(S->nseq-1);
10155 score=seq_weight[a][0]=100-seq_weight[a][0];
10157 if ( score>best_score)
/* Pairwise weight: sim_weight scaled by the two per-sequence weights (each out of 100). */
10164 for (a=0; a<S->nseq; a++)
10166 for ( b=0; b<S->nseq; b++)
10168 if ( a==b)continue;
10169 pw_weight[a][b]=sim_weight[a][b]*seq_weight[a][0]*seq_weight[b][0]/(100*100);
10175 seq_list=vcalloc ( S->nseq, sizeof (int));
10176 used_seq_list=vcalloc ( S->nseq, sizeof (int));
10180 name=declare_char (S->nseq, (MAXNAMES+1));
10181 seq= declare_char (S->nseq, S->max_len+1);
10183 /*compute the normalization factor*/
10184 for (sum=0,d=0; d< S->nseq; d++)
10186 for (score=0,c=0; c<S->nseq; c++)
10189 score=MAX(score, 100-sim_weight[c][d]);
10194 /*chose the first sequence */
10195 for ( best_score=0,a=0; a< S->nseq; a++)
10197 for (score=0, b=0; b< S->nseq; b++)
10199 score+=100-sim_weight[a][b];
10201 if ( score>best_score)
10210 last_chosen=chosen=((best_score/S->nseq)*100)/sum;
10211 nchosen=last_nchosen=1;
10212 seq_list[0]=best_seq;
10213 used_seq_list[best_seq]=1;
10215 sprintf ( name[0],"%s", S->name[seq_list[0]]);
10216 sprintf ( seq[0],"%s", S->seq[seq_list[0]]);
10217 nchosen=last_nchosen=1;
10220 fprintf ( stderr, "\nTRIM:\n");
10221 fprintf ( stderr, "\n1-Chosen Sequences\n");
10222 /*Assemble the list of sequences*/
10223 for (a=1; a< S->nseq; a++)
10225 for (best_score=0,b=0; b< S->nseq; b++)
/* NOTE(review): this if has an empty statement as its body; presumably an else that is not visible follows - confirm. */
10227 if (used_seq_list[b]);
/* The candidate score is the smallest pw_weight to any sequence already chosen. */
10230 score=pw_weight[seq_list[0]][b]+1;
10231 for (c=0; c<a; c++)
10232 score=MIN(score,pw_weight[seq_list[c]][b]);
10234 if ( score>=best_score)
10242 seq_list[a]=best_seq;
10243 used_seq_list[best_seq]=1;
/* Set score: recomputed from scratch over all sequences after each addition. */
10247 for ( chosen=0,d=0; d< S->nseq; d++)
10249 for (score=0, c=0; c<=a; c++)
10251 if ( seq_list[c]!=d)
10252 score=MAX(score, 100-sim_weight[seq_list[c]][d]);
10258 chosen=((chosen/S->nseq)*100)/sum;
10261 condition1= (int)chosen<=(int)percent || !percent;
10262 condition2=(nchosen)<=max_nseq || !max_nseq;
10264 if (condition1 && condition2)
10266 fprintf ( stderr, "\tADD %s (set score: %.2f %%)\n", S->name[seq_list[a]], chosen);
10267 sprintf ( name[a],"%s", S->name[seq_list[a]]);
10268 sprintf ( seq[a],"%s", S->seq[seq_list[a]]);
10275 last_chosen=chosen;
10276 last_nchosen=nchosen;
10279 NS=fill_sequence_struc (last_nchosen,seq,name);
10280 NA=seq2aln(NS,NULL,1);
10281 fprintf ( stderr, "\n2-Informations:\n");
10282 fprintf ( stderr, "\tUse...........: %s\n",(use_aln)?"multiple_aln":"pairwise_aln");
10283 fprintf ( stderr, "\tweight_mode...: %s\n" ,weight_mode);
10284 fprintf ( stderr, "\tpercent_weight: %.2f%% (max=%d%%)\n",last_chosen,percent);
10285 fprintf ( stderr, "\tn_seq.........: %d\n" ,NS->nseq);
10286 fprintf ( stderr, "\treduction.....: %d%% of original set\n" ,(NS->nseq*100)/S->nseq);
/*
 * seq_weight2species_weight: sums the per-sequence weights of alignment A into
 * one weight per entry of S. A sequence of A contributes to entry a of S when
 * its name contains S->name[a] (substring test with strstr). The resulting
 * weights are printed on stderr.
 *
 * If A has no sequence weights yet, they are computed first with
 * aln2voronoi_weights.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the binding
 * of wsp, the return); only visible statements are described.
 */
10290 Sequence * seq_weight2species_weight (Alignment *A, Sequence *S)
10296 S->W=declare_weights(S->nseq);
10297 if (!A->S || !(A->S)->W)aln2voronoi_weights (A);
10299 wseq=((A->S)->W)->SEQ_W;
10301 for ( a=0; a< S->nseq; a++)
10303 for (b=0; b<A->nseq; b++)
/* NOTE(review): a substring match means one sequence can contribute to several entries of S when names overlap. */
10304 if ( strstr (A->name[b], S->name[a]))wsp[a]+=wseq[b];
10306 for (a=0; a<S->nseq; a++)
10307 fprintf ( stderr, "\nVoronoi Weights: Species %s ---> %.2f\n", S->name[a], wsp[a]);
/*
 * aln2voronoi_weights: computes one weight per sequence of A and stores it in
 * the SEQ_W array of (A->S)->W.
 *
 * For every column, tab groups the sequences by the character they carry:
 * tab[c][0] is the number of sequences showing character c and tab[c][1..]
 * their indices. Each sequence of a group receives 1 over the group size, and
 * t accumulates the same amounts. Finally each weight is divided by t and
 * multiplied by A->nseq.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the reset of
 * tab between columns, the return); only visible statements are described.
 */
10310 Alignment * aln2voronoi_weights (Alignment *A)
10317 tab=declare_int (256, A->nseq+1);
/* NOTE(review): A->S is released here and dereferenced on the next visible line; presumably a line that is not visible rebuilds it in between - confirm. */
10318 if (A->S)free_sequence (A->S, (A->S)->nseq);
10320 (A->S)->W=declare_weights (A->nseq);
10321 w=((A->S)->W)->SEQ_W;
10323 for (a=0; a<A->len_aln; a++)
10325 for ( b=0; b<A->nseq; b++)
10327 c= A->seq_al[b][a];
/* Slot 0 is the running count; it is incremented first so indices are stored from slot 1. */
10331 tab[c][++tab[c][0]]=b;
10334 for (c=0; c<256; c++)
10338 for (b=1; b<=tab[c][0]; b++)
10340 w[tab[c][b]]+=(float)1/(float)tab[c][0];
10341 t+=(float)1/(float)tab[c][0];
/* Normalisation: after this loop the weights sum to A->nseq. */
10347 for (a=0; a<A->nseq; a++)
10349 w[a]=(w[a]/t)*A->nseq;
/*
 * get_weight: returns a table of weights for the sequences of S.
 *
 * mode selects how they are obtained:
 *   NULL, empty or msa       an external program, taken from the environment
 *                            variable SEQ2MSA_WEIGHT or, failing that, from
 *                            the SEQ2MSA_WEIGHT macro
 *   pwsim / pwsim_fragment   computed in-process by seq2pwsim (returned at once)
 *   anything else            mode is treated as the name of an environment
 *                            variable holding the program to run
 * For the external program, sequences (and the alignment, when available) are
 * written to temporary files, the program is run with my_system, and its
 * output file is parsed: everything up to the first dollar sign is skipped,
 * then one name and one weight are read per entry into weight[c][1].
 *
 * Changes in this version:
 *   - the loop that skips to the dollar sign also stops at end of file; it
 *     used to spin forever on an output file that contains no dollar sign.
 *   - program and command are filled with snprintf bounded by the size of the
 *     local buffers, because part of their content comes from the environment.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the reset of
 * c before parsing, the return); only visible statements are described.
 */
10355 float ** get_weight ( Alignment *A, Sequence *S, char *mode)
10360 char command[LONG_STRING];
10361 char program[LONG_STRING];
10366 if ( !mode || !mode[0] || strm (mode, "msa"))
10368 if ( getenv ( "SEQ2MSA_WEIGHT")==NULL)snprintf (program, sizeof (program), "%s",SEQ2MSA_WEIGHT);
10369 else snprintf ( program, sizeof (program), "%s", (getenv ( "SEQ2MSA_WEIGHT")));
10371 else if ( strm(mode, "pwsim") ||strm(mode, "pwsim_fragment") )
10373 return seq2pwsim (A, S, mode);
10377 if (getenv (mode))snprintf ( program, sizeof (program), "%s", (getenv (mode)));
10378 else fprintf ( stderr, "\nERROR: %s is not a valid mode for weight computation [FATAL:%s]", mode, PROGRAM);
10382 seq_name=vtmpnam(NULL);
10383 aln_name=vtmpnam(NULL);
10384 weight_name=vtmpnam(NULL);
10385 weight=declare_float (S->nseq+1, 2);
10391 output_clustal_aln (seq_name,A);
10392 output_fasta_seq (aln_name,A);
10393 snprintf ( command, sizeof (command), "%s %s -i %s -w %s", program, seq_name, aln_name, weight_name);
10398 output_fasta_seq (seq_name,A);
10399 snprintf ( command, sizeof (command), "%s %s -w %s", program, seq_name, weight_name);
10403 my_system ( command);
10405 fp=vfopen( weight_name, "r");
/* Skip the header of the weight file; stop at end of file as well so a malformed file cannot hang the program. */
10406 while ( (c=fgetc(fp))!='$' && c!=EOF);
/* One entry per line: a name (ignored) followed by the weight; column 0 records the entry rank. */
10409 while ( (fscanf (fp, "%*s %f\n",&(weight[c][1])))==1)
10410 {weight[c][0]=c;c++;}
/*
 * seq2pwsim: fills a symmetric S->nseq by S->nseq table W of pairwise
 * distances, stored as (1-d)*100.
 *
 * For each pair (a, b) with b >= a, two sources are visible:
 *   - a pairwise alignment B of the two raw sequences (align_two_sequences
 *     with pam250mt, -10, -1, fasta_pair_wise), or
 *   - the rows a and b of the multiple alignment A.
 * d counts the columns where both residues are identical and not a gap; t
 * counts the columns where neither is a gap. In pwsim_fragment mode t is
 * replaced by the full length of the pairwise alignment.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the choice
 * between the two sources, the division of d by t, the return); only visible
 * statements are described.
 */
10417 float **seq2pwsim ( Alignment *A, Sequence *S, char *mode)
10423 W=declare_float (S->nseq, S->nseq);
10427 for (a=0; a< S->nseq; a++)
10428 for ( b=a; b<S->nseq; b++)
10434 B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise");
10435 for (t=0,d=0,c=0; c<B->len_aln; c++)
10437 d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c]));
10438 t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c]));
10440 t=(strm ( mode, "pwsim_fragment"))?B->len_aln:t;
10447 for (t=0,d=0,c=0; c<A->len_aln; c++)
10449 d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c]));
10450 t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c]));
/* NOTE(review): d is presumably turned into a fraction of t on a line that is not visible - confirm. */
10456 W[a][b]=W[b][a]=(1-d)*100;
/*
 * seq2pwsim_fragment: same visible structure as seq2pwsim: a symmetric
 * S->nseq by S->nseq table W of pairwise distances stored as (1-d)*100, where
 * d counts identical non-gap columns and t counts columns where neither
 * sequence has a gap, taken either from a pairwise alignment B of the raw
 * sequences or from the rows of the multiple alignment A.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the choice
 * between the two sources, the normalisation of d, the return), so what
 * distinguishes this routine from seq2pwsim cannot be told from here.
 */
10464 float **seq2pwsim_fragment ( Alignment *A, Sequence *S, char *mode)
10472 W=declare_float (S->nseq, S->nseq);
10477 for (a=0; a< S->nseq; a++)
10478 for ( b=a; b<S->nseq; b++)
10484 B=align_two_sequences ((S)->seq[a], (S)->seq[b],"pam250mt", -10, -1, "fasta_pair_wise");
10485 for (t=0,d=0,c=0; c<B->len_aln; c++)
10487 d+=(B->seq_al[0][c]==B->seq_al[1][c] && !is_gap(B->seq_al[0][c]));
10488 t+=(!is_gap(B->seq_al[0][c]) && !is_gap(B->seq_al[1][c]));
10496 for (t=0,d=0,c=0; c<A->len_aln; c++)
10498 d+=(A->seq_al[a][c]==A->seq_al[b][c] && !is_gap(A->seq_al[a][c]));
10499 t+=(!is_gap(A->seq_al[a][c]) && !is_gap(A->seq_al[b][c]));
10505 W[a][b]=W[b][a]=(1-d)*100;
10513 /********************************************************************/
10515 /* AMINO ACID FUNCTIONS */
10519 /********************************************************************/
10520 //Builds an extended alphabet from a string
/*
 * string2alphabet: builds a list of symbols (falp) from the distinct
 * characters of string and, for depth values from 2 upward, from the
 * combinations of those characters produced by generate_array_string_list.
 * A star entry is appended as well.
 *
 * falp_size: output, falp_size[0] is incremented for every entry stored.
 *
 * NOTE(review): this view is missing lines (braces, declarations, the branch
 * structure, the return); only visible statements are described.
 */
10521 char** string2alphabet (char *string, int depth, int *falp_size)
/* NOTE(review): array is allocated here and again a few lines below; no release is visible in between - confirm the two are in separate branches. */
10537 array=vcalloc ( 256, sizeof (int));
10542 falp=declare_char (l+1, 2);
10544 alp=declare_char(l,2);
10547 array=vcalloc ( 256, sizeof (int));
/* array flags the characters already met so that each one is stored once. */
10550 if (!array[(int)string[a]])
10552 array[(int)string[a]]=1;
10553 sprintf (alp[alp_size++], "%c", string[a]);
10554 sprintf (falp[falp_size[0]++], "%c", string[a]);
10557 sprintf ( falp[falp_size[0]++], "*");
/* NOTE(review): alp is released here and once more at the end of the visible lines - confirm the two calls are on different paths. */
10562 free_char (alp, -1);
/* One copy of the base alphabet per position, as input for the combination generator. */
10565 alp2=vcalloc ( depth, sizeof (char**));
10566 alp2_size=vcalloc (depth, sizeof (int));
10568 for (a=0; a<depth; a++)
10571 alp2_size[a]=alp_size;
10575 for (a=2; a<=depth; a++)
10577 char ***result_array;
10579 result_array=generate_array_string_list (a, alp2, alp2_size, &n, NULL, NO_OVERLAP);
/* NOTE(review): falp stores string pointers; the element size is written as sizeof (char**) - confirm. */
10581 falp=vrealloc (falp, sizeof (char**)*max_s);
10582 for (b=0; b<n; b++)
/* Each combination is concatenated into buf and stored as one new symbol. */
10585 for (c=0; c<a; c++)
10587 strcat (buf, result_array[b][c]);
10589 falp[falp_size[0]]=vcalloc (strlen (buf)+1, sizeof (char));
10590 sprintf ( falp[falp_size[0]++], "%s", buf);
10591 vfree ( result_array[b]);
10593 vfree (result_array);
10597 falp[falp_size[0]]=vcalloc (2, sizeof (char));
10598 sprintf ( falp[falp_size[0]++], "*");
10599 free_char (alp, -1);
/*
 * make_group_aa: build a list of residue groups, one string per group; in the
 * hard-coded tables below each string lists the members in lower case
 * followed by the same members in upper case.
 * Named modes ("simple", "mafft", "clustalw", "polarity", "vasiliky",
 * "clustalw_col", "clustalw_dot", "make_all") use literal tables; any other
 * non-empty mode is copied into matrix_name, loaded with read_matrice(), and
 * groups are then derived from the positive entries of that matrix.
 * ngroup[0] is used as the running count of groups written to group_list.
 * NOTE(review): this view of the function is missing lines (braces, local
 * declarations, return statements) -- confirm details against the full source.
 */
10603 char** make_group_aa (int *ngroup, char *mode)
10605 /*mode: indicates which matrix will be used for the grouping*/
10606 /*n_group: pointer to the number of groups */
10607 /*return value: an array of strings containing the AA of each group */
10616 matrix_name=vcalloc ( 100, sizeof (char));
/* ngroup[0]==-1 on entry raises the extend flag; where the flag is consumed is not visible here. */
10618 if (ngroup[0]==-1)extend=1;
/* 100 rows requested with a second dimension of 27. */
10621 group_list=declare_char ( 100, 27);
/* First table: one group per residue plus "*". The condition guarding it is not visible in this view. */
10625 sprintf ( group_list[ngroup[0]++], "gG");
10626 sprintf ( group_list[ngroup[0]++], "pP");
10627 sprintf ( group_list[ngroup[0]++], "aA");
10628 sprintf ( group_list[ngroup[0]++], "cC");
10629 sprintf ( group_list[ngroup[0]++], "dD");
10630 sprintf ( group_list[ngroup[0]++], "eE");
10632 sprintf ( group_list[ngroup[0]++], "fF");
10633 sprintf ( group_list[ngroup[0]++], "hH");
10634 sprintf ( group_list[ngroup[0]++], "iI");
10635 sprintf ( group_list[ngroup[0]++], "kK");
10636 sprintf ( group_list[ngroup[0]++], "lL");
10637 sprintf ( group_list[ngroup[0]++], "mM");
10638 sprintf ( group_list[ngroup[0]++], "nN");
10639 sprintf ( group_list[ngroup[0]++], "qQ");
10640 sprintf ( group_list[ngroup[0]++], "rR");
10642 sprintf ( group_list[ngroup[0]++], "sS");
10643 sprintf ( group_list[ngroup[0]++], "tT");
10644 sprintf ( group_list[ngroup[0]++], "vV");
10645 sprintf ( group_list[ngroup[0]++], "wW");
10646 sprintf ( group_list[ngroup[0]++], "*");
/* A leading '_' in mode is skipped; the remainder is copied to matrix_name. */
10649 if ( mode && mode[0]=='_'){mode++;sprintf ( matrix_name, "%s", mode);}
/* NULL or empty mode falls back to the "idmat" matrix name. */
10651 if (mode==NULL || mode[0]=='\0')sprintf ( matrix_name, "idmat");
/*
 * NOTE(review): mode==NULL cannot be true here (handled by the previous if).
 * strstr(mode, "sim") also matches "simple", so the "simple" branch further
 * down looks unreachable as written -- confirm the intended test.
 */
10652 else if ( strstr (mode, "sim") || strm (mode, "idmat") || mode==NULL)
10654 sprintf ( group_list[ngroup[0]++], "aA");
10655 sprintf ( group_list[ngroup[0]++], "bB");
10656 sprintf ( group_list[ngroup[0]++], "cC");
10657 sprintf ( group_list[ngroup[0]++], "dD");
10658 sprintf ( group_list[ngroup[0]++], "eE");
10659 sprintf ( group_list[ngroup[0]++], "fF");
10660 sprintf ( group_list[ngroup[0]++], "gG");
10661 sprintf ( group_list[ngroup[0]++], "hH");
10662 sprintf ( group_list[ngroup[0]++], "iI");
10663 sprintf ( group_list[ngroup[0]++], "jJ");
10664 sprintf ( group_list[ngroup[0]++], "kK");
10665 sprintf ( group_list[ngroup[0]++], "lL");
10666 sprintf ( group_list[ngroup[0]++], "mM");
10667 sprintf ( group_list[ngroup[0]++], "nN");
10668 sprintf ( group_list[ngroup[0]++], "oO");
10669 sprintf ( group_list[ngroup[0]++], "pP");
10670 sprintf ( group_list[ngroup[0]++], "qQ");
10671 sprintf ( group_list[ngroup[0]++], "rR");
10672 sprintf ( group_list[ngroup[0]++], "sS");
10673 sprintf ( group_list[ngroup[0]++], "tT");
10674 sprintf ( group_list[ngroup[0]++], "uU");
10675 sprintf ( group_list[ngroup[0]++], "vV");
10676 sprintf ( group_list[ngroup[0]++], "wW");
10677 sprintf ( group_list[ngroup[0]++], "xX");
10678 sprintf ( group_list[ngroup[0]++], "yY");
10679 sprintf ( group_list[ngroup[0]++], "zZ");
/* Table modes do not need matrix_name, so it is released in each branch. */
10680 vfree (matrix_name);
10683 else if ( strm (mode, "simple"))
10685 sprintf ( group_list[ngroup[0]++], "avilmAVILM");
10686 sprintf ( group_list[ngroup[0]++], "dekrDEKR");
10687 sprintf ( group_list[ngroup[0]++], "stcnqhSTCNQH");
10688 sprintf ( group_list[ngroup[0]++], "wfyWFY");
10689 sprintf ( group_list[ngroup[0]++], "gG");
10690 sprintf ( group_list[ngroup[0]++], "pP");
10691 vfree (matrix_name);
10695 else if ( strm (mode, "mafft"))
10699 sprintf ( group_list[ngroup[0]++],"agjopstAGJOPST");
10700 sprintf ( group_list[ngroup[0]++],"ilmvILMV");
10701 sprintf ( group_list[ngroup[0]++],"bdenqzBDENQZ");
10702 sprintf ( group_list[ngroup[0]++],"hkrHKR");
10703 sprintf ( group_list[ngroup[0]++],"fwyFWY");
10704 sprintf ( group_list[ngroup[0]++],"cC");
10705 vfree (matrix_name);
10708 else if ( strm (mode, "clustalw"))
10711 sprintf ( group_list[ngroup[0]++],"astaASTA");
10712 sprintf ( group_list[ngroup[0]++],"bneqkBNEQK");
10713 sprintf ( group_list[ngroup[0]++],"cnhqkCNHQK");
10714 sprintf ( group_list[ngroup[0]++],"dndeqDNDEQ");
10715 sprintf ( group_list[ngroup[0]++],"eqhrkEQHRK");
10716 sprintf ( group_list[ngroup[0]++],"fmilvFMILV");
10717 sprintf ( group_list[ngroup[0]++],"gmilfGMILF");
10718 sprintf ( group_list[ngroup[0]++],"hhyHHY");
10719 sprintf ( group_list[ngroup[0]++],"ifywIFYW");
10720 sprintf ( group_list[ngroup[0]++],"jcJC");
10721 sprintf ( group_list[ngroup[0]++],"kpKP");
10722 vfree (matrix_name);
10725 else if ( strm (mode, "polarity"))
10728 sprintf ( group_list[ngroup[0]++],"eqrsdnkhtEQRSDNKHT");
10729 sprintf ( group_list[ngroup[0]++],"pP");
10730 sprintf ( group_list[ngroup[0]++],"gG");
10731 sprintf ( group_list[ngroup[0]++],"cC");
10732 sprintf ( group_list[ngroup[0]++],"fywFYW");
10733 sprintf ( group_list[ngroup[0]++],"iavlmIAVLM");
10734 vfree (matrix_name);
10737 else if ( strm (mode, "vasiliky"))
10740 sprintf ( group_list[ngroup[0]++], "rkRK");
10741 sprintf ( group_list[ngroup[0]++], "deDE");
10742 sprintf ( group_list[ngroup[0]++], "qhQH");
10743 sprintf ( group_list[ngroup[0]++], "vilmVILM");
10744 sprintf ( group_list[ngroup[0]++], "fyFY");
10745 sprintf ( group_list[ngroup[0]++], "sS");
10746 sprintf ( group_list[ngroup[0]++], "wW");
10747 sprintf ( group_list[ngroup[0]++], "aA");
10748 sprintf ( group_list[ngroup[0]++], "cC");
10749 sprintf ( group_list[ngroup[0]++], "gG");
10750 sprintf ( group_list[ngroup[0]++], "nN");
10751 sprintf ( group_list[ngroup[0]++], "pP");
10752 sprintf ( group_list[ngroup[0]++], "tT");
10753 vfree (matrix_name);
10756 else if ( strm (mode, "clustalw_col"))
10758 sprintf ( group_list[ngroup[0]++], "staSTA");
10759 sprintf ( group_list[ngroup[0]++], "neqkNEQK");
10760 sprintf ( group_list[ngroup[0]++], "nhqkNHQK");
10761 sprintf ( group_list[ngroup[0]++], "ndeqNDEQ");
10762 sprintf ( group_list[ngroup[0]++], "qhrkQHRK");
10763 sprintf ( group_list[ngroup[0]++], "milvMILV");
10764 sprintf ( group_list[ngroup[0]++], "milfMILF");
10765 sprintf ( group_list[ngroup[0]++], "hyHY");
10766 sprintf ( group_list[ngroup[0]++], "fywFYW");
10767 sprintf ( group_list[ngroup[0]++], "gG");
10768 sprintf ( group_list[ngroup[0]++], "pP");
10769 sprintf ( group_list[ngroup[0]++], "cC");
10770 vfree (matrix_name);
10774 else if ( strm (mode, "clustalw_dot"))
10776 sprintf ( group_list[ngroup[0]++], "csaCSA");
10777 sprintf ( group_list[ngroup[0]++], "atvATV");
10778 sprintf ( group_list[ngroup[0]++], "sagSAG");
10779 sprintf ( group_list[ngroup[0]++], "stnkSTNK");
10780 sprintf ( group_list[ngroup[0]++], "stpaSTPA");
10781 sprintf ( group_list[ngroup[0]++], "sgndSGND");
10782 sprintf ( group_list[ngroup[0]++], "sndeqkSNDEQK");
10783 sprintf ( group_list[ngroup[0]++], "ndeqhkNDEQHK");
10784 sprintf ( group_list[ngroup[0]++], "neqhrkNEQHRK");
10785 sprintf ( group_list[ngroup[0]++], "fvlimFVLIM");
10786 sprintf ( group_list[ngroup[0]++], "hfyHFY");
10787 vfree (matrix_name);
10790 else if ( strm (mode, "make_all"))
/*
 * NOTE(review): this literal is 52 letters plus the terminator, longer than
 * the 27 requested per row in declare_char ( 100, 27) above -- confirm the
 * row size that is actually allocated.
 */
10793 sprintf ( group_list[0], "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
10794 vfree (matrix_name);
/* Any other mode is taken to be the name of a matrix. */
10797 else sprintf ( matrix_name, "%s", mode);
10799 matrix=read_matrice ( matrix_name);
/*
 * Matrix-driven grouping over the 26 letters: only rows with a positive
 * diagonal are considered, and partners need a positive score with a and a
 * positive diagonal of their own. The code that fills buf is not visible here.
 */
10801 for ( a=0;a< 26; a++)
10803 if ( matrix[a][a]>0)
10805 for ( c=0,b=0;b< 26; b++)
10808 if ( matrix[a][b]>0 && matrix[b][b]>0)
/* buf is appended as a new group only if no identical group is already listed. */
10815 for ( is_in=0,b=0; b< ngroup[0]; b++)if ( strcmp (buf, group_list[b])==0)is_in=1;
10816 if (is_in==0)sprintf ( group_list[ngroup[0]++], "%s", buf);
10820 free_int (matrix, -1);
10821 vfree (matrix_name);
/*
 * make_group_aa_upgma: agglomerative grouping of residues driven by the
 * matrix loaded with read_matrice(matrix). Every group starts as a single
 * lower-case letter. The visible loop scans pairs of entries not flagged in
 * used[] for the highest mat[a][b], averages the matrix entries of the chosen
 * pair (ba, bb) into ba, appends bb's members to ba and releases bb. The
 * surviving groups are then compacted to the front of group_list.
 * NOTE(review): the lines that set l, use max_n, update used[]/best/ba/bb and
 * return the result are not part of this view -- confirm against the full
 * source.
 */
10825 char** make_group_aa_upgma (char*matrix, int max_n)
10830 int a, b, ba, bb, best, set, l, n;
10833 group_list=declare_char (l+1, l+1);
/* Seed: group a initially holds the single letter 'a'+a. */
10834 for (a=0; a<l; a++)group_list[a][0]='a'+a;
10835 mat=read_matrice(matrix);
10836 used=vcalloc ( l, sizeof (int));
/* Search all pairs a<b; set==0 means no candidate has been recorded yet, so best is not read before it is set. */
10841 for (set=0,a=0; a<l-1; a++)
10842 for (b=a+1; b<l; b++)
10844 if (used[a]||used[b])continue;
10846 if (set==0 || mat[a][b]>best)
/* Merge step: the merged entry ba gets the (integer) mean of the two original entries, kept symmetric. */
10855 for (a=0; a<l; a++)
10857 mat[ba][a]=mat[a][ba]=(mat [ba][a]+mat[bb][a])/2;
/* bb's members are appended to ba; bb's slot is freed and set to NULL so the compaction below skips it. */
10860 strcat (group_list[ba], group_list[bb]);
10861 vfree (group_list[bb]);
10862 group_list[bb]=NULL;
/* Compact the non-NULL groups to the front; n ends as the number of groups kept. */
10867 for (n=0,a=0; a<l; a++)
10869 if ( group_list[a])
10870 group_list[n++]=group_list[a];
10872 vfree (used); free_int (mat, -1);
/*
 * find_group_aa_distribution: for the nseq characters in col, accumulate for
 * each group the value returned by is_in_set(col[a], group) into a counter
 * array. When distrib is NULL a function-static buffer is used instead.
 * A group list built by make_group_aa(&ln_group, mode) is kept in lgl; the
 * lines choosing between gl/n_group and that list (gl2/n_group2) and the
 * return value are not visible in this view.
 */
10876 int find_group_aa_distribution (char *col, int nseq,int n_group, char **gl, int *distrib, char *mode )
10878 static int *distribution;
10880 static int ln_group;
10889 lgl=make_group_aa ( &ln_group, mode);
/*
 * NOTE(review): the static buffer is (re)allocated with vcalloc without a
 * visible free of the previous one, and the test compares ln_group<n_group
 * while the size used is n_group2 -- confirm.
 */
10902 if ( distribution==NULL || ln_group<n_group)distribution=vcalloc ( n_group2, sizeof (int));
10903 if ( distrib==NULL)d=distribution;
/* Reset the counters before counting. */
10907 for ( a=0; a< n_group2; a++)d[a]=0;
10909 for ( a=0; a< nseq; a++)
10911 for ( b=0; b< n_group2; b++)
10912 d[b]+=is_in_set (col[a], gl2[b]);
10915 for ( a=0; a< n_group2; a++)
/*
 * is_in_same_group_aa: returns 1 when r1 and r2 are both members of at least
 * one group of gl2 (tested with is_in_set). With mode==NULL the test is plain
 * character equality. Mode "clean" frees the cached group list lgl; a mode
 * containing "cov" has a dedicated branch whose body is not visible here.
 * The selection of gl2/n_group2 and the final return are outside this view.
 */
10922 int is_in_same_group_aa ( char r1, char r2, int n_group, char **gl, char *mode)
10926 static int ln_group;
10931 /*use mode=idmat for similarity based on id*/
10935 if (mode==NULL)return (r1==r2)?1:0;
10937 if ( strm (mode, "clean"))
10939 free_char (lgl, -1);
10944 else if ( strstr (mode, "cov"))
10951 lgl=make_group_aa ( &ln_group, mode);
10965 for ( a=0; a< n_group2; a++)
10966 if ( is_in_set ( r1, gl2[a]) && is_in_set ( r2, gl2[a]))return 1;
/* gene2prot: stub -- returns its argument A unchanged. */
10971 Alignment * gene2prot (Alignment *A){return A; }
/*
 * test_gene2prot: experimental dynamic-programming scan of sequence s1 of
 * CL->S. A table of states (ORF1..ORF3, splice-site states *_G1/_T2/_NC/_A3/
 * _T4, U1..U5, s5NC, ...) and a transition matrix are built, the best score
 * per position/state is stored in v_tab with back-pointers in v_tab_p, and
 * the best path is traced back from position l to 1. The annotated sequence
 * and a codon-by-codon translation are written to stderr, after which the
 * function calls myexit (EXIT_SUCCESS).
 * NOTE(review): many lines (declarations, braces, assignment of l, nstate,
 * s3NC, END, seq3 ...) are missing from this view -- confirm against the full
 * source before relying on any detail below.
 */
10972 char * test_gene2prot (Constraint_list *CL, int s1)
10975 int F=-10000000; /*FORBIDDEN STATE*/
10976 int AL=0; /*ALLOWED STATE*/
10977 int SPLICE_PENALTY=1000;
10978 int FRAME_PENALTY=1000;
10981 int START, ORF1, ORF2, ORF3, s5NC;
10982 int s3NC,ORF3_G1, ORF3_T2, ORF3_NC, ORF3_A3, ORF3_T4;
10983 int U1_G1, U1_T2, U1_NC, U1_A3, U1_T4;
10984 int U2_G1, U2_T2, U2_NC, U2_A3, U2_T4;
10985 int U1, U2, U3, U4, U5, END;
10996 int orf1, orf2, orf3, ncp, p, state, pstate, e, best_state_p=0, best_state_v=0, best_pstate_p=0, best_pstate_v;
10997 char *seq, *seq2, *seq3;
11002 int s, r, s2, r2, w2;
/* Three work buffers, each sized to the input sequence plus terminator. */
11007 seq=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
11008 seq2=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
11009 seq3=vcalloc ( strlen ((CL->S)->seq[s1])+1, sizeof (char));
11010 sprintf ( seq, "%s", (CL->S)->seq[s1]);
/* Normalise the working copy: lower case, then 't' rewritten as 'u'. */
11014 for ( a=0; a< l; a++) seq[a]=tolower ( seq[a]);
11015 for ( a=0; a< l; a++) seq[a]=(seq[a]=='t')?'u': seq[a];
/* potential[i] accumulates the weights w2 of the residue_index entries that involve residue i+1 of s1. */
11018 potential=vcalloc (l+1, sizeof (int));
11020 for (nal=0, s=0; s<(CL->S)->nseq; s++)
11022 for ( r=1; r<=(CL->S)->len[s]; r++)
/* NOTE(review): the loop bound reads residue_index[s1][r][0] while the body indexes residue_index[s][r] -- confirm which sequence index is intended. */
11024 for ( b=1; b<CL->residue_index[s1][r][0]; b++)
11027 s2=CL->residue_index[s][r][b+SEQ2];
11028 r2=CL->residue_index[s][r][b+R2];
11029 w2=CL->residue_index[s][r][b+WE];
11030 if (s==s1)potential[r-1]+=w2;
11031 else if ( s2==s1)potential[r2-1]+=w2;
/* These assignments override the initial values given at declaration. */
11039 SPLICE_PENALTY=10000;
11040 FRAME_PENALTY=1000;
/* Each state name receives the next free index in nstate. */
11044 START=nstate++; ORF1=nstate++; ORF2=nstate++; ORF3=nstate++; s5NC=nstate++;
11046 ORF3_G1=nstate++;U1_G1=nstate++;U2_G1=nstate++;
11047 ORF3_T2=nstate++;U1_T2=nstate++;U2_T2=nstate++;
11048 ORF3_NC=nstate++;U1_NC=nstate++;U2_NC=nstate++;
11049 ORF3_A3=nstate++;U1_A3=nstate++;U2_A3=nstate++;
11050 ORF3_T4=nstate++;U1_T4=nstate++;U2_T4=nstate++;
11053 U1=nstate++; U2=nstate++; U3=nstate++; U4=nstate++; U5=nstate++;
/* Per-state flags: coding states, and the three *_T4 states. */
11056 is_coding=vcalloc ( nstate, sizeof (int));
11057 is_coding[ORF1]=is_coding[ORF2]=is_coding[ORF3]=is_coding[U1]=is_coding[U2]=1;
11058 is_coding[U3]=is_coding[U4]=is_coding[U5]=1;
11060 is_t4=vcalloc ( nstate, sizeof (int));
11061 is_t4[ORF3_T4]=is_t4[U1_T4]=is_t4[U2_T4]=1;
/* Transition matrix: everything forbidden (F) unless explicitly allowed below. */
11062 transitions=declare_int ( nstate, nstate);
11063 for (a=0; a< nstate; a++)
11064 for ( b=0; b< nstate; b++)transitions[a][b]=F;
11066 transitions[START][ORF1]=AL;
11067 transitions[START][s5NC]=AL-FRAME_PENALTY;
11068 transitions[s5NC][s5NC]=AL;
11070 transitions[s5NC][ORF1]=AL-FRAME_PENALTY;
11072 transitions[ORF1][ORF2]=AL;
11073 transitions[ORF2][ORF3]=AL;
11074 transitions[ORF3][U1]=AL;
11075 transitions[ORF3][ORF1]=AL;
11076 transitions[ORF3][ORF3_G1]=AL-SPLICE_PENALTY;
11079 transitions[ORF3_G1][ORF3_T2]=AL;
11080 transitions[ORF3_T2][ORF3_NC]=AL;
11081 transitions[ORF3_NC][ORF3_NC]=AL;
11082 transitions[ORF3_NC][ORF3_A3]=AL;
11083 transitions[ORF3_A3][ORF3_T4]=AL;
11084 transitions[ORF3_T4][ORF1]=AL-SPLICE_PENALTY;
11086 transitions[U1][U2]=AL;
11087 transitions[U1][U1_G1]=AL-SPLICE_PENALTY;
11088 transitions[U1_G1][U1_T2]=AL;
11089 transitions[U1_T2][U1_NC]=AL;
11090 transitions[U1_NC][U1_NC]=AL;
11091 transitions[U1_NC][U1_A3]=AL;
11092 transitions[U1_A3][U1_T4]=AL;
11093 transitions[U1_T4][U3]=AL-SPLICE_PENALTY;
11094 transitions[U3][U4]=AL;
11095 transitions[U4][ORF1]=AL;
11097 transitions[U2][U2_G1]=AL-SPLICE_PENALTY;
11098 transitions[U2_G1][U2_T2]=AL;
11099 transitions[U2_T2][U2_NC]=AL;
11100 transitions[U2_NC][U2_NC]=AL;
11101 transitions[U2_NC][U2_A3]=AL;
11102 transitions[U2_A3][U2_T4]=AL;
11103 transitions[U2_T4][U5]=AL-SPLICE_PENALTY;
11104 transitions[U5][ORF1]=AL;
/* NOTE(review): s3NC and END are used as indices here but their assignment is not visible in this view -- confirm they are set before this point. */
11106 transitions[ORF3][s3NC]=AL-FRAME_PENALTY;
11107 transitions[ORF3][END]=AL;
11108 transitions[s3NC][END]=AL;
/* DP tables, one row per position 0..l and one column per state. */
11111 v_tab=declare_int ( l+1,nstate);
11112 v_tab_p=declare_int ( l+1,nstate);
11113 last_coding=declare_int ( l+1,nstate);
11114 last_t4=declare_int ( l+1,nstate);
/* Constant offset of 200 subtracted from every position's potential. */
11116 for (a=0; a< l; a++) potential[a]-=200;
11118 codon=vcalloc ( 4, sizeof (char));
11119 best_pstate_p=START;
/* Main recurrence over positions 1..l. */
11122 for ( p=1; p<=l; p++)
/* Emission for each codon position: forbidden if the codon covering p translates to 'x' or would not fit in the sequence. */
11124 if (translate_dna_codon (seq+(p-1), 'x')=='x' || p>(l-2))orf1=F;
11125 else orf1=potential[p-1];
11127 if (p<2 || translate_dna_codon (seq+(p-2), 'x')=='x' || p>(l-1))orf2=F;
11128 else orf2=potential[p-1];
11131 if (p<3 || translate_dna_codon (seq+(p-3), 'x')=='x' || p>l)orf3=F;
11132 else orf3=potential[p-1];
/* NOTE(review): ncp is only assigned on this line when the best of the three is not F; what it holds otherwise depends on lines not visible here. */
11134 if ( best_int (3, 1, &a, orf1, orf2, orf3)!=F)ncp=-best_int (3, 1, &a, orf1, orf2, orf3);
11137 for ( state=0; state< nstate; state++)
/* e is the emission score of being in state at position p. */
11140 if ( state==ORF1)e=orf1;
11141 else if ( state==ORF2)e=orf2;
11142 else if ( state==ORF3)e=orf3;
11143 else if ( state>=U1 && state<=U3)
11147 else if ( state==U4)
11150 codon[1]=seq[last_coding[p-1][U3]-1];
11151 codon[0]=seq[last_coding[p-2][U1_T4]-1];
11152 if ( translate_dna_codon (codon, 'x')=='x')e=F;
11155 else if ( state==U5)
11158 codon[1]=seq[last_coding[p-1][U2_T4]-1];
/* NOTE(review): q receives a residue character from seq and is then used as a position (q-1) on the next line -- looks suspicious, confirm. */
11159 q=seq[last_coding[p-1][U2_T4]];
11160 codon[0]=seq[last_coding[q-1][U1]-1];
11161 if ( translate_dna_codon (codon, 'x')=='x')e=F;
/* Splice-site states require the literal dinucleotides "gu" (donor) and 'a','u' at the acceptor positions. */
11165 else if (state>=ORF3_G1 && state<=U2_G1)e=(p<l-1 && seq[p-1]=='g' && seq[p]=='u')?ncp:F;
11166 else if ( state>=ORF3_T2 && state<=U2_T2)
11168 e=(p>1 && seq[p-2]=='g' && seq[p-1]=='u')?ncp:F;
11170 else if ( state>=ORF3_A3 && state<=U2_A3)e=(seq[p-1]=='a')?ncp:F;
11171 else if ( state>=ORF3_T4 && state<=U2_T4)e=(seq[p-1]=='u')?ncp:F;
/* Best predecessor: F propagates if the emission, the transition or the predecessor score is forbidden. */
11174 for ( pstate=0; pstate<nstate; pstate++)
11176 if (e==F || transitions[pstate][state]==F || v_tab[p-1][pstate]==F)v=F;
11177 else v=e+transitions[pstate][state]+v_tab[p-1][pstate];
11179 if ( pstate==0 || v>best_pstate_v)
11180 {best_pstate_v=v;best_pstate_p=pstate;}
11182 v_tab[p][state]=best_pstate_v;
11183 v_tab_p[p][state]=best_pstate_p;
/* last_coding tracks the most recent coding position along the best path; the second test is the exact complement of the first. */
11185 if (!is_coding[state])last_coding[p][state]=last_coding[p-1][best_pstate_p];
11186 else if (is_coding[state])last_coding[p][state]=p;
11190 if (is_coding[state] && last_t4[p-1][best_pstate_p]==0)last_t4[p][state]=p;
11191 else last_t4[p][state]=last_t4[p-1][best_pstate_p];
11193 else if (is_t4[state])last_t4[p][state]=p;
11195 if (state==0 ||best_pstate_v>best_state_v ){best_state_p=state; best_state_v=best_pstate_v;}
/* Traceback from l to 1: coding residues are collected (in reverse) in seq2 and seq is re-labelled according to the state on the path. */
11199 for ( p=l; p>0; p--)
11201 if ( best_state_p>=ORF1 && best_state_p<=ORF3){seq2[tot++]=tolower (seq[p-1]);}
11202 else if ( best_state_p>=U1 && best_state_p<=U5){seq2[tot++]=tolower (seq[p-1]);}
11203 if (best_state_p==ORF1)seq[p-1]=toupper (seq[p-1]);
11204 else if (best_state_p==ORF2 || best_state_p==ORF3)seq[p-1]=tolower (seq[p-1]);
11205 else if ( best_state_p==ORF3_NC || best_state_p==U1_NC || best_state_p==U2_NC) seq[p-1]='.';
11206 else if ( best_state_p==U1 || best_state_p==U2 || best_state_p==U3 || best_state_p==U4 || best_state_p==U5) seq[p-1]=best_state_p-U1+'1';
11207 else seq[p-1]=toupper (seq[p-1]);
11208 best_state_p=v_tab_p[p][best_state_p];
/* Walks seq2 backwards; the loop body is not visible here (presumably it fills seq3 in forward order -- confirm). */
11211 for ( a=0, b=tot-1; b>=0; b--, a++)
11214 fprintf ( stderr, "\n%s\n", seq);
11215 fprintf ( stderr, "\nN coding=%d\n", tot);
/* Translate seq3 codon by codon; the process exits at the first codon that translates to 'x'. */
11216 for ( a=0; a< tot; a+=3)
11218 b=translate_dna_codon (seq3+a, 'x');
11219 fprintf ( stderr, "%c",b);
11220 if ( b=='x'){fprintf ( stderr, "\n");myexit (EXIT_SUCCESS);}
11223 fprintf ( stderr, "\n");
11224 myexit (EXIT_SUCCESS);
/*
 * dna_aln2_3frame_cdna_aln: build a 6-row alignment B from two sequences of
 * A, namely A->seq_al[l_s[0][0]] and A->seq_al[l_s[1][0]]. Rows 0..2 hold the
 * translation of the first sequence starting at offsets 0, 1 and 2; rows
 * 3..5 hold the same for the second sequence. Frames shorter than frame 0 of
 * the same sequence get one "x" appended. ns is not used in the visible
 * lines; the return statement is outside this view.
 */
11230 Alignment * dna_aln2_3frame_cdna_aln(Alignment *A,int *ns,int **l_s)
/* B is sized for 6 rows; the length passed is the sum of the two input lengths. */
11234 B=realloc_aln2 (NULL,6,strlen(A->seq_al[l_s[0][0]])+strlen(A->seq_al[l_s[1][0]]));
11235 for ( a=0; a< 3; a++)
11237 B->seq_al[a]=translate_dna_seq (A->seq_al[l_s[0][0]]+a, 0, 'o',B->seq_al[a]);
11238 B->seq_al[a+3]=translate_dna_seq (A->seq_al[l_s[1][0]]+a, 0, 'o',B->seq_al[a+3]);
/* Only a single "x" is appended, so a frame more than one residue shorter stays shorter. */
11240 for ( a=1; a<3; a++)
11242 if ( strlen(B->seq_al[a])<strlen(B->seq_al[0])) B->seq_al[a]=strcat ( B->seq_al[a], "x");
11243 if ( strlen(B->seq_al[a+3])<strlen(B->seq_al[3])) B->seq_al[a+3]=strcat ( B->seq_al[a+3], "x");
11247 B->len_aln=strlen (B->seq_al[0]);
11252 //For normal distribution scan
11254 #define PI 3.141592653589793238462643
// normal: value at x of the Gaussian probability density with the given mean
// and standard deviation std, i.e. 1/(std*sqrt(2*PI)) * exp(-(x-mean)^2/(2*std^2)).
// NOTE(review): std==0 makes both the scale factor and the exponent divide by zero.
11257 double normal(double x, double mean, double std)
11259 return (1/(std*sqrt(2.0*PI)))*exp((-0.5*(x-mean)*(x-mean))/(std*std));
// get_sim_aln_array_normal_distribution: fill an A->nseq x A->nseq integer
// matrix w with pairwise scores from generic_get_seq_sim_normal_distribution.
// Each pair a<b is computed once and stored symmetrically; the diagonal is not
// written in the visible lines. STD and CENTER are forwarded unchanged.
// The return statement is outside this view.
11262 int ** get_sim_aln_array_normal_distribution ( Alignment *A, char *mode, int *STD, int *CENTER)
11268 w=declare_int ( A->nseq, A->nseq);
11270 for ( a=0; a< A->nseq-1; a++)
11272 for ( b=a+1; b< A->nseq; b++)
// The cache argument is A->cdna_cache[0] when a cache exists, otherwise NULL.
11275 w[a][b]=w[b][a]=generic_get_seq_sim_normal_distribution ( A->seq_al[a], A->seq_al[b], (A->cdna_cache)?A->cdna_cache[0]:NULL, mode, STD, CENTER);
// generic_get_seq_sim_normal_distribution: thin wrapper that forwards to
// get_seq_sim_distribution with GAP_LIST as the set of ignored characters.
// The cache parameter is not used in the visible line.
11280 int generic_get_seq_sim_normal_distribution ( char *seq1, char *seq2, int*cache, char *mode, int *STD, int *CENTER)
11282 return get_seq_sim_distribution ( seq1,seq2,GAP_LIST, mode, STD, CENTER);
// get_seq_sim_distribution: position-weighted similarity between two aligned
// strings. in_mode is copied to a local buffer and searched for '_' (see the
// <mat>__<sim_mode> description below). A mode containing "idscore" is
// delegated to idscore_pairseq with the "blosum62mt" matrix. Otherwise the
// columns of string1 are walked and, in the visible lines, a column whose two
// residues fall in the same group (is_in_same_group_aa) adds
// normal(a, *CENTER, *STD) to sim rather than a flat 1. For sim_mode "sim1",
// "sim", or no sim_mode, the result is sim*MAXID, or 0 when pos0 is 0.
// NOTE(review): how r1, r2, pos0 are set, how p1/p2 gate the comparison, and
// the return statement are outside this view. STD and CENTER are dereferenced
// without a visible NULL check.
11285 int get_seq_sim_distribution ( char *string1, char *string2, char *ignore, char *in_mode, int *STD, int *CENTER)
11298 sprintf ( mode, "%s", in_mode);
11300 /*mode: <mat>__<sim_mode>
11301 mat: idscore to get the alignment done
11302 any legal cw matrix
11303 sim_mode: sim1->identities/matches
11304 sim2->identities/min len
11308 if ( (p=strstr (mode, "_"))!=NULL)
11315 if (strstr (mode, "idscore"))
11318 if (!mat) mat=read_matrice ("blosum62mt");
11319 return idscore_pairseq (string1, string2, -12, -1, mat,mode);
11322 len1=strlen (string1);
11323 for ( sim=pos0=0,a=0; a< len1; a++)
11327 p1=1-is_in_set (r1, ignore);
11328 p2=1-is_in_set (r2, ignore);
11332 if (is_in_same_group_aa(r1,r2,0, NULL, mode))
11334 sim += normal(a, *CENTER, *STD);
11343 if ( p==NULL || strm (p, "sim1") || strm (p, "sim"))
11345 r=(pos0==0)?0:(sim*MAXID);
11347 /* else if ( strm (p, "sim2"))
11349 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MIN(pos1,pos2);
11351 else if ( strm (p, "sim3"))
11353 r=(pos1==0 || pos2==0)?0:(sim*MAXID)/MAX(pos1,pos2);
11355 else if ( strm (p, "gap1"))
11357 r=(len1==0)?MAXID:(gap*MAXID)/len1;
11360 else if ( strm (p, "logid"))
11362 r=logid_score (pos0, sim);
// aln2clean_pw_aln: score every residue of A from all-against-all pairwise
// cleaning. For each pair of sequences (a, b), pw_aln2clean_aln_weight gives a
// per-column weight vector; C and T are per-residue accumulators (the lines
// that update them are not visible here). Each residue of the copy B is then
// rewritten from r=(C*10)/T according to F->mode:
//   NULL or "number", "unalign", "unalign2": residue replaced by the digit '0'+r
//   "lower": residue lower-cased when r<=F->t, upper-cased otherwise
// For "unalign"/"unalign2", A is passed through unalign_aln/unalign_aln_2
// with B and F->t, and B is rebuilt as a copy of the new A.
// The return statement is outside this view.
11370 Alignment *aln2clean_pw_aln (Alignment *A, OveralnP *F)// char *mode, int t, int f, int p1,int p2, int p3, char *fsa_mode)
// A threshold of 0 is replaced by 2.
11377 if (F->t==0)F->t=2;
11379 C=declare_int ( A->nseq, A->len_aln);
11380 T=declare_int ( A->nseq, A->len_aln);
11381 B=copy_aln (A, NULL);
11383 for (a=0; a< A->nseq;a++)
11385 for (b=0; b<A->nseq; b++)
11388 w=pw_aln2clean_aln_weight (A->seq_al[a], A->seq_al[b], 1,F);//f,p1, p2, p3, fsa_mode);
11389 for (c=0; c<A->len_aln; c++)
// Gap columns of sequence a are not counted.
11391 if (A->seq_al[a][c]=='-')continue;
11401 for (a=0; a<A->nseq; a++)
11403 for (b=0; b<A->len_aln; b++)
// Empty statement: positions with T==0 are deliberately left untouched, which also protects the division below.
11408 else if (T[a][b]==0);
// NOTE(review): if C[a][b] can equal T[a][b], r is 10 and '0'+r below is ':' rather than a digit -- confirm.
11412 r=(C[a][b]*10)/T[a][b];
11414 if (!F->mode || strm (F->mode, "number"))
11415 B->seq_al[a][b]='0'+r;
11416 else if ( F->mode && (strm (F->mode, "unalign") ||strm (F->mode, "unalign2")))
11417 B->seq_al[a][b]='0'+r;
11418 else if ( F->mode && strm (F->mode, "lower") )
11420 if (r<=F->t)B->seq_al[a][b]=tolower (B->seq_al[a][b]);
11421 else B->seq_al[a][b]=toupper (B->seq_al[a][b]);
11427 if (F->mode && strm (F->mode, "unalign"))
11429 A=unalign_aln (A, B, F->t);
// NOTE(review): B is overwritten here; no release of the previous B is visible in this view.
11431 B=copy_aln (A, NULL);
11433 else if (F->mode && strm (F->mode, "unalign2"))
11435 A=unalign_aln_2 (A, B, F->t);
11437 B=copy_aln (A, NULL);
11448 char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *F);
11449 char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *F);
// pw_aln2clean_aln_weight: per-column weights for a pair of aligned sequences.
// The two strings must have the same length (fatal error otherwise). They are
// copied into a 2-row buffer, processed by pw_aln2clean_pw_aln, and a vector
// of l+1 ints is filled: weight[a]=w when the cleaned row 0 is non-zero at
// column a or when either input residue is 'x'/'X'; other entries stay 0 from
// vcalloc. The return statement is outside this view (presumably weight, to
// be freed by the caller -- confirm).
11451 int * pw_aln2clean_aln_weight ( char *seq1, char *seq2, int w, OveralnP *F)
11457 if ( (l=strlen (seq1)) !=strlen (seq2))
11459 HERE ("\n%s\n%s\n", seq1, seq2);
11460 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Comparing unaligned sequences [FATAL:%s]", PROGRAM);
// Work on a private copy so the caller's sequences are not modified.
11464 aln=declare_char (2, l+1);
11465 sprintf ( aln[0], "%s", seq1);
11466 sprintf ( aln[1], "%s", seq2);
11469 aln=pw_aln2clean_pw_aln (aln, F);
11471 weight=vcalloc (l+1, sizeof (int));
11472 for (a=0; a<l; a++)
11474 if ( aln[0][a] || seq1[a]=='x' || seq1[a]=='X' || seq2[a]=='x' || seq2[a]=='X')weight[a]=w;
11476 free_char (aln, -1);
// pw_aln2clean_pw_aln: dispatch on F->model. "fsa2" selects
// pw_aln2clean_pw_aln_fsa2; "fsa1" and every other value select
// pw_aln2clean_pw_aln_fsa1 (the last two branches are identical).
11482 char **pw_aln2clean_pw_aln (char ** aln, OveralnP *F)
11485 if ( strm (F->model, "fsa2"))return pw_aln2clean_pw_aln_fsa2 (aln,F);
11486 else if ( strm (F->model, "fsa1"))return pw_aln2clean_pw_aln_fsa1 (aln,F);
11487 else return pw_aln2clean_pw_aln_fsa1 (aln,F);
// pw_aln2clean_pw_aln_fsa2: label the columns of a 2-row alignment with a
// small state machine solved by dynamic programming. Each column is first
// given an observation class in s[], emission scores come from mat[state][obs]
// and transition scores from tran[prev][cur], several of them derived from
// FO->p1..p4. The best path is traced back and both rows of aln are
// overwritten column by column with translate[state].
// Defaults applied to FO when a field is 0: p1=5, p2=15, p3=0, p4=100.
// The environment variable MID_EXON_FACTOR overrides the local default of 50.
// NOTE(review): many lines (assignment of l, ns, F, obs, p2, most of mat/tran,
// the return) are missing from this view -- confirm against the full source.
11490 char **pw_aln2clean_pw_aln_fsa2 (char ** aln, OveralnP *FO)
11492 int a, b, c, d, l, id;
11493 int c1, c2, e0, e1,tb, obs;
11495 int **mat, **tran, **p, **t, *s, *ids;
11497 int S, M1, M2, m1, m2,B1, B2,G1,G2, K;
11499 int MID_EXON_FACTOR=50;
11505 if ( getenv ("MID_EXON_FACTOR"))MID_EXON_FACTOR=atoi (getenv ("MID_EXON_FACTOR"));
// The substitution matrix is loaded once and kept in smat across calls.
11509 if (!smat)smat=read_matrice ( "blosum62mt");
11513 if ( l!=strlen (aln[1]))
11515 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings");
11520 s=vcalloc (l, sizeof (int));
11521 ids=vcalloc (l, sizeof (int));
11523 //record the id level of each position
11524 for (b=0; b<l; b++)
// NOTE(review): the inner "c2=" assignment is redundant, and after tolower() the comparisons with 'X' on the next line can never match (only 'x' could) -- confirm intent.
11526 c1=tolower(aln[0][b]);c2=tolower(c2=aln[1][b]);
11528 if (c1=='-' || c2=='-' || c1=='X' || c2=='X' || c1!=c2)ids[b]=0;
11532 //record the state of each position: M, m, T, gap
11533 for (id=0,b=0,a=0;a<l; a++)
11535 c1=aln[0][a];c2=aln[1][a];
// Class 3: lower-case residue in row 0; class 2: gap or 'X' in either row.
11536 if (islower (c1))s[a]=3;
11537 else if (c1=='-' || c2=='-' || c1=='X' || c2=='X')s[a]=2;
// smat is indexed by letter offset from 'A'; class 1 is a pair scoring at least 2.
11541 sc=smat[c1-'A'][c2-'A'];
11542 if (sc>=2){id++; s[a]=1;}
11550 vfree(s);vfree (ids);
11556 FO->p1=(FO->p1==0)?5:FO->p1;
11557 FO->p2=(FO->p2==0)?15:FO->p2;
11558 FO->p3=(FO->p3==0)?0:FO->p3;
11559 FO->p4=(FO->p4==0)?100:FO->p4;
// T1 is the percentage id/b. NOTE(review): b==0 would divide by zero -- whether that is excluded earlier is not visible here.
11562 T1=100*(float)id/(float)b;
11563 T2=(FO->f==0)?30:T1*(float)((float)FO->f/(float)100);
// State indices are handed out sequentially from ns.
11573 M1=ns++;//1 matched aligned
11574 m1=ns++;//2 mmatched aligned
11575 M2=ns++;//3 matched unaligned
11576 m2=ns++;//4 mmatched unaligned
11577 B1=ns++;//5 transition aligned
11578 B2=ns++;//6 transition unaligned
11580 mat=declare_int (ns, 4);
11581 tran=declare_int (ns, ns);
11582 p=declare_int (l+1, ns);
11583 t=declare_int (l+1, ns);
11586 mat[M1][0]=F; //non id
11588 mat[M1][2]=0; //gap
11589 mat[M1][3]=F; //transition
11616 //transition values
11626 tran[M1][m2]=-FO->p4;
11627 tran[M1][M1]=+FO->p2;
11631 tran[M1][B2]=-FO->p1;
11634 tran[M2][m2]=+FO->p3;
11648 tran[m1][B2]=-FO->p1;
11652 tran[m2][M1]= -FO->p4;
11653 tran[m2][M2]= +FO->p3;
11666 tran[B2][m1]= -FO->p1;
11668 tran[B2][M1]= -FO->p1;
11674 translate=vcalloc (ns, sizeof (int));
// Forward pass: p[a][cs] is the best score ending in state cs at column a, t[a][cs] the predecessor achieving it (ties go to the later ps).
11682 for (a=1;a<=l; a++)
11686 for (cs=0; cs<ns; cs++)
11688 for (ps=0; ps<ns; ps++)
11690 c=p[a-1][ps]+mat[cs][obs]+tran[ps][cs];
11691 if (ps==0 || c>=best){t[a][cs]=ps;best=p[a][cs]=c;}
// Pick the best final state tb.
11698 for (a=0; a<ns; a++)
11700 if (a==0 || p[l][a]>=best){tb=a;best=p[l][a];}
// Traceback: both rows receive the label of the state on the best path.
11703 for (a=l; a>0; a--)
11709 aln[0][p2]=aln[1][p2]=translate[tb];
11717 free_int (mat, -1);
11718 free_int (tran, -1);
// pw_aln2clean_pw_aln_fsa1: four-state variant of the pairwise cleaning
// automaton (states M1, m1, M2, m2). Columns containing a gap or 'x'/'X' in
// either row are skipped when building the observation table s, so the
// dynamic programming runs over the ungapped columns only (l is reset to that
// count). Emissions come from mat[state][obs], transitions from tran, with
// FO->p1..p3 supplying some of the transition scores. Both rows of aln are
// overwritten along the best path with translate[state].
// Defaults applied to FO when a field is 0: f=30, p1=90, p2=15, p3=0.
// NOTE(review): many lines (initial l, ns, obs, p2, most of mat/tran, the
// return) are missing from this view -- confirm against the full source.
11722 char **pw_aln2clean_pw_aln_fsa1 (char ** aln, OveralnP *FO)
11724 int a, b, c, d, l, id;
11725 int c1, c2, e0, e1,tb, obs;
11727 int **mat, **tran, **p, **t, **s;
11729 int S, M1, M2, m1, m2, K;
// The substitution matrix is loaded once and kept in smat across calls.
11736 if (!smat)smat=read_matrice ( "blosum62mt");
11740 if ( l!=strlen (aln[1]))
11742 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: unaligned strings");
11746 s=declare_int (l+1, 2);
11747 for (id=0,b=0,a=0;a<l; a++)
11749 c1=aln[0][a];c2=aln[1][a];
11751 if ( c1=='-' || c2=='-' || c1=='x' || c1=='X' || c2=='x' || c2=='X')continue;
// smat is indexed by letter offset from 'A'; a pair scoring at least 2 counts as a match.
11755 sc=smat[c1-'A'][c2-'A'];
11756 if (sc>=2){id++; s[b][0]=1;}
11768 FO->f=(FO->f==0)?30:FO->f;
11769 FO->p1=(FO->p1==0)?90:FO->p1;
11770 FO->p2=(FO->p2==0)?15:FO->p2;
11771 FO->p3=(FO->p3==0)?0:FO->p3;
11773 l=b;//length of the ungapped aln
// NOTE(review): b==0 (no ungapped column) would divide by zero -- whether that is excluded earlier is not visible here.
11774 T1=100*(float)id/(float)b;
11785 M1=ns++;//1 matched aligned
11786 m1=ns++;//2 mmatched aligned
11787 M2=ns++;//3 matched unaligned
11788 m2=ns++;//4 mmatched unaligned
11790 mat=declare_int (ns, 2);
11791 tran=declare_int (ns, ns);
11792 p=declare_int (l+1, ns);
11793 t=declare_int (l+1, ns);
11816 tran[M1][m2]=-FO->p1;// -P;
11817 tran[M1][M1]=+FO->p2;
11822 tran[M2][m2]=+FO->p3;
11835 tran[m2][M1]=-FO->p1;
11836 tran[m2][M2]=+FO->p3;
11839 translate=vcalloc (ns, sizeof (int));
// Forward pass: p[a][cs] is the best score ending in state cs at column a, t[a][cs] the predecessor achieving it (ties go to the later ps).
11847 for (a=1;a<=l; a++)
11851 for (cs=0; cs<ns; cs++)
11853 for (ps=0; ps<ns; ps++)
11855 c=p[a-1][ps]+mat[cs][obs]+tran[ps][cs];
11856 if (ps==0 || c>=best){t[a][cs]=ps;best=p[a][cs]=c;}
// Pick the best final state tb.
11863 for (a=0; a<ns; a++)
11865 if (a==0 || p[l][a]>=best){tb=a;best=p[l][a];}
// Traceback: both rows receive the label of the state on the best path.
11867 for (a=l; a>0; a--)
11870 aln[0][p2]=aln[1][p2]=translate[tb];
11879 free_int (mat, -1);
11880 free_int (tran, -1);
// analyze_overaln: compare a filtered copy of iA with the cleaned version of
// a copy of iB. An OveralnP parameter block F is allocated and its mode field
// filled from mode (the other fields are set on lines not visible here). The
// copy of iA is passed through aln2gap_cache/filter_aln_upper_lower, the copy
// of iB through aln2clean_pw_aln, and the two results go to aln2pred.
// The inputs themselves are only copied in the visible lines.
// The return statement is outside this view (presumably r -- confirm).
11884 float* analyze_overaln ( Alignment *iA, Alignment *iB, char *mode, int filter, int f, int p1,int p2, int p3)
11890 F=vcalloc (1, sizeof (OveralnP));
11896 sprintf (F->mode, "%s", mode);
11900 A=copy_aln (iA, NULL);
11901 B=copy_aln (iB, NULL);
11903 C=aln2gap_cache (A,0);
11904 A=filter_aln_upper_lower (A, C, 0, 0);
11905 D=aln2clean_pw_aln (B, F);
11906 r=aln2pred (A,D,mode);
// aln2pred: treat the symbols of A as reference labels and those of B as
// predictions and compute per-symbol confusion statistics. For every sequence
// name present in both alignments the ungapped strings are compared position
// by position and r[n][alp[c1]][alp[c2]] is incremented. For each symbol s the
// counts are folded into tp/fn/fp/tn and handed to rates2sensitivity.
// Substrings recognised in mode (from the visible lines):
//   "case"      both alignments are first recoded with aln2case_aln(.,"u","l")
//   "printaln"  builds a marked-up copy of B (details partly outside this view)
//   "printstat" per-sequence and total statistics are printed to stdout
//   "_<symbol>_" tested against the string built with sprintf(type, ...)
// A 3-float array is allocated for the result. The function keeps its work
// buffers in static pointers that are released and re-allocated on every call.
// NOTE(review): several declarations and loop bodies are missing from this
// view -- confirm against the full source.
11913 float* aln2pred ( Alignment *A, Alignment*B, char *mode)
11915 int a, b, c, d, i, l, salp, s, n;
11916 static char **list, *buf1, *buf2, *alp, *alp_lu;
11919 int fp, fn, tn, tp;
11920 int tfp, tfn, ttn, ttp;
11921 float sp, sn, sen2, best, result;
11925 fresult=vcalloc ( 3, sizeof (float));
11927 if ( mode && strstr (mode, "case"))
11929 A=aln2case_aln (A,"u","l");
11930 B=aln2case_aln (B,"u","l");
11933 if (mode && strstr (mode, "printaln"))
11938 C=copy_aln (B, NULL);
11939 for (a=0; a<B->nseq; a++)
11941 i=name_is_in_list (C->name[a], S->name, S->nseq, 100);
11943 for (b=0; b<C->len_aln; b++) C->seq_al[a][b]='-';
11945 for (d=0,b=0; b<C->len_aln; b++)
11947 if ( !is_gap (C->seq_al[a][b]))
11949 if (C->seq_al[a][b]==S->seq[i][d])C->seq_al[a][b]=toupper(C->seq_al[a][b]);
// alp maps a character code to a symbol index, alp_lu is the reverse lookup; both are rebuilt on every call.
11957 vfree (alp);vfree (alp_lu);
11958 alp=vcalloc ( 256, sizeof (char));
11959 alp_lu=vcalloc ( 256, sizeof (char));
11961 for (c=0; c<2; c++)
11965 for (salp=0,a=0; a<AL->nseq; a++)
11967 for (b=0; b<AL->len_aln; b++)
// NOTE(review): c is also the counter of the enclosing "for (c=0; c<2; c++)" loop; unless a nested declaration shadows it on a line not visible here, assigning a residue to c ends that loop early -- confirm.
11969 c=AL->seq_al[a][b];
11970 if (!is_gap(c) && !alp[c])
11980 vfree (buf1); vfree(buf2);
11981 buf1=vcalloc ( A->len_aln+1, sizeof (char));
11982 buf2=vcalloc ( B->len_aln+1, sizeof (char));
// r is a 3-dimensional counter: sequence x reference symbol x predicted symbol.
11984 free_arrayN ((void **)r, 3);
11985 r=declare_arrayN(3, sizeof (int),A->nseq,salp+1,salp+1);
11986 free_char ( list, -1);
11987 list=declare_char ( A->nseq, 100);
11988 for (n=0,a=0; a< A->nseq; a++)
11990 for ( b=0; b<B->nseq; b++)
11992 if ( strm (A->name[a], B->name[b]))
11994 sprintf ( buf1, "%s", A->seq_al[a]);
11995 sprintf ( buf2, "%s", B->seq_al[b]);
11996 ungap (buf1); ungap (buf2);
// Pairs whose ungapped lengths differ are skipped.
11997 if ((l=strlen (buf1))!=strlen (buf2))continue;
12000 sprintf ( list[n], "%s", A->name[a]);
12001 for (c=0; c<l; c++)
12006 r[n][alp[c1]][alp[c2]]++;
12016 for ( s=1; s<=salp; s++)
12019 sprintf (type, "_%c_", alp_lu[s]);
12021 for (a=0; a<n; a++)
12024 for (b=1; b<=salp; b++)
12026 for (c=1; c<=salp; c++)
12028 if ( b==s && c==s) tp+=r[a][b][c];
12029 else if ( b==s && c!=s)fn+=r[a][b][c];
12030 else if ( b!=s && c==s)fp+=r[a][b][c];
// NOTE(review): the second operand repeats b!=s (probably meant c!=s); the outcome is the same because the earlier branches already cover c==s.
12031 else if ( b!=s && b!=s)tn+=r[a][b][c];
12040 rates2sensitivity (tp, tn, fp, fn, &sp, &sn, &sen2, &best);
12041 if ( mode && strstr (mode, "printstat"))fprintf ( stdout, ">%s S=%c sp=%6.2f sn=%6.2f sen2=%6.2f best=%6.2f\n", list[a],alp_lu[s],sp, sn, sen2, best);
12044 rates2sensitivity (ttp, ttn, tfp, tfn, &sp, &sn, &sen2, &best);
12045 if (mode && strstr (mode, "printstat"))fprintf ( stdout, ">TOT S=%c sp=%6.2f sn=%6.2f re=%6.2f best=%6.2f\n", alp_lu[s],sp, sn, sen2, best);
12047 if ( mode && strstr (mode, type))
// mark_exon_boundaries: for every sequence of A that is also named in E, take
// the E sequence, drop the marker characters 'o', 'b' and 'j' while
// upper-casing the rest into buf2, lower-case the residues adjacent to the
// markers, and write buf2 back over the non-gap positions of the A sequence.
// Sequences of A that are not found in E are left untouched.
// NOTE(review): the assignment of l, the body that advances c in the second
// loop, the buffer release and the return are outside this view.
12057 Alignment * mark_exon_boundaries (Alignment *A, Alignment *E)
12062 buf2=vcalloc ( E->len_aln+1, sizeof (char));
12063 buf =vcalloc ( E->len_aln+1, sizeof (char));
12065 for (a=0; a< A->nseq; a++)
12067 i=name_is_in_list (A->name[a], E->name, E->nseq, 100);
12068 if ( i==-1) continue;
12069 sprintf (buf, "%s", E->seq_al[i]);
12073 for (c=0, b=0; b<l; b++)if (buf[b]!='o' && buf[b]!='b' && buf[b]!='j')buf2[c++]=toupper(buf[b]);
12076 //lowercase the boundaries of buf2;
12077 for ( c=0,b=0; b<l; b++)
12079 //ENSEMBL: o: 0, b:1 j:2
// NOTE(review): && binds tighter than ||, so the c>=1 guard only protects the 'o' case; a 'b' marker with c==0 would write buf2[-1] -- confirm whether (buf[b]=='b' || buf[b]=='o') && c>=1 was intended.
12080 if (buf[b]=='b' || buf[b]=='o' && c>=1)buf2[c-1]=tolower(buf2[c-1]);
// NOTE(review): the guard is c<l but the index used is c+1 -- confirm the bound.
12081 else if (buf[b]=='j' &&c<l)buf2[c+1]=tolower(buf2[c+1]);
12085 for (c=0,b=0; b<A->len_aln; b++)
12087 if (!is_gap(A->seq_al[a][b]))
12089 A->seq_al[a][b]=buf2[c++];
17980 /******************************COPYRIGHT NOTICE*******************************/
17981 /*© Centro de Regulacio Genomica */
17983 /*Cedric Notredame */
17984 /*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
17985 /*All rights reserved.*/
17986 /*This file is part of T-COFFEE.*/
17988 /* T-COFFEE is free software; you can redistribute it and/or modify*/
17989 /* it under the terms of the GNU General Public License as published by*/
17990 /* the Free Software Foundation; either version 2 of the License, or*/
17991 /* (at your option) any later version.*/
17993 /* T-COFFEE is distributed in the hope that it will be useful,*/
17994 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
17995 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
17996 /* GNU General Public License for more details.*/
17998 /* You should have received a copy of the GNU General Public License*/
17999 /* along with T-COFFEE; if not, write to the Free Software*/
18000 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
18001 /*............................................... |*/
18002 /* If you need some more information*/
18003 /* cedric.notredame@europe.com*/
18004 /*............................................... |*/
18008 /******************************COPYRIGHT NOTICE*******************************/