7 #include "io_lib_header.h"
8 #include "util_lib_header.h"
9 #include "dp_lib_header.h"
10 #include "define_header.h"
20 static NT_node compute_fj_tree (NT_node T, Alignment *A, int limit, char *mode);
21 static NT_node compute_cw_tree (Alignment *A);
22 static NT_node compute_std_tree (Alignment *A, int n, char **arg);
23 static NT_node tree2fj_tree (NT_node T);
24 int tree_contains_duplicates (NT_node T);
25 int display_tree_duplicates (NT_node T);
27 static int compare_node1 ( int *b1, int *b2, int n);
28 static int compare_node2 ( int *b1, int *b2, int n);
29 static int find_seq_chain (Alignment *A, int **sim,int *used,int seq0,int seq1, int seq2,int chain_length, int limit, int max_chain, int *nseq);
30 int new_display_tree (NT_node T, int n);
31 NT_node display_code (NT_node T, int nseq, FILE *fp);
32 NT_node display_dist (NT_node T, int n, FILE *fp);
33 /*********************************************************************/
35 /* dpa_tree_manipulation */
37 /*********************************************************************/
38 static NT_node code_dpa_tree ( NT_node T, int **D);
39 NT_node collapse_sub_tree ( NT_node T,int nseq, int *list, char *new_name);
40 NT_node seq2dpa_tree (Sequence *S, char *mode)
45 CL=declare_constraint_list_simple (S);
46 CL->local_stderr=NULL;
49 CL->DM=cl2distance_matrix (CL,NOALN,(mode==NULL)?"ktup":mode, NULL, 0);
51 T=int_dist2nj_tree ( (CL->DM)->similarity_matrix, S->name, S->nseq, vtmpnam (NULL));
54 Tree=recode_tree (Tree, S);
55 Tree=reset_dist_tree (Tree, -1);
57 Tree=code_dpa_tree (Tree, (CL->DM)->similarity_matrix);
58 free_distance_matrix (CL->DM);
62 NT_node tree2dpa_tree (NT_node T, Alignment *A, char *mode)
64 /*This Function sets the branches with Length values used by DP*/
65 /*The tree must be rooted*/
71 T=reset_dist_tree (T, -1);
72 D=get_sim_aln_array (A,mode);
74 T=code_dpa_tree (T, D);
78 NT_node code_dpa_tree ( NT_node T, int **D)
93 nl=(T->left)->nseq;ll=(T->left)->lseq;
94 nr=(T->right)->nseq;lr=(T->right)->lseq;
96 for (tot=0,n=0, a=0; a< nl; a++)
97 for ( b=0; b< nr; b++, n++)
100 min=MIN(min,(D[ll[a]][lr[b]]));
102 /* T->dist=(mode==AVERAGE)?(tot/n):min:;*/
103 T->dist=(n>0)?tot/n:0;
105 code_dpa_tree ( T->right, D);
106 code_dpa_tree ( T->left, D);
110 static int group_number;
111 char *tree2Ngroup (Alignment *A, NT_node T, int max_n, char *fname, char *mat)
113 double top, bot, mid, pmid;
123 list=declare_char ( 2, 100);
124 sprintf (list[0], "%s",mat);
126 fprintf ( stderr, "\nCompute Phylogenetic tree [Matrix=%s]", mat);
127 T=compute_std_tree(A,1, list);
128 fprintf ( stderr, "\nCompute dpa tree");
129 T=tree2dpa_tree (T,A, mat);
137 n=tree2group_file (T,S,0, max_n, fname);
138 fprintf ( stderr, "\n#TrimTC: Split in %d Groups at a minimum of %d%% ID\n",n, (int)max_n);
144 if ( max_n>S->nseq)max_n=S->nseq;
148 n=tree2group_file(T, S,0, (int)mid,fname);
149 mid=dichotomy((double)n, (double)max_n,(pmid=mid), &bot, &top);
150 while (n!=max_n && (int)pmid!=(int)mid)
152 n=tree2group_file(T, S,0, (int)mid, fname);
153 mid=dichotomy((double)n, (double)max_n,(pmid=mid), &bot, &top);
155 fprintf ( stderr, "\nDONE2");
156 fprintf ( stderr, "\n#TrimTC: Split in %d Groups at a minimum of %d%% ID\n",n, (int)mid);
161 static int group_number;
162 int tree2group_file ( NT_node T,Sequence *S, int maxnseq, int minsim, char *name)
167 fp=vfopen (name, "w");
168 vfclose (tree2group (T, S,maxnseq,minsim, "tree2ngroup",fp));
170 return count_n_line_in_file(name);
174 FILE * tree2group ( NT_node T,Sequence *S, int maxnseq, int minsim,char *name, FILE *fp)
181 m=(maxnseq==0)?S->nseq:maxnseq;
186 if ( T->nseq<=m && T->dist>=d)
189 fprintf ( fp, ">%s_%d ", (name)?name:"", ++group_number);
190 for ( a=0; a< T->nseq; a++)
191 fprintf ( fp, "%s ", S->name[T->lseq[a]]);
193 if (!T->parent)group_number=0;
198 fp=tree2group (T->right, S, maxnseq, minsim, name,fp);
199 fp=tree2group (T->left, S, maxnseq, minsim, name,fp);
200 if (!T->parent)group_number=0;
208 NT_node tree2collapsed_tree (NT_node T, int n, char **string)
217 list=vcalloc (A->nseq, sizeof (char***));
218 nlist=vcalloc (A->nseq, sizeof (int));
225 for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
226 buf=vcalloc ( 2*n+l+1, sizeof (char));
227 for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
228 list[0]=string2list (buf);
231 else if ( file_exists (NULL,string[0]))
233 list=read_group (string[0]);
237 fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
238 myexit (EXIT_FAILURE);
247 for (b=0; b<A->nseq; b++)nlist[b]=0;
250 i=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
253 T=collapse_sub_tree ( T,A->nseq,nlist,list[a][1]);
254 free_char (list[a], -1);
261 NT_node collapse_sub_tree ( NT_node T,int nseq, int *list, char *new_name)
269 while (a<nseq && list[a]==T->lseq2[a]){a++;}
272 sprintf ( T->name, "%s", new_name);
274 T->left=T->right=NULL;
279 collapse_sub_tree (T->right, nseq, list, new_name);
280 collapse_sub_tree (T->left, nseq, list, new_name);
286 /*********************************************************************/
291 /*********************************************************************/
292 NT_node remove_leaf ( NT_node T);
293 NT_node prune_root (NT_node T);
294 NT_node main_prune_tree ( NT_node T, Sequence *S)
296 T=prune_tree ( T, S);
300 NT_node prune_tree ( NT_node T, Sequence *S)
305 if (T->leaf && T->isseq && name_is_in_list (T->name,S->name, S->nseq, 100)==-1)
313 for (a=0; a< S->nseq; a++)
315 HERE ("prune pb ---%s", S->name[a]);
319 C=(P->right==T)?P->left:P->right;
320 PP=C->parent=P->parent;
322 if (PP && PP->right==P)PP->right=C;
323 else if (PP)PP->left=C;
326 if (T==P->right)P->right=NULL;
334 prune_tree (T->left, S);
335 prune_tree (T->right, S);
337 return prune_root(T);
340 NT_node prune_root (NT_node T)
342 //This function prunes the root if needed (and frees it).
343 if (T->parent)return T;
345 if (!T->right && T->left)
347 return prune_root (T->left);
349 else if (T->right && !T->left)
352 return prune_root (T->right);
359 /*********************************************************************/
361 /* tree comparison */
364 /*********************************************************************/
365 int main_compare_cog_tree (NT_node T1, char *cogfile)
368 int a, nbac, n=0, p, c, b;
371 array=file2list(cogfile, ";\n");
372 nbac=atoi(array[0][0])-2;
374 A=declare_aln2 (nbac+1, 10);
375 for (a=0; a<nbac; a++)
377 sprintf ( A->name[a], "%s", array[0][a+2]);
379 A->seq_al[a][1]='\0';
382 sprintf ( A->name[nbac], "cons");
389 while (array[n]!=NULL)
391 for (b=0; b<nbac; b++)
393 p=atoi (array[1][b+2]);p=(p==0)?'O':'I';
394 c=atoi (array[n][b+2]);c=(c==0)?'O':'I';
397 A->seq_al[b][2]='\0';
400 sprintf (A->file[0], "%s", array[n][1]);
402 main_compare_aln_tree (T1, A, stdout);
409 int main_compare_aln_tree (NT_node T1, Alignment *A, FILE *fp)
413 fprintf ( fp, "\nTOT_CLASH COG %s N %d", A->file[0], compare_aln_tree (T1, A, &n, fp));
418 int compare_aln_tree (NT_node T, Alignment *A, int *n, FILE *fp)
423 i=name_is_in_list (T->name, A->name, A->nseq, 100);
424 T->seqal=A->seq_al[i];
430 if (!(T->left )->seqal)compare_aln_tree (T->left, A,n, fp);
431 if (!(T->right)->seqal)compare_aln_tree (T->right, A,n, fp);
433 seq1=(T->left)->seqal;
434 seq2=(T->right)->seqal;
435 (T->left)->seqal=(T->right)->seqal=NULL;
438 if (strm (seq1, seq2))
446 if (seq1[0]!=seq2[0] && seq1[1]!=seq2[1])
448 fprintf ( fp, "\nNODE_CLASH: COG %s (%s,%s):(",A->file[0],seq1,seq2 );
449 display_leaf_below_node (T->left, fp);
450 fprintf ( fp, ");(");
451 display_leaf_below_node (T->right, fp);
460 //**********************************************************************
462 int compare_split (int *s1, int *s2, int l);
463 int get_split_size (int *s, int l);
465 int main_compare_splits ( NT_node T1, NT_node T2, char *mode,FILE *fp)
467 Sequence *S1, *S2, *S;
474 if ( tree_contains_duplicates (T1))
476 display_tree_duplicates (T1);
477 printf_exit (EXIT_FAILURE, stderr, "\nFirst Tree Contains Duplicated Sequences [main_compare_trees][FATAL:%s]", PROGRAM);
480 else if ( tree_contains_duplicates (T2))
482 display_tree_duplicates (T2);
483 printf_exit (EXIT_FAILURE, stderr, "\nSecond Tree Contains Duplicated Sequences [main_compare_trees]");
487 //Identify the commom Sequence Set
488 S1=tree2seq(T1, NULL);
491 S2=tree2seq(T2, NULL);
494 S=trim_seq ( S1, S2);
496 //Prune the trees and recode the subtree list
497 T1=prune_tree (T1, S);
498 T1=recode_tree(T1, S);
500 T2=prune_tree (T2, S);
501 T2=recode_tree(T2, S);
503 sl1=declare_int (10*S->nseq, S->nseq);
504 sl2=declare_int (10*S->nseq, S->nseq);
508 tree2split_list (T1, S->nseq, sl1, &n1);
509 tree2split_list (T2, S->nseq, sl2, &n2);
515 n=get_split_size (sl1[a], S->nseq);
516 for (best=0,b=0; b<n2; b++)
518 s=compare_split (sl1[a], sl2[b], S->nseq);
521 fprintf ( fp, "\n%4d %4d ", MIN(n,(S->nseq)), best);
522 for (b=0; b<S->nseq; b++)fprintf ( fp, "%d", sl1[a][b]);
525 free_sequence (S, -1);
526 free_sequence (S1, -1);
527 free_sequence (S2, -1);
531 int compare_split (int *s1, int *s2, int l)
533 int n1, n2, score1, score2, a;
534 n1=get_split_size (s1, l);
535 n2=get_split_size (s2, l);
537 for (score1=0,a=0; a< l; a++)
539 score1+=(s1[a]==1 && s2[a]==1)?1:0;
541 score1=(score1*200)/(n1+n2);
543 for ( score2=0, a=0; a<l; a++)
545 score2+=(s1[a]==0 && s2[a]==1)?1:0;
547 score2=(score2*200)/((l-n1)+n2);
548 return MAX(score1, score2);
550 int get_split_size (int *s, int l)
553 for (a=b=0; a<l; a++)b+=s[a];
556 //**********************************************************************
559 // TREEE COMPARISON FUNCTIONS
563 //////////////////////////////////////////////////////////////////////
566 void normalizeScore(float *score, int len)
569 float SCORE_MIN = FLT_MAX;
570 float SCORE_MAX = FLT_MIN;
571 for(i = 0; i < len; i++)
573 if(score[i] < SCORE_MIN)
574 SCORE_MIN = score[i];
575 if(score[i] > SCORE_MAX)
576 SCORE_MAX = score[i];
578 for(i = 0; i < len; i++)
579 score[i] = (9*(score[i]-SCORE_MIN)/(SCORE_MAX-SCORE_MIN));
582 int new_compare_trees ( NT_node T1, NT_node T2, int nseq, Tree_sim *TS);
583 NT_node new_search_split (NT_node T, NT_node B, int nseq);
584 int new_compare_split ( int *b1, int *b2, int n);
585 Tree_sim* tree_scan_pos (Alignment *A, int start, int end, char *ptree, NT_node RT);
586 Tree_sim* tree_scan_pos_woble (Alignment *A, int center, int max, char *ptree, NT_node RT, int *br, int *bl );
587 Tree_sim* tree_scan_pair_pos (Alignment *A, int start, int end, int start2, int end2,char *ptree, NT_node RT);
588 Tree_sim* tree_scan_multiple_pos (int *poslist, int *wlist,int nl, Alignment *A, char *ptree, NT_node RT);
589 NT_node aln2std_tree(Alignment *A, int ipara1, int ipara2, char *mode);
591 NT_node tree_scan (Alignment *A,NT_node RT, char *pscan, char *ptree)
593 int l, a,ax, c, cx, b;
599 char *pcFileName = A->file[0];
600 char prefix[200] ={0};
601 int len = (strrchr(pcFileName,'.')?strrchr(pcFileName,'.')-pcFileName:strlen(pcFileName));
602 strncpy(prefix, pcFileName, len);
605 char out_format[100];
607 char *score_csv_file = vcalloc(200, sizeof (char));
608 char *score_html_file = vcalloc(200, sizeof (char));
609 char *hit_matrix_file = vcalloc(200, sizeof (char));
610 char *hit_html_file = vcalloc(200, sizeof (char));
611 char *tree_file = vcalloc(200, sizeof (char));
613 sprintf(score_csv_file, "%s%s", prefix, ".score_csv");
614 sprintf(score_html_file, "%s%s", prefix, ".ts_html");
615 sprintf(hit_matrix_file, "%s%s", prefix, ".hit_matrix");
616 sprintf(hit_html_file, "%s%s", prefix, ".hit_html");
617 sprintf(tree_file, "%s%s", prefix, ".trees_txt");
619 if ( pscan && strstr ( pscan, "help"))
621 fprintf ( stdout, "\n+tree_scan| _W_ : Window size for the tree computation|STD size in norscan mode");
622 fprintf ( stdout, "\n+tree_scan| _MODE_ : Mode for the number of windows (single, double, list, scan, pairscan, norscan, hit, norhit)");
623 fprintf ( stdout, "\n+tree_scan| _MINW_ : Minimum Window size when using the scan mode (4)");
624 fprintf ( stdout, "\n+tree_scan| _OUTTREE_ : specify the format of outputing tree in every position (default: not ouput)");
628 strget_param (pscan, "_W_", "5", "%d",&w);
629 strget_param (pscan, "_MODE_", "single", "%s",mode);
630 strget_param (pscan, "_MINW_", "1", "%d",&start);
631 strget_param (pscan, "_POSFILE_", "NO", "%s", posfile);
632 strget_param (pscan, "_OUTTREE_", "", "%s", &out_format);
634 if(strlen(out_format) > 1)
637 l=intlen (A->len_aln);
639 poslist=vcalloc ( A->len_aln, sizeof (int));
641 fascore = vcalloc(A->len_aln, sizeof (float));
643 if ( strm (posfile, "NO"))
646 for ( a=0; a< A->len_aln; a++)poslist[nl++]=a+1;
651 p=file2pos_list (A,posfile);
652 poslist=pos2list (p, A->len_aln, &nl);
653 for (a=0; a<nl; a++)poslist[a]++;
658 NT_node *TreeArray = vcalloc(nl, sizeof (NT_node));
660 if ( strm (mode, "woble"))
662 for (ax=0; ax<nl; ax++)
668 TS=tree_scan_pos_woble (A,a,w,ptree, RT, &left, &right);
669 fprintf ( stdout, "P: %*d I: %*d %*d SIM: %6.2f\n", l,a,l,left,l,right,TS->uw);
673 else if ( strm (mode, "single"))
675 for (ax=0; ax<nl; ax++)
685 if (pstart<1 || pstart>A->len_aln)continue;
686 if (pend<1 || pend>A->len_aln)continue;
687 TS=tree_scan_pos (A, pstart,pend, ptree, RT);
688 fprintf ( stdout, "P: %*d I: %*d %*d SIM: %6.2f L: %2d\n", l,a,l,pstart,l,pend,TS->uw, (w*2)+1);
692 else if (strm (mode, "scan")||strm (mode, "hit"))
695 fp_ts=vfopen (score_csv_file, "w");
696 fprintf ( fp_ts, "Position,Win_Beg,Win_End,Similarity,Win_Len\n");
697 for ( ax=0; ax<nl; ax++)
700 int best_pos=0, best_w=0, best_start, best_end;
703 best_pos = best_start = best_end = a;
704 for (b=start; b<=w; b++)
711 if (pstart<1 || pstart>A->len_aln)continue;
712 if (pend<1 || pend>A->len_aln)continue;
713 TS=tree_scan_pos (A, pstart,pend, ptree, RT);
714 if (TS->uw>=best_score)
715 {best_score=TS->uw;best_w=b;best_start=pstart; best_end=pend;}
718 fprintf (fp_ts, "%*d,%*d,%*d,%6.2f,%2d\n", l,best_pos, l,best_start, l,best_end, best_score,(best_w*2)+1);
719 fascore[ax]=(float)best_score;
720 if(strlen(out_format) > 1)
721 vfclose (print_tree (aln2std_tree(A, best_start, best_end, mode), out_format, vfopen (tree_file, "a+")));
722 if(strm (mode, "hit"))
723 TreeArray[ax] = aln2std_tree(A, best_start, best_end, mode);
727 //tree scan by using normal distribution window
729 //generate hit matrix
730 else if ( strm (mode, "norscan")||strm (mode, "norhit"))
733 ptree=vcalloc(100, sizeof (char));
734 fp_ts=vfopen (score_csv_file, "w");
735 fprintf ( fp_ts, "Position,Similarity,STD_Len\n");
736 for ( ax=0; ax<nl; ax++)
738 float best_score=DBL_MIN;
739 int best_STD = start;
741 for (b=start; b<=w; b++)
744 sprintf ( ptree, "+aln2tree _COMPARE_nordisaln__STD_%d__CENTER_%d_", b, a); //should be used a or ax
745 TS=tree_scan_pos (A, 1, nl, ptree, RT);
746 if (TS->uw>=best_score)
747 {best_score=TS->uw;best_STD=b;}
750 fascore[ax]=best_score;
751 fprintf ( fp_ts, "%*d,%6.2f,%d\n", l,a, fascore[ax], best_STD);
752 if(strlen(out_format) > 1)
753 vfclose (print_tree (aln2std_tree(A, best_STD, a, mode), out_format, vfopen (tree_file, "a+")));
754 if(strm (mode, "norhit"))
755 TreeArray[ax] = aln2std_tree(A, best_STD, a, mode);
759 //generate hit matrix
760 if (strm (mode, "hit")||strm (mode, "norhit"))
762 //Compute the pair score of tree scan segqtion
763 fprintf (stdout, "[STRAT] Calculate the hit matrix of the tree scan\n");
764 float **ffpHitScoreMatrix;
766 ffpHitScoreMatrix=vcalloc (nl, sizeof (float*));
768 for(i = 0; i < nl; i++)
769 ffpHitScoreMatrix[i]=vcalloc (nl-i, sizeof (float));
771 fprintf (stdout, "Process positions\n", i);
772 for(i = 0; i < nl; i++)
774 fprintf (stdout, "%d, ", i);
775 for(j = i; j < nl; j++)
778 TS=tree_cmp (TreeArray[i], TreeArray[j]);
779 ffpHitScoreMatrix[i][j-i] = TS->uw;
784 fprintf (stdout, "\n");
785 output_hit_matrix(hit_matrix_file, ffpHitScoreMatrix, nl);
786 fprintf (stdout, "[END]Calculate the hit matrix of the tree scan\n");
788 //Output Hit Score into color html
789 output_hit_color_html (A, ffpHitScoreMatrix, nl, hit_html_file);
790 vfree(ffpHitScoreMatrix);
792 else if ( strm (mode, "pairscan"))
796 for ( ax=0; ax<nl; ax++)
799 int best_pos=0, best_w=0, best_w2=0, best_start, best_end, best_pos2=0, best_start2, best_end2;
802 int pstart, pend, p2start, p2end;
804 for ( cx=0; cx<nl; cx++)
808 for (d=start; d<=w; d++)
810 for (b=start; b<=w; b++)
816 if (pstart<1 || pstart>A->len_aln)continue;
817 if (pend<1 || pend>A->len_aln)continue;
818 if (p2start<1 || p2start>A->len_aln)continue;
819 if (p2end<1 || p2end>A->len_aln)continue;
820 if (pstart<=p2start && pend>=p2start) continue;
821 if (pstart<=p2end && pend>=p2end) continue;
822 TS=tree_scan_pair_pos (A, pstart,pend,p2start, p2end, ptree, RT);
824 if (TS->uw>=best_score){best_score=TS->uw; best_pos=a;best_w=b;best_start=pstart; best_end=pend; best_pos2=c, best_w2=d, best_start2=p2start, best_end2=p2end;set=1;}
828 if (set)fprintf ( stdout, "P1: %*d I1: %*d %*d P2: %*d I2: %*d %*d SIM: %6.2f L: %2d\n", l,best_pos, l,best_start, l,best_end, l, best_pos2, l, best_start2, l, best_end2, best_score,(best_w*2)+1 );
834 else if ( strm (mode, "multiplescan"))
836 int n, **wlist, best_pos;
839 wlist=generate_array_int_list (nl*2,start, w,1, &n, NULL);
840 HERE ("Scan %d Possibilities", n);
842 for (best_score=best_pos=0,a=0; a<n; a++)
844 TS=tree_scan_multiple_pos (poslist,wlist[a],nl, A, ptree, RT);
845 if (TS && TS->uw>best_score)
848 fprintf ( stdout, "\n");
851 fprintf ( stdout, "[%3d %3d %3d]", poslist[b], wlist[a][b*2], wlist[a][b*2+1]);
853 fprintf ( stdout, " SCORE: %.2f", best_score);
859 //Output Tree Scan core into color html
860 normalizeScore(fascore, nl);
866 ST=copy_aln (A, NULL);
867 for (a=0; a<ST->nseq; a++)
869 i=name_is_in_list (ST->name[a],S->name, S->nseq, 100);
872 for (b=0; b<ST->len_aln; b++)
876 r1 = (int)fascore[b] + 48;
881 output_color_html ( A, ST, score_html_file);
886 vfree(score_csv_file);
887 vfree(score_html_file);
888 vfree(hit_matrix_file);
889 vfree(hit_html_file);
894 NT_node aln2std_tree(Alignment *A, int ipara1, int ipara2, char *mode)
898 char *cpSet = vcalloc(100, sizeof (char));
900 if(strm (mode, "norhit"))
902 B=extract_aln (A, 1, A->len_aln);
903 sprintf ( cpSet, "+aln2tree _COMPARE_nordisaln__STD_%d__CENTER_%d_", ipara1, ipara2);
906 B=extract_aln (A, ipara1, ipara2);
908 T=compute_std_tree (B, (cpSet)?1:0, (cpSet)?&cpSet:NULL);
913 Tree_sim* tree_scan_multiple_pos (int *poslist, int *wlist,int nl, Alignment *A, char *ptree, NT_node RT)
917 int a, b, n, s, p, left, right;
922 //poslist positions come [1..n]
926 pos=vcalloc ( A->len_aln+1, sizeof (int));
927 B=copy_aln (A, NULL);
934 for (b=p-left; b<=p+right; b++)
936 if (b<1 ||b>A->len_aln) return NULL;
939 if (pos[b]>1) return NULL;
943 for (s=0; s<A->nseq; s++)
945 for (n=0,a=1; a<=A->len_aln; a++)
947 if (pos[a])B->seq_al[s][n++]=A->seq_al[s][a-1];
952 for (s=0; s<A->nseq; s++)B->seq_al[s][B->len_aln]='\0';
954 T=compute_std_tree (B, (ptree)?1:0, (ptree)?&ptree:NULL);
962 Tree_sim* tree_scan_pair_pos (Alignment *A, int start, int end, int start2, int end2,char *ptree, NT_node RT)
965 Alignment *B,*B1, *B2;
970 B=copy_aln (A, NULL);
971 B1=extract_aln (A,start,end);
972 B2=extract_aln (A,start2, end2);
975 for ( a=0; a< B->nseq;a++)
976 sprintf (B->seq_al[a], "%s%s", B1->seq_al[a], B2->seq_al[a]);
977 B->len_aln=strlen (B->seq_al[0]);
979 T=compute_std_tree (B, (ptree)?1:0, (ptree)?&ptree:NULL);
983 free_aln (B);free_aln(B1); free_aln(B2);
988 Tree_sim* tree_scan_pos (Alignment *A, int start, int end, char *ptree, NT_node RT)
994 if ( start<1 || start>A->len_aln) return NULL;
995 if ( end<1 || end>A->len_aln) return NULL;
997 B=extract_aln (A,start,end);
998 T=compute_std_tree (B, (ptree)?1:0, (ptree)?&ptree:NULL);
1000 free_tree(T);free_aln (B);
1003 Tree_sim* tree_scan_pos_woble (Alignment *A, int center, int max, char *ptree, NT_node RT, int *br, int *bl )
1013 BTS=vcalloc (1, sizeof (Tree_sim));
1015 for (left=0; left<max; left++)
1016 for (right=0; right<max; right++)
1020 TS=tree_scan_pos (A, start, end, ptree, RT);
1022 if (TS && TS->uw >best_score)
1026 br[0]=right; bl[0]=left;
1033 Tree_sim* tree_cmp( NT_node T1, NT_node T2)
1035 Sequence *S1, *S2, *S;
1039 Tree_sim *TS1, *TS2;
1041 if ( tree_contains_duplicates (T1))
1043 display_tree_duplicates (T1);
1044 printf_exit (EXIT_FAILURE, stderr, "\nFirst Tree Contains Duplicated Sequences [main_compare_trees][FATAL:%s]", PROGRAM);
1047 else if ( tree_contains_duplicates (T2))
1049 display_tree_duplicates (T2);
1050 printf_exit (EXIT_FAILURE, stderr, "\nSecond Tree Contains Duplicated Sequences [main_compare_trees]");
1054 //Identify the commom Sequence Set
1055 S1=tree2seq(T1, NULL);
1056 S2=tree2seq(T2, NULL);
1058 S=trim_seq ( S1, S2);
1062 fprintf ( stderr, "\nERROR: Your two do not have enough common leaf to be compared [FATAL:PROGRAM]");
1065 //Prune the trees and recode the subtree list
1066 T1=prune_tree (T1, S);
1067 T1=recode_tree(T1, S);
1069 T2=prune_tree (T2, S);
1070 T2=recode_tree(T2, S);
1072 TS1=vcalloc (1, sizeof (Tree_sim));
1073 TS2=vcalloc (1, sizeof (Tree_sim));
1075 new_compare_trees ( T1, T2, S->nseq, TS1);
1076 new_compare_trees ( T2, T1, S->nseq, TS2);
1079 TS1->n=tree2nnode (T1);
1082 TS2->n=tree2nnode (T2);
1083 /*if (TS1->n !=TS2->n)
1084 printf_exit (EXIT_FAILURE, stderr,"\nERROR: Different number of Nodes in the two provided trees after prunning [FATAL: %s]", PROGRAM);
1087 free_sequence (S, -1);
1088 free_sequence (S1, -1);
1089 free_sequence (S2, -1);
1091 TS1->uw=(TS1->uw+TS2->uw)*100/(TS1->max_uw+TS2->max_uw);
1092 TS1->w=(TS1->w+TS2->w)*100/(TS1->max_w+TS2->max_w);
1093 TS1->d=(TS1->d+TS2->d)*100/(TS1->max_d+TS2->max_d);
1094 TS1->rf=(TS1->rf+TS2->rf)/2;
1099 NT_node main_compare_trees ( NT_node T1, NT_node T2, FILE *fp)
1103 T=tree_cmp (T1, T2);
1104 fprintf ( fp, "\n#tree_cmp|T: %.f W: %.2f L: %.2f RF: %d N: %d S: %d", T->uw, T->w, T->d, T->rf, T->n, T->nseq);
1105 fprintf ( fp, "\n#tree_cmp_def|T: ratio of identical nodes");
1106 fprintf ( fp, "\n#tree_cmp_def|W: ratio of identical nodes weighted with the min Nseq below node");
1107 fprintf ( fp, "\n#tree_cmp_def|L: average branch length similarity");
1108 fprintf ( fp, "\n#tree_cmp_def|RF: Robinson and Foulds");
1109 fprintf ( fp, "\n#tree_cmp_def|N: number of Nodes in T1 [unrooted]");
1110 fprintf ( fp, "\n#tree_cmp_def|S: number of Sequences in T1\n");
1116 int new_compare_trees ( NT_node T1, NT_node T2, int nseq, Tree_sim *TS)
1122 if (!T1 || !T2) return 0;
1124 n+=new_compare_trees (T1->left, T2, nseq,TS);
1125 n+=new_compare_trees (T1->right, T2, nseq,TS);
1127 //Exclude arbitrary splits (dist==0)
1128 if ((T1->dist==0) && !(T1->parent))return n;
1130 N=new_search_split (T1, T2, nseq);
1132 t2=(N)?FABS(N->dist):0;
1133 TS->max_d+=MAX(t1, t2);
1139 w=MIN((nseq-T1->nseq),T1->nseq);
1148 //T1->dist=T1->nseq;
1152 //T1->dist=T1->nseq*-1;
1163 NT_node new_search_split (NT_node T, NT_node B, int nseq)
1166 if (!T || !B) return NULL;
1167 else if ( new_compare_split (T->lseq2, B->lseq2, nseq)==1)return B;
1168 else if ( (N=new_search_split (T, B->right, nseq)))return N;
1169 else return new_search_split (T, B->left, nseq);
1171 int new_compare_split ( int *b1, int *b2, int n)
1175 for (flag=1, a=0; a<n; a++)
1176 if (b1[a]!=b2[a])flag=0;
1177 if (flag) return flag;
1179 for (flag=1, a=0; a<n; a++)
1180 if (b1[a]==b2[a])flag=0;
1184 float compare_trees ( NT_node T1, NT_node T2, int nseq,int mode)
1186 /*search each branch of T1 in T2*/
1190 if ( !T1 || !T2)return 0;
1192 if (getenv4debug("DEBUG_TREE_COMPARE"))display_node (T1, "\nNODE ", nseq);
1194 if (T1->parent && T1->nseq>1)n+=search_node ( T1, T2, nseq, mode);
1196 n+=compare_trees ( T1->left, T2, nseq, mode);
1197 n+=compare_trees ( T1->right, T2, nseq, mode);
1202 float search_node ( NT_node B, NT_node T, int nseq, int mode)
1205 if ( !B || !T) return -1;
1206 if (getenv4debug("DEBUG_TREE_COMPARE"))display_node ( T, "\n\t", nseq);
1208 n=compare_node ( B->lseq2, T->lseq2, nseq );
1212 if (getenv4debug("DEBUG_TREE_COMPARE"))fprintf ( stderr, "[1][%d]", (int)evaluate_node_similarity ( B, T, nseq, mode));
1213 if (mode==RECODE)B->dist=B->leaf;
1214 return evaluate_node_similarity ( B, T, nseq, mode);
1218 if (getenv4debug("DEBUG_TREE_COMPARE"))fprintf ( stderr, "[-1]");
1219 if (mode==RECODE)B->dist=-B->leaf;
1224 if (getenv4debug("DEBUG_TREE_COMPARE"))fprintf ( stderr, "[0]");
1225 n=search_node ( B, T->left, nseq, mode);
1227 n=search_node ( B, T->right, nseq, mode);
1229 n=search_node ( B, T->bot, nseq, mode);
1235 float evaluate_node_similarity ( NT_node B, NT_node T, int nseq, int mode)
1239 if ( mode==TOPOLOGY || mode ==RECODE)
1241 for ( a=0; a< nseq; a++)
1242 if ( B->lseq2[a]!=T->lseq2[a]) return 0;
1245 else if ( mode == WEIGHTED)
1247 for (c=0, a=0; a< nseq; a++)
1249 if ( B->lseq2[a]!=T->lseq2[a]) return 0;
1250 else c+=B->lseq2[a];
1252 return (float)(MIN(c,nseq));
1254 else if ( mode == LENGTH )
1258 for (c=0, a=0; a< nseq; a++)
1260 if ( B->lseq2[a]!=T->lseq2[a]) return 0;
1262 d1=FABS((B->dist-T->dist));
1263 d2=MAX(B->dist, T->dist);
1264 return (d2>0)?(d1*100)/d2:0;
1271 int compare_node ( int *b1, int *b2, int nseq)
1275 n1=compare_node1 ( b1, b2, nseq);
1276 /*fprintf ( stderr, "[%d]", n1);*/
1277 if ( n1==1) return 1;
1279 n2=compare_node2 ( b1, b2, nseq);
1280 /* fprintf ( stderr, "[%d]", n2);*/
1281 if ( n2==1)return 1;
1282 else if ( n2==-1 && n1==-1) return -1;
1285 int compare_node1 ( int *b1, int *b2, int n)
1290 for ( a=0; a< n; a++)
1294 if ( l1==1 && l2==0) return -1;
1299 int compare_node2 ( int *b1, int *b2, int n)
1305 for ( a=0; a< n; a++)
1309 if ( l1==1 && l2==0) return -1;
1314 void display_node (NT_node N, char *string,int nseq)
1317 fprintf ( stderr, "%s", string);
1318 for (a=0; a< nseq; a++)fprintf ( stderr, "%d", N->lseq2[a]);
1322 /*********************************************************************/
1324 /* FJ_tree Computation */
1327 /*********************************************************************/
1330 NT_node tree_compute ( Alignment *A, int n, char ** arg_list)
1332 if (n==0 || strm (arg_list[0], "cw"))
1334 return compute_cw_tree (A);
1336 else if ( strm (arg_list[0], "fj"))
1338 return compute_fj_tree ( NULL, A, (n>=1)?atoi(arg_list[1]):8, (n>=2)?arg_list[2]:NULL);
1341 else if ( ( strm (arg_list[0], "nj")))
1343 return compute_std_tree (A, n, arg_list);
1346 return compute_std_tree (A, n, arg_list);
1349 NT_node compute_std_tree (Alignment *A, int n, char **arg_list)
1351 return compute_std_tree_2 (A, NULL, list2string (arg_list, n));
1353 NT_node compute_std_tree_2 (Alignment *A, int **s, char *cl)
1356 NT_node T, **BT=NULL;
1365 tree_name =vtmpnam (NULL);
1367 if (strstr (cl, "help"))
1369 fprintf ( stdout, "\n+aln2tree| _MATRIX_ : matrix used for the comparison (idmat, sarmat, pam250mt..)\n");
1370 fprintf ( stdout, "\n+aln2tree| _SCORE_ : score mode used for the distance (sim, raw)\n");
1371 fprintf ( stdout, "\n+aln2tree| _COMPARE_: comparison mode (aln, ktup, align, nordisaln)\n");
1372 fprintf ( stdout, "\n+aln2tree| _TMODE_ : tree mode (nj, upgma)\n");
1373 exit (EXIT_SUCCESS);
1377 //matrix: idmat, ktup,sarmat, sarmat2
1378 strget_param (cl, "_MATRIX_", "idmat", "%s",matrix);
1381 strget_param (cl, "_SCORE_", "sim", "%s",score);
1383 //compare: aln, ktup, align
1384 strget_param (cl, "_COMPARE_", "aln", "%s",compare);
1386 //compare: aln, ktup, align
1387 strget_param (cl, "_TMODE_", "nj", "%s",tmode);
1390 if ( strm (compare, "nordisaln"))
1392 strget_param (cl, "_STD_", "1", "%d", &STD);
1393 strget_param (cl, "_CENTER_", "5", "%d", &CENTER);
1395 //Use external msa2tree methods
1396 if ( strm (tmode, "cw"))
1399 return compute_cw_tree (A);
1402 //compute distance matrix if needed
1406 if ( strm (compare, "ktup"))
1408 ungap_array (A->seq_al, A->nseq);
1409 s=get_sim_aln_array ( A,cl);
1411 else if ( strm ( compare, "aln"))
1413 if (strm (score, "sim"))
1414 s=get_sim_aln_array(A, matrix);
1415 else if ( strm (score, "raw"))
1417 s=get_raw_sim_aln_array (A,matrix);
1420 else if ( strm ( compare, "nordisaln"))
1422 s=get_sim_aln_array_normal_distribution(A, matrix, &STD, &CENTER);
1424 s=sim_array2dist_array(s, 100);
1428 if (strm (tmode, "nj"))
1431 BT=int_dist2nj_tree (s, A->name, A->nseq, tree_name);
1432 T=main_read_tree (tree_name);
1435 else if (strm (tmode, "upgma"))
1437 BT=int_dist2upgma_tree (s,A, A->nseq, tree_name);
1438 T=main_read_tree (tree_name);
1442 if ( strm ( cl, "dpa"))
1444 s=dist_array2sim_array(s, 100);
1445 T=code_dpa_tree (T,s);
1448 if (free_s)free_int (s, -1);
1452 NT_node similarities_file2tree (char *mat)
1461 tree_name =vtmpnam (NULL);
1463 s=input_similarities (mat,NULL, NULL);
1466 A=similarities_file2aln(mat);
1467 s=sim_array2dist_array(s, 100);
1470 int_dist2nj_tree (s, A->name, A->nseq, tree_name);
1471 T=main_read_tree(tree_name);
1476 NT_node compute_cw_tree (Alignment *A)
1478 char *tmp1, *tmp2, tmp3[1000];
1481 tmp1=vtmpnam (NULL);
1482 tmp2=vtmpnam (NULL);
1484 sprintf ( tmp3, "%s.ph", tmp1);
1485 output_clustal_aln (tmp1, A);
1486 sprintf ( command, "clustalw -infile=%s -tree -newtree=%s %s ", tmp1,tmp3, TO_NULL_DEVICE);
1487 my_system ( command);
1488 sprintf ( command, "mv %s %s", tmp3, tmp2);
1489 my_system ( command);
1490 return main_read_tree(tmp2);
1493 NT_node compute_fj_tree (NT_node T, Alignment *A, int limit, char *mode)
1495 static int in_fj_tree;
1496 if (!in_fj_tree)fprintf ( stderr, "\nComputation of an NJ tree using conserved positions\n");
1499 if (T && T->leaf<=2);
1502 T=aln2fj_tree(T,A,limit, mode);
1503 T->right=compute_fj_tree ( T->right, A, limit, mode);
1504 T->left=compute_fj_tree ( T->left, A, limit, mode);
1511 NT_node aln2fj_tree(NT_node T, Alignment *A, int limit_in, char *mode)
1515 Alignment *subA=NULL;
1520 S=tree2seq (T,NULL);
1526 for ( fraction_gap=100; fraction_gap<=100 && l<1; fraction_gap+=10)
1527 for ( limit=limit_in; limit>0 && l<1; limit--)
1529 fprintf ( stderr, "\n%d %d", limit, fraction_gap);
1531 subA=extract_sub_aln2 (A,S->nseq,S->name);
1532 subA=filter_aln4tree (subA,limit,fraction_gap, mode);
1536 /* while ( subA->len_aln<1)
1538 subA=extract_sub_aln2 (A,S->nseq,S->name);
1539 subA=filter_aln4tree (subA,limit,fraction_gap,mode);
1541 subA=extract_sub_aln2 (A,S->nseq,S->name);
1542 subA=filter_aln4tree (subA,--limit,fraction_gap, mode);
1546 NT=tree2fj_tree (NT);
1548 NT=realloc_tree (NT,A->nseq);
1549 fprintf ( stderr, "Limit:%d Gap: %d Columns: %4d Left: %4d Right %4d BL:%4.2f\n",limit,fraction_gap, subA->len_aln, (NT->right)->leaf,(NT->left)->leaf, (NT->left)->dist+(NT->right)->dist);
1554 NT->parent=T->parent;
1558 free_sequence (S, -1);
1562 Alignment * filter_aln4tree (Alignment *A, int n,int fraction_gap,char *mode)
1565 char *ungaped_aln_file;
1566 char *scored_aln_file;
1567 char *filtered_aln_file;
1571 aln_file=vtmpnam(NULL);
1572 ungaped_aln_file=vtmpnam (NULL);
1573 scored_aln_file=vtmpnam (NULL);
1574 scored_aln_file=vtmpnam(NULL);
1575 filtered_aln_file=vtmpnam(NULL);
1579 output_clustal_aln (aln_file, A);
1580 /* 1: remove columns with too many gaps*/
1581 sprintf ( command, "t_coffee -other_pg seq_reformat -in %s -action +rm_gap %d -output clustalw > %s", aln_file,fraction_gap, ungaped_aln_file);
1582 my_system ( command);
1583 /* 2: evaluate the alignment*/
1585 sprintf ( command, "t_coffee -other_pg seq_reformat -in %s -action +evaluate %s -output clustalw > %s", ungaped_aln_file,(mode)?mode:"categories", scored_aln_file);
1586 my_system ( command);
1588 /*3 extract the high scoring columns*/
1589 sprintf ( command, "t_coffee -other_pg seq_reformat -in %s -struc_in %s -struc_in_f number_aln -action +use_cons +keep '[%d-8]' +rm_gap -output clustalw > %s", ungaped_aln_file, scored_aln_file,n, filtered_aln_file);
1590 my_system ( command);
1594 A=main_read_aln ( filtered_aln_file, NULL);
1600 NT_node tree2fj_tree (NT_node T)
1606 L=find_longest_branch (T, NULL);
1607 T=reroot_tree (T, L);
1612 /*********************************************************************/
1614 /* Tree Filters and MAnipulation */
1617 /*********************************************************************/
1618 int tree2star_nodes (NT_node R, int n_max)
1621 else if (!R->left && !R->right)
1623 if (n_max>=1)R->dist=0;
1630 n+=tree2star_nodes (R->right, n_max);
1631 n+=tree2star_nodes (R->left, n_max);
1633 if (n<n_max)R->dist=0;
1638 NT_node aln2tree (Alignment *A)
1643 T=make_nj_tree (A, NULL, 0, 0, A->seq_al, A->name, A->nseq, NULL, NULL);
1644 tree2nleaf (T[3][0]);
1648 NT_node realloc_tree ( NT_node R, int n)
1653 R->right=realloc_tree (R->right,n);
1654 R->left=realloc_tree (R->left,n);
1655 R->bot=realloc_tree (R->bot,n);
1657 R->lseq=vrealloc (R->lseq, n*sizeof (int));
1658 R->lseq2=vrealloc (R->lseq2, n*sizeof (int));
1662 NT_node reset_boot_tree ( NT_node R, int n)
1668 R->right=reset_boot_tree (R->right,n);
1669 R->left=reset_boot_tree (R->left,n);
1670 R->bot=reset_boot_tree (R->bot,n);
1672 R->bootstrap=(float)n;
1676 NT_node tree_dist2normalized_tree_dist ( NT_node R, float max)
1681 tree_dist2normalized_tree_dist ( R->right, max);
1682 tree_dist2normalized_tree_dist ( R->left, max);
1683 R->bootstrap=(int)((R->dist*100)/max);
1687 NT_node reset_dist_tree ( NT_node R, float n)
1693 R->right=reset_dist_tree (R->right,n);
1694 R->left=reset_dist_tree (R->left,n);
1695 R->bot=reset_dist_tree (R->bot,n);
1697 if (R->parent && !(R->parent)->parent && !(R->parent)->bot)R->dist=n/2;
1704 NT_node* free_treelist (NT_node *L)
1707 while (L[n])free_tree (L[n++]);
1711 NT_node free_tree ( NT_node R)
1719 R->right=free_tree (R->right);
1720 R->left=free_tree (R->left);
1721 R->bot=free_tree (R->bot);
1728 NT_node free_tree_node ( NT_node R)
1737 vfree ( R->lseq); vfree ( R->lseq2);
1742 NT_node rename_seq_in_tree ( NT_node R, char ***list)
1744 if ( !R || !list) return R;
1748 R->right=rename_seq_in_tree (R->right, list);
1749 R->left=rename_seq_in_tree (R->left, list);
1750 R->bot=rename_seq_in_tree (R->bot, list);
1755 while ( list[n][0][0])
1757 if ( strm (list[n][0], R->name))sprintf (R->name, "%s",list[n][1]);
1763 Sequence * tree2seq (NT_node R, Sequence *S)
1769 S=declare_sequence (10, 10, tree2nseq (R));
1775 sprintf ( S->name[S->nseq++], "%s", R->name);
1779 S=tree2seq (R->left, S);
1780 S=tree2seq (R->right, S);
1785 NT_node balance_tree (NT_node T)
1791 else if ( T->leaf<=2)return T;
1794 if (!list)list=declare_int (3, 2);
1800 list[0][0]=(T->left)?(T->left)->leaf:0;
1802 list[1][0]=(T->right)?(T->right)->leaf:0;
1804 list[2][0]=(T->bot)?(T->bot)->leaf:0;
1807 sort_int (list,2,0,0,2);
1809 T->left=NL[list[2][1]];
1810 T->right=NL[list[1][1]];
1811 T->bot=NL[list[0][1]];
1813 T->left=balance_tree (T->left);
1814 T->right=balance_tree (T->right);
1815 T->bot=balance_tree (T->bot);
1819 FILE * display_tree (NT_node R, int nseq, FILE *fp)
1827 if ( R->nseq==1)fprintf (stderr,"\n[%s] ", R->name);
1828 else fprintf ( stderr, "\n[%d Node] ",R->nseq);
1829 for ( a=0; a< R->nseq; a++) fprintf ( stderr, "[%d]", R->lseq[a]);
1831 fprintf (fp, "\n %10s N ", R->name);
1832 for ( a=0; a< nseq; a++)fprintf (fp, "%d", R->lseq2[a]);
1833 fprintf (fp, "\n %10s D ", R->name);
1834 for ( a=0; a< nseq; a++)fprintf (fp, "%d", R->idist[a]);
1837 if (R->leaf==1) fprintf (fp, " %s", R->name);
1838 fprintf (fp, " :%.4f", R->dist);
1839 HERE ("\nGo Left");fp=display_tree (R->left, nseq, fp);
1840 HERE ("\nGo Right");fp=display_tree (R->right, nseq, fp);
1841 HERE ("\nGo Bot");fp=display_tree (R->bot, nseq, fp);
1845 int tree2nnode_unresolved (NT_node R, int *l)
1848 else if (R->leaf && R->dist==0){return 1;}
1852 n+=tree2nnode_unresolved (R->right, l);
1853 n+=tree2nnode_unresolved (R->left, l);
1867 int tree2nnode ( NT_node R)
1871 else if ( R->leaf==1){R->node=1;n=1;}
1875 n+=tree2nnode (R->right);
1876 n+=tree2nnode (R->left);
1877 n+=tree2nnode (R->bot);
1882 int tree2nleaf (NT_node R)
1885 else if (R->leaf==1){return 1;}
1886 else if (R->right==NULL && R->left==NULL && R->bot==NULL){R->leaf=1; return 1;}
1890 n+=tree2nleaf (R->right);
1891 n+=tree2nleaf (R->left);
1892 n+=tree2nleaf (R->bot);
1899 int tree2nseq ( NT_node R)
1901 return tree2nleaf(R);
1904 int tree_file2nseq (char *fname)
1910 string=vcalloc (count_n_char_in_file(fname)+1, sizeof (char));
1912 fp=vfopen (fname, "r");
1914 while ( (c=fgetc(fp))!=EOF){if (c=='(' || c==')' || c==',' || c==';') string[n++]=c;}
1915 vfclose (fp);string[n]='\0';
1917 for (n=0, p=1; p<strlen (string); p++)
1922 if ( a=='(' && b==',')n++;
1923 else if ( a==',' && b==')')n++;
1924 else if ( a==',' && b==',')n++;
1926 if ( getenv4debug("DEBUG_TREE"))fprintf (stderr, "\n[DEBUG_TREE:tree_file2nseq]%s", string);
1932 void clear_tree ( NT_node R)
1941 clear_tree ( R->right);
1942 clear_tree ( R->left);
1943 clear_tree ( R->bot);
1946 int display_leaf_below_node (NT_node T, FILE *fp)
1953 fprintf (fp, " %s", T->name);
1958 n+=display_leaf_below_node ( T->right, fp);
1959 n+=display_leaf_below_node ( T->left, fp);
1963 int display_leaf ( NT_node T, FILE *fp)
1967 else if ( T->visited)return 0;
1972 fprintf (fp, " %s", T->name);
1977 n+=display_leaf ( T->right, fp);
1978 n+=display_leaf ( T->left, fp);
1979 n+=display_leaf ( T->bot, fp);
1987 NT_node find_longest_branch ( NT_node T, NT_node L)
1990 if ( !L || T->dist>L->dist)
1996 if ( T->leaf==1)return L;
1999 L=find_longest_branch ( T->right, L);
2000 L=find_longest_branch ( T->left, L);
2004 int node2side (NT_node N);
2005 int test_print (NT_node T);
2006 NT_node straighten_node (NT_node N);
2009 NT_node reroot_tree ( NT_node TREE, NT_node Right)
2011 /*ReRoots the tree between Node R and its parent*/
2015 if (!EMPTY)EMPTY=vcalloc (1, sizeof (NT_node));
2016 if ( !Right->parent)return Right;
2018 TREE=unroot_tree (TREE);
2019 if (Right->parent==NULL && Right->bot)
2022 n1=tree2nleaf (TREE);
2024 NR=declare_tree_node(TREE->maxnseq);
2027 NR->left=Right->parent;
2030 Right->dist=Right->dist/2;
2032 if ((NR->left)->right==Right)(NR->left)->right=EMPTY;
2033 else if ( (NR->left)->left==Right) (NR->left)->left=EMPTY;
2038 NR->left=straighten_node (NR->left);
2042 (NR->left)->parent=NR;
2043 (NR->left)->dist=Right->dist;
2049 if ( n1!=n2){fprintf ( stderr, "\n%d %d", n1, n2);myexit (EXIT_FAILURE);}
2053 NT_node straighten_node ( NT_node N)
2060 if (N->right==EMPTY)N->right=N->parent;
2061 else if ( N->left==EMPTY) N->left=N->parent;
2064 if (Child->right==N)
2068 else if (Child->left==N)
2074 Child=straighten_node (Child);
2076 Child->dist=N->dist;
2079 else if ( N->bot && N->bot!=Previous)
2081 if ( N->right==EMPTY)N->right=N->bot;
2082 else if ( N->left==EMPTY)N->left=N->bot;
2093 int test_print (NT_node T)
2097 fprintf ( stderr, "\nEMPTY");
2099 else if ( !T->left && !T->right)
2101 fprintf ( stderr, "\n%s",T->name);
2105 fprintf ( stderr, "\nGoing Right");
2106 test_print (T->right);
2107 fprintf ( stderr, "\nGoing Left");
2108 test_print (T->left);
2112 int node2side (NT_node C)
2114 if ( !C->parent) return UNKNOWN;
2115 else if ( (C->parent)->left==C)return LEFT;
2116 else if ( (C->parent)->right==C)return RIGHT;
2117 else return UNKNOWN;
2119 NT_node straighten_tree ( NT_node P, NT_node C, float new_dist)
2123 if ( C==NULL)return NULL;
2130 if (C->left && C->right)
2141 else if ( C->left==NULL && C->right==NULL)
2145 else if ( C->right==P)
2150 C=straighten_tree(C, C->right, dist);
2152 else if ( C->left==P)
2156 C=straighten_tree (C, C->left, dist);
2158 else if ( C->parent==NULL)
2167 NT_node unroot_tree ( NT_node T)
2170 if (!T || T->visited) return T;
2173 if (T->parent==NULL)
2176 (T->right)->dist=(T->left)->dist=(T->right)->dist+(T->left)->dist;
2177 (T->right)->parent=T->left;
2178 (T->left)->parent=T->right;
2185 T->parent=unroot_tree (T->parent);
2186 T->right=unroot_tree (T->right);
2187 T->left=unroot_tree (T->left);
2193 FILE * print_tree_list ( NT_node *T, char *format,FILE *fp)
2198 fp=print_tree (T[a], format, fp);
2203 char * tree2string (NT_node T)
2205 if (!T) return NULL;
2211 if (!f)f=vtmpnam (NULL);
2213 print_tree (T, "newick", fp);
2215 return file2string (f);
2218 char * tree2file (NT_node T, char *name, char *mode)
2220 if (!name)name=vtmpnam (NULL);
2221 string2file (tree2string (T), name, mode);
2224 FILE * print_tree ( NT_node T, char *format,FILE *fp)
2229 S=tree2seq(T, NULL);
2233 free_sequence (S, -1);
2234 if ( format && strm (format, "binary"))
2235 fp=display_tree ( T,S->nseq, fp);
2236 else if ( ! format || strm2 (format, "newick_tree","newick"))
2238 /*T=balance_tree (T);*/
2239 fp=rec_print_tree (T, fp);
2240 fprintf ( fp, ";\n");
2244 fprintf ( stderr, "\nERROR: %s is an unknown tree format [FATAL:%s]\n", format, PROGRAM);
2245 myexit (EXIT_FAILURE);
2249 int print_newick_tree ( NT_node T, char *name)
2252 fp=vfopen (name, "w");
2253 fp=rec_print_tree (T,fp);
2254 fprintf (fp, ";\n");
2258 FILE * rec_print_tree ( NT_node T, FILE *fp)
2267 fprintf ( fp, " %s:%.5f",T->name, T->dist);
2271 if (T->left && T->right)
2273 fprintf ( fp, "(");fp=rec_print_tree ( T->left, fp);
2274 fprintf ( fp, ",");fp=rec_print_tree ( T->right, fp);
2276 if (T->parent || T->dist)
2278 if ( T->bootstrap!=0)fprintf (fp, " %d", (int)T->bootstrap);
2279 fprintf (fp, ":%.5f", T->dist);
2282 else if (T->left)fp=rec_print_tree (T->left, fp);
2283 else if (T->right)fp=rec_print_tree(T->right, fp);
2293 /*********************************************************************/
2295 /* Tree Functions */
2298 /*********************************************************************/
2300 int ** make_sub_tree_list ( NT_node **T, int nseq, int n_node)
2304 /*This function produces a list of all the sub trees*/
2317 /* Contains 4 i_nodes */
2318 /* 8 nodes (internal nodes +leaves) */
2329 int **sub_tree_list;
2335 sub_tree_list=declare_int ( (n_node), nseq);
2336 make_all_sub_tree_list (T[3][0],sub_tree_list, &n);
2341 sub_tree_list=declare_int (nseq, nseq);
2342 for ( a=0; a< nseq; a++)sub_tree_list[a][a]=1;
2345 return sub_tree_list;
2348 void make_all_sub_tree_list ( NT_node N, int **list, int *n)
2350 make_one_sub_tree_list (N, list[n[0]++]);
2353 make_all_sub_tree_list (N->left , list, n);
2354 make_all_sub_tree_list (N->right, list, n);
2359 void make_one_sub_tree_list ( NT_node T,int *list)
2368 make_one_sub_tree_list(T->left , list);
2369 make_one_sub_tree_list(T->right, list);
2375 NT_node old_main_read_tree(char *treefile)
2377 /*Reads a tree w/o needing the sequence file*/
2379 T=simple_read_tree (treefile);
2385 NT_node** simple_read_tree(char *treefile)
2389 T=read_tree ( treefile, &tot_node,tree_file2nseq (treefile),NULL);
2393 void free_read_tree ( NT_node **BT)
2400 for (s=0,a=0; a<3; a++)
2404 free_tree (BT[3][0]);
2409 NT_node** read_tree(char *treefile, int *tot_node,int nseq, char **seq_names)
2412 /*The Tree Root is in the TREE[3][0]...*/
2413 /*TREE[0][ntot]--> pointer to each node and leave*/
2419 int nnodes = 0;/*Number of Internal Nodes*/
2420 int ntotal = 0;/*Number of Internal Nodes + Number of Leaves*/
2424 NT_node seq_tree, root,p;
2428 rooted_tree=distance_tree=TRUE;
2430 fp = vfopen(treefile, "r");
2432 ch = (char)getc(fp);
2435 fprintf(stderr, "Error: Wrong format in tree file %s\n", treefile);
2436 myexit (EXIT_FAILURE);
2441 lu_ptr=(NT_node **)vcalloc(4,sizeof(NT_node*));
2442 lu_ptr[0] = (NT_node *)vcalloc(10*nseq,sizeof(NT_node));
2443 lu_ptr[1] = (NT_node *)vcalloc(10*nseq,sizeof(NT_node));
2444 lu_ptr[2] = (NT_node *)vcalloc(10*nseq,sizeof(NT_node));
2445 lu_ptr[3] =(NT_node *) vcalloc(1,sizeof(NT_node));
2447 seq_tree =(NT_node) declare_tree_node(nseq);
2449 set_info(seq_tree, NULL, 0, " ", 0.0, 0);
2452 fp=create_tree(seq_tree,NULL,&nseq_read, &ntotal, &nnodes, lu_ptr, fp);
2456 if (nseq != tot_nseq)
2458 fprintf(stderr," Error: tree not compatible with alignment (%d sequences in alignment and %d in tree\n", nseq,nseq_read);
2459 myexit (EXIT_FAILURE);
2462 if (distance_tree == FALSE)
2464 if (rooted_tree == FALSE)
2466 fprintf(stderr,"Error: input tree is unrooted and has no distances, cannot align sequences\n");
2467 myexit (EXIT_FAILURE);
2471 if (rooted_tree == FALSE)
2473 root = reroot(seq_tree, nseq,ntotal,nnodes, lu_ptr);
2474 lu_ptr[1][nnodes++]=lu_ptr[0][ntotal++]=root;
2487 for ( a=0; a< ntotal; a++)
2489 (lu_ptr[0][a])->isseq=(lu_ptr[0][a])->leaf;
2490 (lu_ptr[0][a])->dp=(lu_ptr[0][a])->dist;
2494 for ( a=0; a< nseq; a++)
2499 (lu_ptr[2][a])->order=(lu_ptr[2][a])->seq=a;
2503 for ( flag=0,b=0; b<nseq; b++)
2505 if ( strncmp ( (lu_ptr[2][a])->name, seq_names[b], MAXNAMES)==0)
2509 (lu_ptr[2][a])->order=(lu_ptr[2][a])->seq=b;
2510 /*vfree ( (lu_ptr[2][a])->name);*/
2511 sprintf ((lu_ptr[2][a])->name, "%s", seq_names[b]);
2516 if ( flag==0 && (lu_ptr[0][a])->leaf==1)
2518 fprintf ( stderr, "\n%s* not in tree",(lu_ptr[2][a])->name);
2519 for ( a=0; a< ntotal; a++)
2521 fprintf ( stderr, "\n%d %s",(lu_ptr[2][a])->leaf, (lu_ptr[2][a])->name);
2533 tnseq=tree_file2nseq(treefile);
2534 tree_names=vcalloc ( tnseq, sizeof (char*));
2535 for (a=0; a<tnseq; a++)
2537 s=(lu_ptr[2][a])->name;
2539 if ( name_is_in_list(s, seq_names, nseq, MAXNAMES+1)==-1)
2541 fprintf (stderr, "\nERROR: Sequence %s in the tree [%s] is not in the alignment[FATAL:%s]\n", s, treefile, PROGRAM);
2545 for (a=0; a<nseq; a++)
2548 if ( name_is_in_list(s, tree_names, nseq, MAXNAMES+1)==-1)
2550 fprintf (stderr, "\nERROR: Sequence %s in the sequences is not in the tree [%s][FATAL:%s]\n", s, treefile, PROGRAM);
2555 if ( fail_flag==1)exit (EXIT_FAILURE);
2558 for ( a=0; a< nseq; a++)
2565 p->lseq[p->nseq]=c_seq;
2575 FILE * create_linear_tree ( char **name, int n, FILE *fp)
2578 if (!name || n==0 ||!fp) return NULL;
2582 fprintf ( fp, "(%s,%s);",name[0],name[1]);
2584 fprintf ( fp, "((%s,%s),%s);",name[0],name[1], name[2]);
2588 for (a=0; a<n-2; a++)fprintf (fp, "(");
2589 fprintf (fp, "%s, %s),", name[0], name[1]);
2590 for ( a=2; a<n-2; a++)fprintf ( fp, "%s),",name[a]);
2591 fprintf ( fp, "%s,%s);\n", name[n-2], name[n-1]);
2595 FILE * create_tree(NT_node ptree, NT_node parent,int *nseq,int *ntotal,int *nnodes,NT_node **lu, FILE *fp)
2605 name=vcalloc ( MAXNAMES+1, sizeof (char));
2606 sprintf ( name, " ");
2608 ch = (char)getc(fp);
2614 lu[0][ntotal[0]] = lu[1][nnodes[0]] = ptree;
2617 create_tree_node(ptree, parent);
2618 fp=create_tree(ptree->left, ptree, nseq,ntotal,nnodes,lu,fp);
2619 ch = (char)getc(fp);
2622 fp=create_tree(ptree->right, ptree,nseq,ntotal,nnodes,lu,fp);
2623 ch = (char)getc(fp);
2627 ptree = insert_tree_node(ptree);
2628 lu[0][ntotal[0]] = lu[1][nnodes[0]] = ptree;
2631 fp=create_tree(ptree->right, ptree,nseq,ntotal,nnodes,lu,fp);
2632 rooted_tree = FALSE;
2633 if ( getenv4debug ( "DEBUG_TREE")){fprintf ( stderr, "\n[DEBUG_TREE:create_tree] Unrooted Tree");}
2638 ch = (char)getc(fp);
2643 lu[0][ntotal[0]] = lu[2][nseq[0]] = ptree;
2648 ch = (char)getc(fp);
2651 /*This protects names that are between single quotes*/
2654 if (i < MAXNAMES) name[i++] = ch;
2655 ch = (char)getc(fp);
2657 if (i < MAXNAMES) name[i++] = ch;
2658 while ((ch != ':') && (ch != ',') && (ch != ')'))ch = (char)getc(fp);
2662 while ((ch != ':') && (ch != ',') && (ch != ')'))
2664 if (i < MAXNAMES) name[i++] = ch;
2665 ch = (char)getc(fp);
2671 if ( i>=(MAXNAMES+1)){fprintf (stderr, "\nName is too long");myexit (EXIT_FAILURE);}
2672 if (ch != ':' && !isdigit(ch))
2674 /*distance_tree = FALSE*/;
2680 fscanf(fp,"%f",&dist);
2684 /*Tree with Bootstrap information*/
2685 else if (isdigit (ch))
2688 fscanf(fp,"%f",&bootstrap);
2689 if ( fscanf(fp,":%f",&dist)==1);
2699 set_info(ptree, parent, type, name, dist, bootstrap);
2706 NT_node declare_tree_node (int nseq)
2710 p= (NT_node)vcalloc (1, sizeof ( Treenode));
2718 p->name=(char*)vcalloc (MAXNAMES+1,sizeof (char));
2720 p->lseq=(int*)vcalloc ( nseq, sizeof (int));
2725 void set_info(NT_node p, NT_node parent, int pleaf, char *pname, float pdist, float bootstrap)
2730 p->bootstrap=bootstrap;
2734 sprintf (p->name, "%s", pname);
2743 NT_node insert_tree_node(NT_node pptr)
2748 newnode = declare_tree_node( pptr->maxnseq);
2749 create_tree_node(newnode, pptr->parent);
2751 newnode->left = pptr;
2752 pptr->parent = newnode;
2754 set_info(newnode, pptr->parent, 0, "", 0.0, 0);
2759 void create_tree_node(NT_node pptr, NT_node parent)
2761 pptr->parent = parent;
2762 pptr->left =declare_tree_node(pptr->maxnseq) ;
2763 (pptr->left)->parent=pptr;
2765 pptr->right =declare_tree_node(pptr->maxnseq) ;
2766 (pptr->right)->parent=pptr;
2769 FILE * skip_space(FILE *fp)
2778 fprintf ( stderr, "\nEOF");
2779 myexit (EXIT_FAILURE);
2786 NT_node reroot(NT_node ptree, int nseq, int ntotal, int nnodes, NT_node **lu)
2788 NT_node p, rootnode, rootptr;
2789 float diff, mindiff=0, mindepth = 1.0, maxdist;
2797 for (i=0; i<ntotal; i++)
2800 if (p->parent == NULL)
2801 diff = calc_root_mean(p, &maxdist, nseq, lu);
2803 diff = calc_mean(p, &maxdist, nseq, lu);
2805 if ((diff == 0) || ((diff > 0) && (diff < 2 * p->dist)))
2807 if ((maxdist < mindepth) || (first == TRUE))
2817 if (rootptr == ptree)
2819 mindiff = rootptr->left->dist + rootptr->right->dist;
2820 rootptr = rootptr->right;
2823 rootnode = insert_root(rootptr, mindiff);
2824 diff = calc_root_mean(rootnode, &maxdist, nseq, lu);
2829 float calc_root_mean(NT_node root, float *maxdist, int nseq, NT_node **lu)
2831 float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff;
2838 dist = (*maxdist) = 0;
2840 for (i=0; i< nseq; i++)
2844 while (p->parent != root)
2849 if (p == root->left) direction = LEFT;
2850 else direction = RIGHT;
2853 if (direction == LEFT)
2864 if (dist > (*maxdist)) *maxdist = dist;
2870 diff = lmean - rmean;
2874 float calc_mean(NT_node nptr, float *maxdist, int nseq,NT_node **lu)
2876 float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff;
2877 NT_node p, *path2root;
2879 int depth = 0, i,j , n;
2881 int direction, found;
2884 path2root = (NT_node *)vcalloc(nseq,sizeof(Treenode));
2885 dist2node = (float *)vcalloc(nseq,sizeof(float));
2887 depth = (*maxdist) = dist = 0;
2892 path2root[depth] = p;
2894 dist2node[depth] = dist;
2899 /***************************************************************************
2901 for each leaf, determine whether the leaf is left or right of the node.
2902 (RIGHT = descendant, LEFT = not descendant)
2903 ****************************************************************************/
2904 for (i=0; i< nseq; i++)
2919 while ((found == FALSE) && (p->parent != NULL))
2921 for (j=0; j< depth; j++)
2922 if (p->parent == path2root[j])
2931 if (p == nptr) direction = RIGHT;
2934 if (direction == LEFT)
2937 lsum += dist2node[n-1];
2946 if (dist > (*maxdist)) *maxdist = dist;
2954 if ( nl==0 || nr==0)
2956 myexit (EXIT_FAILURE);
2961 diff = lmean - rmean;
2965 NT_node insert_root(NT_node p, float diff)
2967 NT_node newp, prev, q, t;
2968 float dist, prevdist,td;
2971 newp = declare_tree_node( p->maxnseq);
2981 if (p->dist < 0.0) p->dist = 0.0;
2982 if (p->dist > dist) p->dist = dist;
2984 t->dist = dist - p->dist;
2988 newp->parent = NULL;
2992 if (t->left == p) t->left = t->parent;
2993 else t->right = t->parent;
3002 if (q->left == prev)
3004 q->left = q->parent;
3014 q->right = q->parent;
3025 remove the old root node
3028 if (q->left == NULL)
3033 q->parent = prev->parent;
3034 if (prev->parent->left == prev)
3035 prev->parent->left = q;
3037 prev->parent->right = q;
3045 q->parent = prev->parent;
3046 if (prev->parent->left == prev)
3047 prev->parent->left = q;
3049 prev->parent->right = q;
3059 /*********************************************************************/
3064 /*********************************************************************/
3066 int *aln2seq_chain (Alignment *A, int **sim,int seq1, int seq2, int limit, int max_chain);
3068 Alignment *seq2seq_chain (Alignment *A,Alignment*T, char *arg)
3071 int *buf=NULL, *seq2keep, *list, *tname;
3077 /*Estimate Similarity within the incoming sequences*/
3078 sim=seq2comp_mat (aln2seq(A), "blosum62mt", "sim2");
3080 /*Read and store the list of sequences to keep*/
3081 seq2keep=vcalloc (A->nseq, sizeof (int));
3082 tname=vcalloc (T->nseq, sizeof (int));
3083 for ( a=0; a< T->nseq; a++)
3085 tname[a]=name_is_in_list ( T->name[a], A->name, A->nseq, 100);
3086 if (tname[a]>=0)seq2keep[tname[a]]=1;
3089 /*Consider Every Pair of Sequences within the list of sequences to keep*/
3091 fprintf ( stderr, "\n");
3092 for ( a=0; a< T->nseq-1; a++)
3094 if (tname[a]<0) continue;
3095 for ( b=a+1;b<T->nseq; b++)
3098 if (tname[b]<0) continue;
3100 buf=NULL;sim_limit=90;
3101 while (!buf && sim_limit>min_sim)
3103 buf=aln2seq_chain ( A, sim,tname[a],tname[b],sim_limit, max_chain);
3109 for (c=0; c< A->nseq; c++)seq2keep[c]+=buf[c];
3114 fprintf ( stderr, "\n#Could Not Find any Intermediate sequence [MAx chain %d MinID %d\n", max_chain, min_sim);
3119 list=vcalloc (A->nseq, sizeof (int));
3120 for ( nl=0,a=0; a< A->nseq; a++)
3124 A=extract_sub_aln (A, nl, list);
3130 int max_explore=10000000;/*Limits the number of explorations that tends to increase when id is small*/
3133 int *aln2seq_chain (Alignment *A, int **sim, int seq1, int seq2, int limit, int max_chain)
3137 char output1[10000];
3138 char output2[10000];
3144 output1[0]=output2[0]='\0';
3145 used=vcalloc (A->nseq, sizeof(int));
3148 if (find_seq_chain ( A, sim,used,seq1,seq1, seq2,1,limit, max_chain, &nseq))
3150 list=vcalloc (A->nseq, sizeof (int));
3151 chain=declare_int (A->nseq, 2);
3152 for (n=0, a=0; a< A->nseq; a++)
3156 chain[n][0]=used[a];
3158 list[used[a]-1]=a;n++;
3162 sprintf ( output2, "#%s %s N: %d Lower: %d Sim: %d DELTA: %d\n", A->name[list[0]], A->name[list[n-1]],n, limit,sim[list[0]][list[n-1]],limit-sim[list[0]][list[n-1]]);strcat (output1, output2);
3164 sort_int ( chain, 2, 0, 0, n-1);
3165 sprintf ( output2, "#");strcat(output1, output2);
3167 for ( a=0; a< n-1; a++)
3169 sprintf (output2, "%s -->%d -->", A->name[chain[a][1]],sim[chain[a][1]][chain[a+1][1]]);strcat ( output1, output2);
3171 sprintf ( output2, "%s\n", A->name[chain[n-1][1]]);strcat (output1, output2);
3173 free_int (chain, -1);
3181 /* fprintf ( stdout, "%s", output1);*/
3182 fprintf ( stderr, "%s", output1);
3186 static int ***pw_sim;
3187 int find_seq_chain (Alignment *A, int **sim,int *used,int seq0,int seq1, int seq2,int chain_length, int limit, int max_chain, int *nseq)
3189 int a,b, seq, seq_sim;
3192 if ( n_explore>=max_explore)
3198 pw_sim=declare_arrayN(3, sizeof (int), A->nseq, A->nseq, 3);
3199 for ( a=0; a< A->nseq; a++)
3201 for ( b=0; b<A->nseq; b++)
3204 pw_sim[a][b][1]=sim[a][b];
3205 pw_sim[a][b][2]=sim[b][seq2];
3207 sort_int_inv ( pw_sim[a],3, 1, 0, A->nseq-1);
3211 if ( chain_length>max_chain)return 0;
3212 else if ( sim[seq1][seq2]>=limit)
3214 used[seq2]=chain_length+1;
3221 for ( a=0; a< A->nseq; a++)
3223 seq=pw_sim[seq1][a][0];
3224 seq_sim=pw_sim[seq1][a][1];
3225 delta_seq2=pw_sim[seq1][a][2]-sim[seq1][seq2];
3229 if ( used[seq])continue;
3230 else if ( seq_sim<limit)continue;
3233 used[seq]=chain_length+1;
3235 if ( find_seq_chain( A, sim,used,seq0,seq, seq2, chain_length+1,limit, max_chain, nseq))
3251 /*********************************************************************/
3253 /* New Tree Parsing */
3255 /*********************************************************************/
3256 NT_node new_insert_node (NT_node T);
3257 int scan_name_and_dist ( FILE *fp, char *name, float *dist);
3262 NT_node check_tree (NT_node T);
3263 NT_node main_read_tree (char *treefile)
3273 fp=vfopen (remove_charset_from_file (treefile, " \t\n\r"), "r");
3274 T=new_get_node (NULL,fp);
3277 S=tree2seq(T, NULL);
3279 T=recode_tree(T, S);
3280 free_sequence (S,S->nseq);
3282 T->file=vcalloc ( strlen (treefile)+1, sizeof (char));
3283 sprintf ( T->file, "%s", treefile);
3287 //This function codes the tree into lseq and lseq2
3288 //lseq: list of the N->nseq child sequences of the node
3289 //lsseq2:Array of size Nseq, with lseq[a]=1 if sequence a is child of node N
3290 static int node_index;
3291 NT_node index_tree_node (NT_node T)
3294 if (!T->parent){node_index=tree2nseq (T)+1;}
3296 index_tree_node(T->left);
3297 index_tree_node(T->right);
3299 if (!T->left && !T->right)T->index=T->lseq[0]+1;
3300 else T->index=node_index++;
3306 NT_node simple_recode_tree (NT_node T, int nseq)
3309 //recodes atree wher the leafs are already coded
3327 vfree (T->lseq); T->lseq=vcalloc (nseq, sizeof (int));
3328 vfree (T->lseq2); T->lseq2=vcalloc (nseq, sizeof (int));
3329 vfree (T->idist); T->idist=vcalloc (nseq, sizeof (int));
3330 vfree (T->ldist); T->ldist=vcalloc (nseq, sizeof (int));
3332 R=simple_recode_tree (T->left,nseq);
3334 L=simple_recode_tree (T->right,nseq);
3336 if (R)for (a=0; a<R->nseq; a++)
3338 T->lseq2[R->lseq[a]]=1;
3341 if (L)for (a=0; a<L->nseq; a++)
3343 T->lseq2[L->lseq[a]]=1;
3346 for (a=0; a<nseq; a++)
3348 if (T->lseq2[a])T->lseq[T->nseq++]=a;
3349 if (T->lseq2[a])T->idist[a]=(!R)?0:R->idist[a]+((!L)?0:L->idist[a])+1;
3350 if (T->lseq2[a])T->ldist[a]=(!R)?0:R->ldist[a]+((!L)?0:L->ldist[a])+(int)(T->dist*10000);
3356 NT_node recode_tree (NT_node T, Sequence *S)
3363 vfree (T->lseq); T->lseq=vcalloc (S->nseq, sizeof (int));
3364 vfree (T->lseq2); T->lseq2=vcalloc (S->nseq, sizeof (int));
3365 vfree (T->idist); T->idist=vcalloc (S->nseq, sizeof (int));
3366 vfree (T->ldist); T->ldist=vcalloc (S->nseq, sizeof (int));
3373 i=name_is_in_list (T->name, S->name, S->nseq, -1);
3377 T->lseq[T->nseq++]=i;
3380 T->ldist[i]=(int)(T->dist*10000);;
3384 printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Sequence %s is in the Tree but Not in the Sequence dataset [code_lseq][FATAL:%s]", T->name, PROGRAM);
3393 R=recode_tree (T->left, S);
3395 L=recode_tree (T->right, S);
3398 for (a=0; a<R->nseq; a++)
3400 T->lseq2[R->lseq[a]]=1;
3403 if (L)for (a=0; a<L->nseq; a++)
3405 T->lseq2[L->lseq[a]]=1;
3408 for (a=0; a<S->nseq; a++)
3410 //don't count the root
3413 if ( !(T->parent) || !(T->parent)->parent)d=0;
3414 else if ( T->dist==0)d=0;
3417 if (T->lseq2[a])T->lseq[T->nseq++]=a;
3418 if (T->lseq2[a])T->idist[a]=(!R)?0:(R->idist[a]+((!L)?0:L->idist[a])+d);
3419 if (T->lseq2[a])T->ldist[a]=(!R)?0:R->ldist[a]+((!L)?0:L->ldist[a])+(int)(T->dist*10000);
3425 int tree2split_list (NT_node T, int ns,int **sl, int* n)
3430 tree2split_list (T->right, ns, sl, n);
3431 tree2split_list (T->left , ns, sl, n);
3433 if (!T->right) return 1;
3434 else if (T->parent && !(T->parent)->parent)return 1;
3435 else if ( T->dist==0)return 1;
3438 int t=0,t2=0, c=0, a;
3440 for (a=0; a< ns; a++)
3442 t2+=(a+1)*T->lseq2[a];
3446 if (t2==0) HERE ("0");
3448 sl[n[0]][ns]=t2;//Hash value for quick comparison;
3450 for (a=0; a< ns; a++)sl[n[0]][a]=(c==0)?T->lseq2[a]:(1-T->lseq2[a]);
3457 NT_node display_splits (NT_node T,Sequence *S, FILE *fp)
3462 if (!S)S=tree2seq (T,NULL);
3464 display_splits (T->right,S, fp);
3465 display_splits (T->left, S, fp);
3470 else if (T->parent && !(T->parent)->parent);
3474 for (a=0; a< S->nseq; a++)
3476 fprintf (fp, "%d", T->lseq2[a]);
3480 fprintf ( fp, " %5d \n", MIN(t,((S->nseq)-t)));
3484 NT_node display_leaf_nb (NT_node T, int n, FILE *fp, char * name)
3490 display_leaf_nb (T->right, n, fp, name);
3491 display_leaf_nb (T->left, n, fp, name);
3500 fprintf (fp, "%s ", T->name);
3501 for (a=0; a< n; a++)fprintf (fp, "%d", P->lseq2[a]);
3502 fprintf ( fp," %s\n", name);
3507 NT_node display_code (NT_node T, int n, FILE *fp)
3509 int a, debug=0, t=0;
3517 if (!T->parent && debug) fprintf ( fp, "\nDISPLAY TREE: START");
3518 display_code (T->right, n, fp);
3519 display_code (T->left, n, fp);
3521 fprintf ( fp, "\n");
3522 if (!T->parent) return T;
3523 else if ( !(T->parent)->parent && root4dc==1)return T;
3524 else if ( !(T->parent)->parent && root4dc==0)root4dc=1;
3526 for (a=0; a< n; a++)
3529 for (a=0; a< n; a++)fprintf (fp, "%d", T->lseq2[a]);
3531 for (a=0; a< n; a++)fprintf (fp, "%d", 1-T->lseq2[a]);
3532 if (T->isseq && debug)fprintf (fp, "%s", T->name);
3534 if (!T->parent && debug) fprintf (fp, "\nDISPLAY TREE: FINISHED");
3537 NT_node display_dist (NT_node T, int n, FILE *fp)
3545 display_dist (T->right, n, fp);
3546 display_dist (T->left, n, fp);
3548 fprintf ( stdout, "\n");
3549 for ( a=0; a< n; a++)
3550 fprintf ( stdout, " %2d ", T->idist[a]);
3551 fprintf ( stdout, "\n");
3556 NT_node check_tree (NT_node T)
3558 if (T) HERE("CHECK %s", T->name);
3561 HERE ("ERROR: Empty Group");
3564 else if (T->isseq)return T;
3568 check_tree (T->right);
3570 check_tree (T->left);
3575 NT_node new_reroot_tree( NT_node T)
3580 NT_node new_get_node (NT_node T, FILE *fp)
3587 if (!T)T=declare_tree_node (100);
3593 if (!T->right)T=T->left;
3594 else if (!T->left)T=T->right;
3595 vfree (T->parent);T->parent=NULL;
3601 scan_name_and_dist (fp, T->name, &T->dist);
3602 return new_get_node (T->parent, fp);
3606 return new_get_node (T, fp);
3610 NN=new_insert_node (T);
3615 return new_get_node (NN, fp);
3620 scan_name_and_dist (fp, NN->name, &NN->dist);
3624 return new_get_node (T, fp);
3628 int scan_name_and_dist ( FILE *fp, char *name, float *dist)
3634 c=fgetc (fp);ungetc (c, fp);
3637 if ( c==';')return 0;
3639 while ((c=fgetc(fp))!=':' && c!=EOF && c!=')' && c!=';' && c!=',')
3652 while (isdigit((c=fgetc(fp))) || c=='.' || c=='-')
3660 dist[0]=atof (number);
3663 NT_node new_insert_node (NT_node T)
3668 NN=new_declare_tree_node ();
3674 else if (T->left==NULL)
3679 else if ( T->right==NULL)
3686 NN2=new_declare_tree_node ();
3688 NN2->right=T->right;
3700 (T->right)->parent=NN;
3704 NN->left=new_declare_tree_node ();
3705 (NN->left)->parent=NN;
3710 This caused a crash when internal undefined nodes, removed 19/02/08
3715 P=NN->parent=T->parent;
3718 if (P && P->right==T)P->right=NN;
3719 else if ( P && P->left==T)P->left=NN;
3721 NN->left=new_declare_tree_node ();
3722 (NN->left)->parent=NN;
3729 NT_node new_declare_tree_node ()
3732 static int node_index;
3733 p= (NT_node)vcalloc (1, sizeof ( Treenode));
3740 p->index=++node_index;
3742 p->name=(char*)vcalloc (MAXNAMES+1,sizeof (char));
3747 int new_display_tree (NT_node T, int n)
3754 if ( T->parent)fprintf (stdout, "\nNode %d: has parents)", in);
3755 else fprintf (stdout, "\nNode %d: NO parents)", in);
3759 fprintf (stdout, "\nNode %d has Right Child", in);
3760 n=new_display_tree (T->right, n+1);
3762 else fprintf ( stdout, "\nNode %d No Right\n", in);
3766 fprintf (stdout, "\nNode %d has Left Child", in);
3767 n=new_display_tree (T->left, n+1);
3769 else fprintf ( stdout, "\nNode %d No Left\n", in);
3773 fprintf (stdout, "\nNode %d has Bot Child", in);
3774 n=new_display_tree (T->bot, n+1);
3776 else fprintf ( stdout, "\nNode %d No Bot\n", in);
3781 fprintf (stdout, "\nNode %d is %s", in, T->name);
3785 int display_tree_duplicates (NT_node T)
3791 free_sequence (S, -1);
3794 S=tree2seq (T, NULL);
3795 dup=vcalloc ( S->nseq, sizeof (int));
3797 for (a=0; a< S->nseq-1; a++)
3798 for ( b=a+1; b<S->nseq; b++)
3800 if ( strm (S->name[a], S->name[b]))
3805 for (a=0; a< S->nseq-1; a++)
3806 for ( b=a+1; b<S->nseq; b++)
3808 if ( strm (S->name[a], S->name[b]) && dup[a])
3810 fprintf ( stderr, "\nSequence %s is duplicated %d Times in the tree", S->name[a], dup[a]);
3816 int tree_contains_duplicates (NT_node T)
3821 free_sequence (S, -1);
3823 S=tree2seq (T, NULL);
3824 for (a=0; a< S->nseq-1; a++)
3825 for ( b=a+1; b<S->nseq; b++)
3827 if ( strm (S->name[a], S->name[b]))return 1;
3832 float display_avg_bootstrap ( NT_node T)
3837 tot=tree2tot_dist (T, BOOTSTRAP);
3838 n=tree2n_branches (T, BOOTSTRAP);
3839 fprintf ( stdout, "\nAVERAGE BOOTSRAP: %.3f on %d Branches\n", (n>0)?tot/n:0, n);
3840 return (n>0)?tot/n:0;
3844 int tree2n_branches(NT_node T, int mode)
3850 else if ((T->isseq && mode !=BOOTSTRAP) || !T->isseq)
3854 n+=tree2n_branches(T->right, mode);
3855 n+=tree2n_branches(T->left, mode);
3860 float tree2tot_dist ( NT_node T, int mode)
3868 else if ((T->isseq && mode !=BOOTSTRAP) || !T->isseq)
3873 t+=tree2tot_dist(T->right, mode);
3874 t+=tree2tot_dist(T->left, mode);
3878 //This function displays all the sequences within the tree sorted by node label
3879 int cmp_tree_array ( const void *vp, const void *vq);
3880 int node_sort ( char *name, NT_node T)
3886 while (T->parent)T=T->parent;
3889 array=declare_int (nseq, 2);
3890 N=tree2node (name, T);
3892 if (N==NULL)printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is not in the tree [FATAL:%s]\n", name, PROGRAM);
3893 array=display_tree_from_node (N,0,0, array);
3894 qsort ( array, nseq, sizeof (int**), cmp_tree_array);
3895 S=tree2seq(T, NULL);
3896 for (a=0; a<nseq; a++)
3897 fprintf ( stdout, ">%s %d %d\n", S->name[array[a][0]], array[a][1], array[a][2]);
3898 exit (EXIT_SUCCESS);
3901 NT_node tree2root ( NT_node R)
3903 if (R)while (R->parent)R=R->parent;
3907 NT_node tree2node (char *name, NT_node T)
3911 else if (T->leaf && strm (T->name, name)) return T;
3915 T1=tree2node ( name, T->right);
3916 T2=tree2node ( name, T->left);
3917 return (T1>T2)?T1:T2;
3921 NT_node * tree2node_list (NT_node T, NT_node *L)
3923 if (!T) return NULL;
3924 if (!L) L=vcalloc (T->node+1, sizeof (NT_node));
3926 tree2node_list (T->left, L);
3927 tree2node_list (T->right, L);
3934 int ** display_tree_from_node (NT_node T, int up, int down, int **array)
3937 if (!T || T->visited)return array;
3942 array[T->lseq[0]][0]=T->lseq[0];
3943 array[T->lseq[0]][1]=up;
3944 array[T->lseq[0]][2]=down;
3949 array=display_tree_from_node ( T->left ,up, down+1, array);
3950 array=display_tree_from_node ( T->right,up, down+1, array);
3952 array=display_tree_from_node ( T->parent,up+1, 0, array);
3958 int cmp_tree_array ( const void *p, const void *q)
3960 const int **vp=(const int**)p;
3961 const int **vq=(const int**)q;
3962 if (vp[0][1]>vq[0][1])return 1;
3963 else if ( vp[0][1]<vq[0][1]) return -1;
3964 else if ( vp[0][2]>vq[0][2]) return 1;
3965 else if ( vp[0][2]<vq[0][2]) return -1;
3969 NT_node * read_tree_list (Sequence *S)
3974 T=vcalloc ( S->nseq+1, sizeof (NT_node));
3976 for ( a=0; a<S->nseq; a++)
3979 if (S->seq && S->seq[a] && strlen (S->seq[a])<2)
3982 fname=string2file (S->seq[a], NULL, "w");
3984 T[a]=main_read_tree (fname);
3985 T[a]->file=vcalloc (strlen (S->name[a])+1, sizeof (char));
3986 sprintf (T[a]->file, "%s", S->name[a]);
3991 int treelist2dmat ( Sequence *S)
4001 T=read_tree_list (S);
4002 TS=tree2seq(T[0], NULL);
4003 fprintf (stdout, "\n%d", S->nseq);
4006 fprintf ( stdout,"\n%-10s ", S->name[a]);
4007 for ( b=0; b<n; b++)
4009 v=100-simple_tree_cmp (T[a], T[b], TS, 1);
4010 fprintf ( stdout, "%.4f ", v);
4015 exit (EXIT_SUCCESS);
4019 int treelist2leafgroup ( Sequence *S, Sequence *TS, char *taxon)
4022 int n=0,nseq, a, c,s;
4026 char *split_file, *sorted_split_file;
4029 char *name, *fname, *group, *ref_group, *list;
4033 T=read_tree_list (S);
4034 if (!TS)TS=tree2seq(T[0], NULL);
4036 name=vcalloc (1000, sizeof (char));
4037 fname=vcalloc (1000, sizeof (char));
4038 group=vcalloc (TS->nseq*10, sizeof (char));
4039 ref_group=vcalloc (TS->nseq*10, sizeof (char));
4040 list=vcalloc (100*S->nseq, sizeof (char));
4041 split_file=vtmpnam (NULL);
4042 sorted_split_file =vtmpnam (NULL);
4045 used=vcalloc (n, sizeof (int));
4047 T=read_tree_list (S);
4048 if (!TS)TS=tree2seq(T[0], NULL);
4050 fp=vfopen (split_file, "w");
4052 for ( a=0; a< S->nseq; a++)
4055 T[a]=prune_tree (T[a], TS);
4056 T[a]=recode_tree (T[a], TS);
4057 display_leaf_nb (T[a], TS->nseq,fp, S->name[a]);
4062 for (s=0; s< TS->nseq; s++)
4067 if (taxon && !(strm (taxon, TS->name[s]) ))continue;
4069 printf_system ( "cat %s | grep %s| sort > %s", split_file,TS->name[s], sorted_split_file);
4071 vfopen (sorted_split_file, "r");
4072 ref_group[0]=group[0]='\0';
4074 while ( (c=fgetc (fp))!=EOF)
4078 buf=vfgets (buf, fp);
4079 sscanf (buf, "%s %s %s\n", name, group, fname);
4081 if ( !ref_group[0]|| !strm (group, ref_group))
4085 {fprintf (stdout, "%s %6.2f %s",name, (((float)n*100)/(float)S->nseq), ref_group);
4086 for (i=0,a=0; a<nseq; a++)
4087 if (ref_group[a]=='1' && a!=s)fprintf (stdout, " %s ", TS->name[a]);
4088 fprintf ( stdout, "\n");
4089 fprintf (stdout, "\nLIST: %s\n", list);
4092 sprintf ( ref_group, "%s", group);
4093 list=strcatf (list, " %s", fname);
4099 list=strcatf (list, " %s", fname);
4104 fprintf (stdout, "%s %6.2f %s",name, (((float)n*100)/(float)S->nseq), group);
4105 for (i=0,a=0; a<nseq; a++)
4106 if (group[a]=='1' && a!=s)fprintf (stdout, " %s ", TS->name[a]);
4107 fprintf (stdout, "\nLIST %s\n", list);
4108 fprintf ( stdout, "\n");
4114 int count_tree_groups( Sequence *LIST, char *group_file)
4118 int a, b, c,n, w, wo, ng=0;
4119 int **list, ***rlist, **blist;
4125 T=read_tree_list (LIST);
4126 S=tree2seq(T[0], NULL);
4127 for ( a=0; a< LIST->nseq; a++)
4129 T[a]=prune_tree (T[a], S);
4130 T[a]=recode_tree (T[a], S);
4135 gs=vcalloc (2, sizeof (int));
4136 list=declare_int (LIST->nseq*S->nseq*2, S->nseq+1);
4138 blist=declare_int (2, S->nseq+1);
4139 for ( n=0, a=0; a< LIST->nseq; a++)
4142 tree2split_list (T[a], S->nseq, list+n, &n2);
4146 for (b=0; b<n2; b++)
4148 for ( c=0; c<S->nseq; c++)
4149 list[n+b][c]=1-list[n-n2+b][c];
4156 rlist=declare_arrayN(3, sizeof (int), 2,LIST->nseq*S->nseq, S->nseq+1);
4157 l=file2list (group_file, " ");
4162 if (!strstr (l[ng][1], "group")){ng++;continue;}
4163 g=(strm (l[ng][1], "group2"))?0:1;
4165 for (b=2; b<atoi (l[ng][0]); b++)
4167 if ((i=name_is_in_list(l[ng][b], S->name, S->nseq, 100))!=-1)rlist[g][gs[g]][i]=1;
4175 rlist=vcalloc ( 2, sizeof (int**));
4176 rlist[1]=count_int_strings (list, n, S->nseq);
4177 gs[1]=read_array_size_new (rlist[1]);
4179 rlist[0]=declare_int (S->nseq, S->nseq);
4181 for ( a=0; a<S->nseq; a++)rlist[0][a][a]=1;
4185 for (wo=w=0,a=0; a<gs[0]; a++)
4187 for (c=0; c< gs[1]; c++)
4190 for (b=0; b<S->nseq; b++)
4192 blist[0][b]=blist[1][b]=rlist[1][c][b];
4193 blist[0][b]=(rlist[0][a][b]==1)?1:blist[0][b]; //WITH GROUP 1
4194 blist[1][b]=(rlist[0][a][b]==1)?0:blist[1][b]; //wiTHOUT gROUP 1
4200 x1=(memcmp (blist[0], list[b], sizeof (int)*S->nseq)==0)?1:0;
4202 x2=(memcmp (blist[1], list[b], sizeof (int)*S->nseq)==0)?1:0;
4206 fprintf ( stdout, "\n%d ", MIN(wo, w));
4207 fprintf ( stdout, "(");
4208 for (b=0; b<S->nseq; b++)if (rlist[1][c][b])fprintf ( stdout, "%s ",S->name[b]);
4209 fprintf ( stdout, ") +/- (");
4210 for (b=0; b<S->nseq; b++)if (rlist[0][a][b])fprintf ( stdout, "%s ",S->name[b]);
4212 fprintf (stdout , ") + %d - %d Delta %d", w, wo, FABS((wo-w)));
4217 Split * print_split ( int n, int **list, Sequence *LIST, Sequence *S, char *buf, char *file);
4219 NT_node split2tree ( NT_node RT,Sequence *LIST, char *param)
4223 S=count_splits (RT, LIST, param);
4224 A=seq2aln ((S[0])->S,NULL, KEEP_GAP);
4226 return split2upgma_tree (S,A, A->nseq, "no");
4229 Split** count_splits( NT_node RT,Sequence *LIST, char *param)
4233 int a, b, c, d, n1, n2;
4234 int **list1, **list2;
4238 char *in=NULL,*in2=NULL, *out=NULL, order[100], filter[100];
4242 //+count_splits _NB_x_FILTER_<file>
4243 //_<file is a fasta file containing the list of species to keep>
4244 if (!def_param)def_param=vcalloc ( 10, sizeof (char));
4248 if (!param)param=def_param;
4251 strget_param (param, "_NB_", "0", "%d", &nb);
4252 strget_param (param, "_TLIST_", "0", "%d", &tlist);
4253 strget_param (param, "_ORDER_", "NO", "%s", order);
4254 strget_param (param, "_FILTER_", "NO", "%s", filter);
4256 fprintf ( stderr, "\nREAD TREE LIST [%d Trees...", LIST->nseq);
4257 T=read_tree_list (LIST);
4258 fprintf ( stderr, "..]");
4260 if ( !(strm (order, "NO")))
4262 if (is_newick (order))
4264 OrderT=main_read_tree (order);
4268 S=main_read_seq (order);
4273 OrderT=(RT)?RT:T[0];
4275 fprintf ( stderr, "\nTrees Ordered according to: %s", (strm (order, "NO"))?"First Tree":order);
4278 if (!S)S=tree2seq(OrderT, NULL);
4280 for (a=0; a<S->nseq; a++)
4282 fprintf ( stdout, "\n#ORDER %15s : %3d", S->name[a], a+1);
4284 if ( !strm (filter, "NO"))
4289 F=main_read_seq (filter);
4290 cache=vcalloc (S->nseq, sizeof (int));
4291 for ( a=0; a<F->nseq; a++)
4293 if ( (i=name_is_in_list (F->name[a], S->name, S->nseq, 100))!=-1)
4296 free_sequence (F, -1);
4299 main_buf=vcalloc ( S->nseq*(STRING+1), sizeof(int));
4301 list1=declare_int (S->nseq*3, S->nseq+1);
4302 list2=declare_int (S->nseq*3, S->nseq+1);
4304 for ( a=0; a< LIST->nseq; a++)
4306 T[a]=prune_tree (T[a], S);
4307 T[a]=recode_tree (T[a], S);
4317 in=vtmpnam (NULL);in2=vtmpnam(NULL); out=vtmpnam (NULL);
4319 fp=vfopen (in, "w");
4320 fp2=vfopen (in2, "w");
4321 for ( a=0; a< LIST->nseq; a++)
4324 tree2split_list (T[a], S->nseq, list2, &n2);
4325 for ( b=0; b<n2; b++)
4327 for (c=0; c< S->nseq; c++)
4328 {fprintf (fp, "%d", list2[b][c]);}
4330 for (c=0; c< S->nseq; c++)
4331 {fprintf (fp, "%d", 1-list2[b][c]);}
4334 for (c=0; c< S->nseq; c++)
4335 {fprintf (fp2, "%d", list2[b][c]);}
4337 for (c=0; c< S->nseq; c++)
4338 {fprintf (fp2, "%d", 1-list2[b][c]);}
4339 fprintf (fp2, " %s\n",LIST->name[a]);
4345 count_strings_in_file (in, out);
4346 nl=count_n_line_in_file(out);
4347 list1=declare_int (nl+1, S->nseq+2);
4349 fp=vfopen (out, "r");
4351 buf=vcalloc (measure_longest_line_in_file (out)+1, sizeof (char));
4352 while ( fscanf (fp, "%s %d",buf, &i)==2)
4354 for (a=0; a<S->nseq; a++)list1[n1][a]=buf[a]-'0';
4355 list1[n1++][S->nseq+1]=i;
4363 RT=prune_tree (RT, S);
4364 RT=recode_tree (RT, S);
4366 tree2split_list (RT, S->nseq, list1,&n1);
4367 for ( a=0; a< LIST->nseq; a++)
4370 tree2split_list (T[a], S->nseq, list2, &n2);
4371 for (b=0; b<n1; b++)
4373 for ( c=0; c<n2; c++)
4376 for (d=0; d<S->nseq; d++)
4378 if (list1[b][d]!=list2[c][d])di++;
4380 list1[b][S->nseq+1]+=(di==0 || di== S->nseq)?1:0;
4386 SL=vcalloc ( n1+1, sizeof (Split*));
4388 for (a=0; a<n1; a++)
4392 if (nb)fprintf ( stdout, "\nSPLIT: %d and its Neighborhood +/^- %d\n", a+1, nb);
4395 for (b=0; b<S->nseq; b++)if (cache[b]!=list1[a][b])cont=0;
4396 if (!cont) continue;
4398 SL[a]=print_split (a, list1, LIST, S, main_buf, (tlist==1)?in2:NULL);
4399 for (b=0; b<n1; b++)
4405 for (d=s1=s2=0,c=0; c<S->nseq; c++)
4409 d+=(list1[a][c]!=list1[b][c])?1:0;
4413 if (d<=nb &&((s1==s2)|| ((S->nseq-s1)==s2)))print_split (b, list1, LIST, S, main_buf, (tlist==1)?in2:NULL);
4420 Split * declare_split (int nseq, int ntrees);
4421 Split* print_split ( int a, int **list1, Sequence *LIST, Sequence *S, char *buf, char *split_file)
4428 SP=declare_split (S->nseq, LIST->nseq);
4430 fprintf ( stdout, "\n>");
4431 for (t=0,b=0; b<S->nseq; b++){fprintf ( stdout, "%d", list1[a][b]);t+=list1[a][b];SP->split[b]='0'+list1[a][b];}
4432 fprintf ( stdout, " NumberSplit %5d SplitSize %5d Score %5.2f %s ", list1[a][S->nseq+1],t, (float)(list1[a][S->nseq+1]*100)/LIST->nseq, (buf)?buf:"");
4433 SP->n= list1[a][S->nseq+1];
4434 SP->score=(float)(list1[a][S->nseq+1]*100)/LIST->nseq;
4437 for (f1=1,b=0; b< S->nseq; b++)
4442 if (f1==1)fprintf ( stdout, "(");
4443 else fprintf (stdout, ",");
4445 fprintf ( stdout, "%s", S->name [b]);
4448 fprintf ( stdout, ")");
4455 fp=vfopen (split_file, "r");
4456 while ( (c=fgetc(fp))!=EOF)
4460 buf=vfgets (buf, fp);
4461 if ( strstr (buf, SP->split))
4464 list=string2list (buf);
4465 fprintf ( stdout, "\n\t%s %s", SP->split, list[3]);
4466 free_char (list, -1);
4474 Split * declare_split (int nseq, int ntrees)
4477 S=vcalloc (1, sizeof (Split));
4478 S->split=vcalloc ( nseq+1, sizeof (char));
4481 int treelist2splits( Sequence *S, Sequence *TS)
4488 char *split_file, *sorted_split_file;
4489 char *buf=NULL, *ref_buf=NULL;
4492 split_file=vtmpnam (NULL);
4493 sorted_split_file =vtmpnam (NULL);
4496 used=vcalloc (n, sizeof (int));
4498 T=read_tree_list (S);
4499 if (!TS)TS=tree2seq(T[0], NULL);
4501 fp=vfopen (split_file, "w");
4504 for ( a=0; a< S->nseq; a++)
4507 T[a]=prune_tree (T[a], TS);
4508 T[a]=recode_tree (T[a], TS);
4509 display_splits (T[a], TS,fp);
4513 printf_system ("cp %s split_file", split_file);
4515 printf_system ( "cat %s | grep 1| sort > %s", split_file, sorted_split_file);
4517 fp=vfopen (sorted_split_file, "r");
4518 fprintf (stdout, "LEGEND: <#occurences> <coded split> <min group size> <(group1,)> <(group2,>\n");
4520 for ( a=0; a<TS->nseq; a++)fprintf ( stdout, "SEQ_INDEX %d %s\n", a+1, TS->name[a]);
4521 while ( (c=fgetc (fp))!=EOF)
4525 buf=vfgets (buf, fp);
4526 buf [strlen(buf)-1]='\0';
4530 ref_buf=vcalloc (strlen (buf)+1, sizeof (char));
4531 sprintf ( ref_buf, "%s", buf);
4534 else if ( !strm (buf, ref_buf))
4537 fprintf ( stdout, "SPLIT_COUNT %3d %s (", n, ref_buf);
4538 for (i=0,a=0; a<nseq; a++)
4539 if (ref_buf[a]=='1')
4541 if (i==1)fprintf(stdout, ",");
4542 fprintf (stdout, "%s", TS->name[a]);
4545 fprintf ( stdout, "),(");
4546 for (i=0,a=0; a<nseq; a++)
4547 if (ref_buf[a]=='0')
4549 if (i==1) fprintf ( stdout, ",");
4550 fprintf (stdout, "%s", TS->name[a]);
4554 fprintf (stdout, ")\n");
4555 sprintf ( ref_buf, "%s", buf);
4569 int treelist2splits_old ( Sequence *S, Sequence *TS)
4576 char *split_file, *sorted_split_file;
4577 char *buf=NULL, *ref_buf=NULL;
4580 split_file=vtmpnam (NULL);
4581 sorted_split_file =vtmpnam (NULL);
4584 used=vcalloc (n, sizeof (int));
4586 T=read_tree_list (S);
4587 if (!TS)TS=tree2seq(T[0], NULL);
4589 fp=vfopen (split_file, "w");
4591 for ( a=0; a< S->nseq; a++)
4594 T[a]=prune_tree (T[a], TS);
4595 T[a]=recode_tree (T[a], TS);
4596 display_leaf_nb (T[a], TS->nseq,fp, S->name[a]);
4599 printf_system ("cp %s split_file", split_file);exit (0);
4601 printf_system ( "cat %s | grep 1| sort > %s", split_file, sorted_split_file);
4603 vfopen (sorted_split_file, "r");
4605 while ( (c=fgetc (fp))!=EOF)
4609 buf=vfgets (buf, fp);
4610 buf [strlen(buf)-1]='\0';
4614 ref_buf=vcalloc (strlen (buf)+1, sizeof (char));
4615 sprintf ( ref_buf, "%s", buf);
4618 else if ( !strm (buf, ref_buf))
4621 fprintf ( stdout, "%3d %s(", n, ref_buf);
4622 for (i=0,a=0; a<nseq; a++)
4623 if (ref_buf[a]=='1')
4625 if (i==1)fprintf(stdout, ",");
4626 fprintf (stdout, "%s", TS->name[a]);
4629 fprintf ( stdout, "),(");
4630 for (i=0,a=0; a<nseq; a++)
4631 if (ref_buf[a]=='0')
4633 if (i==1) fprintf ( stdout, ",");
4634 fprintf (stdout, "%s", TS->name[a]);
4638 fprintf (stdout, ")\n");
4639 sprintf ( ref_buf, "%s", buf);
4653 NT_node *treelist2prune_treelist (Sequence *S, Sequence *TS, FILE *out)
4658 T=read_tree_list (S);
4659 T=vrealloc (T, (S->nseq+1)*sizeof (NT_node));
4660 for (b=0,a=0; a<S->nseq; a++)
4662 T[a]=prune_tree (T[a], TS);
4663 if (tree2nleaf(T[a])<TS->nseq)
4671 T[b]=recode_tree (T[b], TS);
4672 sprintf ( S->name[b], "%s", S->name[a]);
4673 s=tree2string (T[a]);
4674 S->seq[b]=vrealloc (S->seq[b], (strlen (s)+1)*sizeof (char));
4675 sprintf (S->seq[b], "%s",s);
4676 sprintf (S->seq_comment[b], " NSPECIES: %d", TS->nseq);
4689 for (a=0; a<S->nseq; a++)
4691 print_tree (T[a], "newick", out);
4696 int** treelist2lti2 ( Sequence *S, Sequence *TS, int ngb, FILE *out);
4697 int treelist2frame (Sequence *S, Sequence *TS)
4699 int n, a, b, c,d, **r, **order;
4702 temp=duplicate_sequence (S);
4703 order= treelist2lti (temp, TS,0,stdout);
4705 TS=reorder_seq_2 (TS, order, 0, TS->nseq);
4713 temp=duplicate_sequence (S);
4714 r=treelist2groups (temp,TS, NULL, NULL);
4715 fprintf ( stdout, "\n>Tree_%d [%d %%]\n ", a+1,r[0][1]);
4716 tree=main_read_tree (temp->name[r[0][0]]);
4717 tree=prune_tree (tree, TS);
4718 print_tree (tree, "newick",stdout);
4721 free_sequence (temp,-1);
4723 exit (EXIT_SUCCESS);
4725 int** treelist2lti2 ( Sequence *S, Sequence *TS, int ngb, FILE *out)
4728 int a,b, c, d, ****dist, i;
4729 int **score, **order;
4731 score=declare_int (TS->nseq, 3);
4732 order=declare_int (TS->nseq, 2);
4735 for (a=0; a<50; a++)
4737 Sequence *seq, *trees;
4739 trees=duplicate_sequence (S);
4740 seq=duplicate_sequence (TS);
4741 for (b=0; b<TS->nseq; b++){order[b][0]=b;order[b][1]=rand()%10000;}
4742 sort_int (order, 2, 1, 0, TS->nseq-1);
4743 seq=reorder_seq_2(seq, order, 0,5);
4744 r=treelist2groups (trees,seq, NULL, NULL);
4748 score[order[b][0]][1]+=r[0][1];
4749 score[order[b][0]][2]++;
4751 HERE ("Score=%d", r[0][1]);
4753 free_sequence (seq, -1);
4754 free_sequence (trees, -1);
4758 for ( a=0; a< TS->nseq; a++)
4761 HERE ("%s => %d [%d]",TS->name[a], score[a][1]/score[a][2], score[a][2]);
4762 score[a][1]/=(score[a][2])?score[a][2]:1;
4764 sort_int_inv (score, 3, 1, 0, TS->nseq-1);
4770 int** treelist2lti ( Sequence *S, Sequence *TS, int ngb, FILE *out)
4773 int a,b, c, d, ****dist, i;
4774 float score0=0, score1=0;
4779 T=treelist2prune_treelist (S, TS,NULL);
4781 if (!ngb)ngb=TS->nseq*2;
4782 dist=vcalloc ( S->nseq, sizeof (int****));
4783 result=declare_int (TS->nseq, 2);
4784 for (a=0; a<TS->nseq; a++)
4788 for (b=0; b<TS->nseq;b++)
4792 for (c=0; c<S->nseq; c++)
4794 if (!dist[c])dist[c]=tree2dist(T[c], TS, NULL);
4795 for (d=0; d<S->nseq; d++)
4797 float score, d1, d2;
4799 if (!dist[d])dist[d]=tree2dist(T[d], TS, NULL);
4800 d1=dist[c][0][a][b];
4801 d2=dist[d][0][a][b];
4802 score=FABS((d1-d2));
4803 if (d1>ngb || d2>ngb);
4811 // if (d1 && d2) HERE ("%d %d", (int)d1, (int)d2);
4814 score_pair=(score_pair*100)/(float)n_pair;
4815 if (out)fprintf ( stdout, "\n>%-20s %-20s LTI: %7.3f [Kept %d Trees Out of %d] ", TS->name[a],TS->name[b], score_pair, S->nseq,i);
4818 score_seq=(score_seq*100)/n_seq;
4820 result[a][1]=(int)(100*score_seq);
4821 if (out)fprintf ( stdout, "\n>%-20s %-20s LTI: %7.3f [Kept %d Trees Out of %d] ", TS->name[a],"*", score_seq, S->nseq, i);
4823 sort_int (result,2,1,0, TS->nseq-1);
4828 int ***tree2dist (NT_node T, Sequence *S, int ***d)
4830 int *l0, *r0,*l1, *r1, a, b;
4834 if (!S)S=tree2seq(T, NULL);
4837 d=declare_arrayN (3, sizeof (float),2, S->nseq, S->nseq);
4839 T=recode_tree (T, S);
4842 if (!T->left)return d;
4843 if (!T->right) return d;
4845 l0=(T->left)->idist;
4846 r0=(T->right)->idist;
4848 l1=(T->left)->ldist;
4849 r1=(T->right)->ldist;
4853 for (a=0; a< S->nseq; a++)
4854 for (b=0; b<S->nseq; b++)
4856 if (l0[a]>0 && r0[b]>0)d[0][a][b]=d[0][b][a]=l0[a]+r0[b];
4857 if (l0[a]>0 && r0[b]>0)d[1][a][b]=d[1][b][a]=l1[a]+r1[b];
4860 d=tree2dist (T->left, S, d);
4861 d=tree2dist (T->right, S, d);
4869 int **tree2dist_split ( NT_node T, Sequence *S, int **dist)
4874 char *buf=NULL, **list=NULL, *split_file;
4877 if (!S)S=tree2seq(T, NULL);
4879 T=prune_tree (T, S);
4880 T=recode_tree (T, S);
4882 split_file=vtmpnam (NULL);
4883 fp=vfopen (split_file, "w");
4884 display_code (T, S->nseq,fp);
4887 list=declare_char (2*S->nseq, S->nseq+1);
4888 fp=vfopen (split_file, "r");
4890 while ((buf=vfgets (buf,fp))!=NULL)
4892 if (buf[0]=='1' || buf[0]=='0')sprintf (list[n++], "%s", buf);
4895 dist=declare_int ( S->nseq, S->nseq);
4896 for (a=0; a< S->nseq; a++)
4897 for ( b=0; b<S->nseq; b++)
4899 if (list[c][a]!=list[c][b])dist[a][b]++;
4905 int** treelist2groups (Sequence *S, Sequence *TS, char *star_node, FILE *out)
4918 T=treelist2prune_treelist (S, TS,NULL);
4919 nsn=(star_node)?atoi(star_node):0;
4921 results=declare_int (S->nseq+1, 2);
4925 for (a=0; a< S->nseq; a++)tree2star_nodes(T[a],nsn);
4928 used=vcalloc (S->nseq, sizeof (int));
4929 for (ntop=0,a=0; a<S->nseq; a++)
4935 if (out)fprintf ( out, "\nTree %s:",S->name[a]);
4940 for ( b=0; b<S->nseq; b++)
4944 v=(int)simple_tree_cmp (T[a], T[b], TS, 1);
4949 if (out)fprintf (stdout," %s ", S->name[b]);
4954 if (out)fprintf ( stdout, "__ N=%d\n", tot-1);
4958 for (n=0,a=0; a<S->nseq; a++)
4962 if (out)fprintf ( out, "\n>%-15s %4d %6.2f TOPOLOGY_LIST\n", S->name[a], used[a]-1, (float)(((float)used[a]-1)*100/(float)S->nseq));
4963 if (out)print_tree (T[a], "newick_tree", out);
4965 results[n][1]=((used[a]-1)*100)/i;
4970 for (a=0; a<S->nseq; a++) free_tree(T[a]);
4973 if (out)fprintf ( stdout, "\nTotal Number of different topologies: %d\n", ntop);
4975 sort_int_inv (results,2,1,0, n-1);
4976 for (a=0; a<S->nseq; a++) free_tree(T[a]);
4980 float simple_tree_cmp (NT_node T1, NT_node T2,Sequence *S, int mode)
4982 Tree_sim *TS1, *TS2;
4985 TS1=vcalloc (1, sizeof (Tree_sim));
4986 TS2=vcalloc (1, sizeof (Tree_sim));
4989 T1=recode_tree(T1, S);
4990 T2=recode_tree(T2, S);
4992 n=new_compare_trees ( T1, T2, S->nseq, TS1);
4993 new_compare_trees ( T2, T1, S->nseq, TS2);
4997 t=(TS1->uw+TS2->uw)*100/(TS1->max_uw+TS2->max_uw);
4998 w=(TS1->w+TS2->w)*100/(TS1->max_w+TS2->max_w);
4999 l=(TS1->d+TS2->d)*100/(TS1->max_d+TS2->max_d);
5001 vfree (TS1); vfree (TS2);
5002 if ( mode ==1)return t;
5003 else if (mode ==2) return w;
5006 int treelist2n (NT_node *L)
5012 int **treelist2avg_treecmp (NT_node *L, char *file)
5017 if (file) L=read_tree_list (main_read_seq(file));
5020 score=declare_int (n, 2);
5021 for (a=0; a<n; a++)score[a][0]=a;
5023 for (a=0; a<n-1; a++)
5025 output_completion (stderr,a,n,1, "Tree Cmp");
5026 for (b=a+1; b<n; b++)
5029 ts=tree_cmp (L[a],L[b]);
5030 score[a][1]+=ts->uw;
5031 score[b][1]+=ts->uw;
5035 sort_int_inv (score, 2, 1, 0, n-1);
5036 if (file)free_treelist(L);
5039 NT_node treelist2filtered_bootstrap ( NT_node *L,char *file, int **score, float t)
5044 if (t==1 || t==0 || !score)return treelist2bootstrap (L, file);
5046 if (file)L=read_tree_list (main_read_seq(file));
5050 if (n==0) return NULL;
5052 L2=vcalloc ( n+1, sizeof (NT_node));
5054 L2[a]=L[score[a][0]];
5056 BT=treelist2bootstrap (L2, NULL);
5059 if (file)free_treelist(L);
5063 NT_node treelist2bootstrap ( NT_node *L, char *file)
5071 file=vtmpnam (NULL);
5072 vfclose (print_tree_list (L,"newick", vfopen (file, "w")));
5075 outfile=vtmpnam (NULL);
5077 printf_system( "msa2bootstrap.pl -i %s -o %s -input tree >/dev/null 2>/dev/null", file, outfile);
5079 T=main_read_tree (outfile);
5080 T=tree_dist2normalized_tree_dist (T,treelist2n(L));
5088 Sequence * treelist2seq (Sequence *S)
5097 name=vcalloc (1, sizeof (char*));
5098 fp=vfopen ((fname=vtmpnam (NULL)), "w");
5100 T=read_tree_list (S);
5101 for (n=0,a=0; a< S->nseq; a++)
5103 TS=tree2seq(T[a], NULL);
5104 for (b=0; b<TS->nseq; b++)
5106 if ( (i=name_is_in_list (TS->name[b], name, n, 100))==-1)
5108 name[n]=vcalloc (100, sizeof (int));
5109 sprintf ( name[n], "%s", TS->name[b]);
5111 name=vrealloc (name, (n+1)*sizeof (char*));
5112 fprintf ( fp, ">%s\n", TS->name[b]);
5115 free_sequence(TS, TS->nseq);
5121 return get_fasta_sequence (fname, NULL);
5125 Sequence * treelist2sub_seq ( Sequence *S, int f)
5128 int a,b,c, s, i, n, maxnseq, tot;
5129 int **count, **grid;
5133 if (!f)return treelist2seq(S);
5136 //keep as many taxons as possible so that f% of the trees are kept
5137 //1: count the frequency of each taxon
5139 FS=treelist2seq (S);
5142 count=declare_int (maxnseq, 3);
5143 grid=declare_int (S->nseq,maxnseq+1);
5144 T=read_tree_list (S);
5148 for (a=0; a<FS->nseq; a++){count[a][0]=a;count[a][2]=1;}
5149 for (n=0,a=0; a< S->nseq; a++)
5151 TS=tree2seq(T[a], NULL);
5152 for (b=0; b<TS->nseq; b++)
5154 i=name_is_in_list (TS->name[b], FS->name, FS->nseq, 100);
5155 if ( i==-1){exit (EXIT_FAILURE);}
5159 free_sequence(TS, TS->nseq);
5163 sort_int ( count,3,1, 0, maxnseq-1);
5165 for (a=0; a<maxnseq; a++)
5168 for ( b=0; b< S->nseq; b++)grid[b][maxnseq]=1;//prepare to keep everything
5169 for ( tot=S->nseq, b=0; b< S->nseq; b++)
5171 for (c=0; c<maxnseq; c++)
5174 if (count[c][2] && !grid[b][s])
5182 tot=(tot*100)/S->nseq;
5185 if (tot<f)return NULL;
5187 fname=vtmpnam (NULL);
5188 fp=vfopen (fname, "w");
5189 for (a=0; a<maxnseq; a++)
5193 fprintf ( fp, ">%s LIMIT: %d %%\n", FS->name[count[a][0]], f);
5198 free_int (grid, -1); free_int (count, -1);
5199 free_sequence (FS, FS->nseq);
5201 return get_fasta_sequence (fname, NULL);
5203 /*********************************COPYRIGHT NOTICE**********************************/
5204 /*© Centro de Regulacio Genomica */
5206 /*Cedric Notredame */
5207 /*Tue Oct 27 10:12:26 WEST 2009. */
5208 /*All rights reserved.*/
5209 /*This file is part of T-COFFEE.*/
5211 /* T-COFFEE is free software; you can redistribute it and/or modify*/
5212 /* it under the terms of the GNU General Public License as published by*/
5213 /* the Free Software Foundation; either version 2 of the License, or*/
5214 /* (at your option) any later version.*/
5216 /* T-COFFEE is distributed in the hope that it will be useful,*/
5217 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
5218 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
5219 /* GNU General Public License for more details.*/
5221 /* You should have received a copy of the GNU General Public License*/
5222 /* along with Foobar; if not, write to the Free Software*/
5223 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
5224 /*............................................... |*/
5225 /* If you need some more information*/
5226 /* cedric.notredame@europe.com*/
5227 /*............................................... |*/
5231 /*********************************COPYRIGHT NOTICE**********************************/