--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include "io_lib_header.h"
+#include "util_lib_header.h"
+#include "dp_lib_header.h"
+#include "define_header.h"
+
+#define TOPOLOGY 1
+#define WEIGHTED 2
+#define LENGTH 3
+#define RECODE 4
+
+int distance_tree;
+int rooted_tree;
+int tot_nseq;
+static NT_node compute_fj_tree (NT_node T, Alignment *A, int limit, char *mode);
+static NT_node compute_cw_tree (Alignment *A);
+static NT_node compute_std_tree (Alignment *A, int n, char **arg);
+static NT_node tree2fj_tree (NT_node T);
+int tree_contains_duplicates (NT_node T);
+int display_tree_duplicates (NT_node T);
+
+static int compare_node1 ( int *b1, int *b2, int n);
+static int compare_node2 ( int *b1, int *b2, int n);
+static int find_seq_chain (Alignment *A, int **sim,int *used,int seq0,int seq1, int seq2,int chain_length, int limit, int max_chain, int *nseq);
+int new_display_tree (NT_node T, int n);
+NT_node display_code (NT_node T, int nseq, FILE *fp);
+NT_node display_dist (NT_node T, int n, FILE *fp);
+/*********************************************************************/
+/* */
+/* dpa_tree_manipulation */
+/* */
+/*********************************************************************/
+static NT_node code_dpa_tree ( NT_node T, int **D);
+NT_node collapse_sub_tree ( NT_node T,int nseq, int *list, char *new_name);
+NT_node seq2dpa_tree (Sequence *S, char *mode)
+{
+ Constraint_list *CL;
+ NT_node **T;
+ NT_node Tree;
+ CL=declare_constraint_list_simple (S);
+ CL->local_stderr=NULL;
+
+
+ CL->DM=cl2distance_matrix (CL,NOALN,(mode==NULL)?"ktup":mode, NULL, 0);
+
+ T=int_dist2nj_tree ( (CL->DM)->similarity_matrix, S->name, S->nseq, vtmpnam (NULL));
+ Tree=T[3][0];
+
+ Tree=recode_tree (Tree, S);
+ Tree=reset_dist_tree (Tree, -1);
+
+ Tree=code_dpa_tree (Tree, (CL->DM)->similarity_matrix);
+ free_distance_matrix (CL->DM);
+ return Tree;
+}
+
+NT_node tree2dpa_tree (NT_node T, Alignment *A, char *mode)
+{
+ /*This Function sets the branches with Length values used by DP*/
+ /*The tree must be rooted*/
+ Sequence *S;
+ int **D;
+
+ S=aln2seq (A);
+ T=recode_tree (T, S);
+ T=reset_dist_tree (T, -1);
+ D=get_sim_aln_array (A,mode);
+
+ T=code_dpa_tree (T, D);
+ return T;
+}
+
+NT_node code_dpa_tree ( NT_node T, int **D)
+{
+ if ( !T) return T;
+ else if ( T->leaf==1)
+ {
+ T->dist=100;
+ return T;
+ }
+ else
+ {
+ int nl, *ll;
+ int nr, *lr;
+ int a, b, min=100;
+ float tot, n=0;
+
+ nl=(T->left)->nseq;ll=(T->left)->lseq;
+ nr=(T->right)->nseq;lr=(T->right)->lseq;
+
+ for (tot=0,n=0, a=0; a< nl; a++)
+ for ( b=0; b< nr; b++, n++)
+ {
+ tot+=D[ll[a]][lr[b]];
+ min=MIN(min,(D[ll[a]][lr[b]]));
+ }
+ /* T->dist=(mode==AVERAGE)?(tot/n):min:;*/
+ T->dist=(n>0)?tot/n:0;
+ T->dist=min;
+ code_dpa_tree ( T->right, D);
+ code_dpa_tree ( T->left, D);
+ return T;
+ }
+}
+static int group_number;
+char *tree2Ngroup (Alignment *A, NT_node T, int max_n, char *fname, char *mat)
+{
+ double top, bot, mid, pmid;
+ Sequence *S;
+ int n;
+
+
+
+ if (!T)
+ {
+ char **list;
+
+ list=declare_char ( 2, 100);
+ sprintf (list[0], "%s",mat);
+
+ fprintf ( stderr, "\nCompute Phylogenetic tree [Matrix=%s]", mat);
+ T=compute_std_tree(A,1, list);
+ fprintf ( stderr, "\nCompute dpa tree");
+ T=tree2dpa_tree (T,A, mat);
+ }
+
+ S=tree2seq(T, NULL);
+
+ if ( max_n<0)
+ {
+ max_n*=-1;
+ n=tree2group_file (T,S,0, max_n, fname);
+ fprintf ( stderr, "\n#TrimTC: Split in %d Groups at a minimum of %d%% ID\n",n, (int)max_n);
+ return fname;
+
+ }
+ else if ( max_n>0)
+ {
+ if ( max_n>S->nseq)max_n=S->nseq;
+
+ top=100; bot=0;
+ pmid=0; mid=50;
+ n=tree2group_file(T, S,0, (int)mid,fname);
+ mid=dichotomy((double)n, (double)max_n,(pmid=mid), &bot, &top);
+ while (n!=max_n && (int)pmid!=(int)mid)
+ {
+ n=tree2group_file(T, S,0, (int)mid, fname);
+ mid=dichotomy((double)n, (double)max_n,(pmid=mid), &bot, &top);
+ }
+ fprintf ( stderr, "\nDONE2");
+ fprintf ( stderr, "\n#TrimTC: Split in %d Groups at a minimum of %d%% ID\n",n, (int)mid);
+ return fname;
+ }
+ return NULL;
+}
+static int group_number;
+int tree2group_file ( NT_node T,Sequence *S, int maxnseq, int minsim, char *name)
+ {
+ FILE *fp;
+
+
+ fp=vfopen (name, "w");
+ vfclose (tree2group (T, S,maxnseq,minsim, "tree2ngroup",fp));
+
+ return count_n_line_in_file(name);
+ }
+
+
+FILE * tree2group ( NT_node T,Sequence *S, int maxnseq, int minsim,char *name, FILE *fp)
+{
+ if ( !T)return fp;
+ else
+ {
+ int m,d;
+
+ m=(maxnseq==0)?S->nseq:maxnseq;
+ d=minsim;
+
+
+
+ if ( T->nseq<=m && T->dist>=d)
+ {
+ int a;
+ fprintf ( fp, ">%s_%d ", (name)?name:"", ++group_number);
+ for ( a=0; a< T->nseq; a++)
+ fprintf ( fp, "%s ", S->name[T->lseq[a]]);
+ fprintf (fp, "\n");
+ if (!T->parent)group_number=0;
+ return fp;
+ }
+ else
+ {
+ fp=tree2group (T->right, S, maxnseq, minsim, name,fp);
+ fp=tree2group (T->left, S, maxnseq, minsim, name,fp);
+ if (!T->parent)group_number=0;
+ return fp;
+ }
+
+ }
+}
+
+
+NT_node tree2collapsed_tree (NT_node T, int n, char **string)
+{
+ char ***list;
+ Sequence *A;
+ int a, *nlist;
+
+
+ A=tree2seq(T, NULL);
+ T=recode_tree(T, A);
+ list=vcalloc (A->nseq, sizeof (char***));
+ nlist=vcalloc (A->nseq, sizeof (int));
+ if ( n==0)return T;
+ else if (n>1)
+ {
+ int l;
+ char *buf;
+
+ for (l=0,a=0; a< n; a++)l+=strlen (string[a]);
+ buf=vcalloc ( 2*n+l+1, sizeof (char));
+ for (a=0; a< n; a++){buf=strcat (buf,string[a]), buf=strcat ( buf, " ");}
+ list[0]=string2list (buf);
+ vfree (buf);
+ }
+ else if ( file_exists (NULL,string[0]))
+ {
+ list=read_group (string[0]);
+ }
+ else
+ {
+ fprintf (stderr, "\nERROR: file <%s> does not exist [FATAL:%s]\n",string[0], PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+
+
+ a=0;
+ while (list[a])
+ {
+ int i, b;
+ n=atoi (list[a][0]);
+ for (b=0; b<A->nseq; b++)nlist[b]=0;
+ for (b=2; b<n; b++)
+ {
+ i=name_is_in_list (list[a][b], A->name, A->nseq, MAXNAMES);
+ nlist[i]=1;
+ }
+ T=collapse_sub_tree ( T,A->nseq,nlist,list[a][1]);
+ free_char (list[a], -1);
+ a++;
+ }
+ vfree (list);
+ return T;
+}
+
+NT_node collapse_sub_tree ( NT_node T,int nseq, int *list, char *new_name)
+{
+ if (!T) return T;
+ else
+ {
+ int a=0;
+
+
+ while (a<nseq && list[a]==T->lseq2[a]){a++;}
+ if (a==nseq)
+ {
+ sprintf ( T->name, "%s", new_name);
+ T->leaf=T->isseq=1;
+ T->left=T->right=NULL;
+ return T;
+ }
+ else
+ {
+ collapse_sub_tree (T->right, nseq, list, new_name);
+ collapse_sub_tree (T->left, nseq, list, new_name);
+ return T;
+ }
+ }
+}
+
+/*********************************************************************/
+/* */
+/* tree pruning */
+/* */
+/* */
+/*********************************************************************/
+NT_node remove_leaf ( NT_node T);
+NT_node prune_root (NT_node T);
+NT_node main_prune_tree ( NT_node T, Sequence *S)
+{
+ T=prune_tree ( T, S);
+ return T;
+}
+
+NT_node prune_tree ( NT_node T, Sequence *S)
+{
+
+ if (!T ) return T;
+
+ if (T->leaf && T->isseq && name_is_in_list (T->name,S->name, S->nseq, 100)==-1)
+ {
+ NT_node C, P, PP;
+
+ P=T->parent;
+ if ( !P)
+ {
+ int a;
+ for (a=0; a< S->nseq; a++)
+ {
+ HERE ("prune pb ---%s", S->name[a]);
+ }
+ myexit (EXIT_FAILURE);
+ }
+ C=(P->right==T)?P->left:P->right;
+ PP=C->parent=P->parent;
+
+ if (PP && PP->right==P)PP->right=C;
+ else if (PP)PP->left=C;
+ else
+ {
+ if (T==P->right)P->right=NULL;
+ else P->left=NULL;
+ T=C;
+
+ }
+ }
+ else
+ {
+ prune_tree (T->left, S);
+ prune_tree (T->right, S);
+ }
+ return prune_root(T);
+}
+
+NT_node prune_root (NT_node T)
+{
+ //This function prunes the root if needed (and frees it).
+ if (T->parent)return T;
+
+ if (!T->right && T->left)
+ {
+ return prune_root (T->left);
+ }
+ else if (T->right && !T->left)
+ {
+
+ return prune_root (T->right);
+ }
+ else
+ {
+ return T;
+ }
+}
+/*********************************************************************/
+/* */
+/* tree comparison */
+/* */
+/* */
+/*********************************************************************/
+int main_compare_cog_tree (NT_node T1, char *cogfile)
+{
+ char ***array;
+ int a, nbac, n=0, p, c, b;
+ Alignment *A;
+
+ array=file2list(cogfile, ";\n");
+ nbac=atoi(array[0][0])-2;
+
+ A=declare_aln2 (nbac+1, 10);
+ for (a=0; a<nbac; a++)
+ {
+ sprintf ( A->name[a], "%s", array[0][a+2]);
+ A->seq_al[a][0]='a';
+ A->seq_al[a][1]='\0';
+
+ }
+ sprintf ( A->name[nbac], "cons");
+
+ A->nseq=nbac+1;
+ A->len_aln=1;
+
+
+ n=3;
+ while (array[n]!=NULL)
+ {
+ for (b=0; b<nbac; b++)
+ {
+ p=atoi (array[1][b+2]);p=(p==0)?'O':'I';
+ c=atoi (array[n][b+2]);c=(c==0)?'O':'I';
+ A->seq_al[b][0]=p;
+ A->seq_al[b][1]=c;
+ A->seq_al[b][2]='\0';
+
+ }
+ sprintf (A->file[0], "%s", array[n][1]);
+ A->len_aln=2;
+ main_compare_aln_tree (T1, A, stdout);
+ n++;
+ }
+ return n;
+}
+
+
+int main_compare_aln_tree (NT_node T1, Alignment *A, FILE *fp)
+{
+ int n=0;
+
+ fprintf ( fp, "\nTOT_CLASH COG %s N %d", A->file[0], compare_aln_tree (T1, A, &n, fp));
+ vfclose (fp);
+ return n;
+}
+
+int compare_aln_tree (NT_node T, Alignment *A, int *n, FILE *fp)
+{
+ if (T->leaf)
+ {
+ int i;
+ i=name_is_in_list (T->name, A->name, A->nseq, 100);
+ T->seqal=A->seq_al[i];
+ return 0;
+ }
+ else
+ {
+ char *seq1, *seq2;
+ if (!(T->left )->seqal)compare_aln_tree (T->left, A,n, fp);
+ if (!(T->right)->seqal)compare_aln_tree (T->right, A,n, fp);
+
+ seq1=(T->left)->seqal;
+ seq2=(T->right)->seqal;
+ (T->left)->seqal=(T->right)->seqal=NULL;
+ if ( seq1 && seq2)
+ {
+ if (strm (seq1, seq2))
+ {
+ T->seqal=seq1;
+
+ }
+ else
+ {
+
+ if (seq1[0]!=seq2[0] && seq1[1]!=seq2[1])
+ {
+ fprintf ( fp, "\nNODE_CLASH: COG %s (%s,%s):(",A->file[0],seq1,seq2 );
+ display_leaf_below_node (T->left, fp);
+ fprintf ( fp, ");(");
+ display_leaf_below_node (T->right, fp);
+ fprintf ( fp, ")");
+ n[0]++;
+ }
+ }
+ }
+ }
+ return n[0];
+}
+//**********************************************************************
+
+int compare_split (int *s1, int *s2, int l);
+int get_split_size (int *s, int l);
+
+int main_compare_splits ( NT_node T1, NT_node T2, char *mode,FILE *fp)
+{
+ Sequence *S1, *S2, *S;
+ int a, b;
+
+
+
+ int **sl1, n1;
+ int **sl2, n2;
+ if ( tree_contains_duplicates (T1))
+ {
+ display_tree_duplicates (T1);
+ printf_exit (EXIT_FAILURE, stderr, "\nFirst Tree Contains Duplicated Sequences [main_compare_trees][FATAL:%s]", PROGRAM);
+
+ }
+ else if ( tree_contains_duplicates (T2))
+ {
+ display_tree_duplicates (T2);
+ printf_exit (EXIT_FAILURE, stderr, "\nSecond Tree Contains Duplicated Sequences [main_compare_trees]");
+
+ }
+
+ //Identify the commom Sequence Set
+ S1=tree2seq(T1, NULL);
+
+
+ S2=tree2seq(T2, NULL);
+
+
+ S=trim_seq ( S1, S2);
+
+ //Prune the trees and recode the subtree list
+ T1=prune_tree (T1, S);
+ T1=recode_tree(T1, S);
+
+ T2=prune_tree (T2, S);
+ T2=recode_tree(T2, S);
+ HERE ("1");
+ sl1=declare_int (10*S->nseq, S->nseq);
+ sl2=declare_int (10*S->nseq, S->nseq);
+
+ HERE ("2");
+ n1=n2=0;
+ tree2split_list (T1, S->nseq, sl1, &n1);
+ tree2split_list (T2, S->nseq, sl2, &n2);
+
+ for (a=0; a<n1; a++)
+ {
+ int n, best, s;
+
+ n=get_split_size (sl1[a], S->nseq);
+ for (best=0,b=0; b<n2; b++)
+ {
+ s=compare_split (sl1[a], sl2[b], S->nseq);
+ best=MAX(s,best);
+ }
+ fprintf ( fp, "\n%4d %4d ", MIN(n,(S->nseq)), best);
+ for (b=0; b<S->nseq; b++)fprintf ( fp, "%d", sl1[a][b]);
+ }
+
+ free_sequence (S, -1);
+ free_sequence (S1, -1);
+ free_sequence (S2, -1);
+ myexit (EXIT_SUCCESS);
+ return 1;
+}
+int compare_split (int *s1, int *s2, int l)
+{
+ int n1, n2, score1, score2, a;
+ n1=get_split_size (s1, l);
+ n2=get_split_size (s2, l);
+
+ for (score1=0,a=0; a< l; a++)
+ {
+ score1+=(s1[a]==1 && s2[a]==1)?1:0;
+ }
+ score1=(score1*200)/(n1+n2);
+
+ for ( score2=0, a=0; a<l; a++)
+ {
+ score2+=(s1[a]==0 && s2[a]==1)?1:0;
+ }
+ score2=(score2*200)/((l-n1)+n2);
+ return MAX(score1, score2);
+}
+int get_split_size (int *s, int l)
+{
+ int a, b;
+ for (a=b=0; a<l; a++)b+=s[a];
+ return b;
+}
+//**********************************************************************
+ //
+ //
+ // TREEE COMPARISON FUNCTIONS
+ //
+ //
+ //
+ //////////////////////////////////////////////////////////////////////
+
+//JM_START
+void normalizeScore(float *score, int len)
+{
+ int i;
+ float SCORE_MIN = FLT_MAX;
+ float SCORE_MAX = FLT_MIN;
+ for(i = 0; i < len; i++)
+ {
+ if(score[i] < SCORE_MIN)
+ SCORE_MIN = score[i];
+ if(score[i] > SCORE_MAX)
+ SCORE_MAX = score[i];
+ }
+ for(i = 0; i < len; i++)
+ score[i] = (9*(score[i]-SCORE_MIN)/(SCORE_MAX-SCORE_MIN));
+}
+
+int new_compare_trees ( NT_node T1, NT_node T2, int nseq, Tree_sim *TS);
+NT_node new_search_split (NT_node T, NT_node B, int nseq);
+int new_compare_split ( int *b1, int *b2, int n);
+Tree_sim* tree_scan_pos (Alignment *A, int start, int end, char *ptree, NT_node RT);
+Tree_sim* tree_scan_pos_woble (Alignment *A, int center, int max, char *ptree, NT_node RT, int *br, int *bl );
+Tree_sim* tree_scan_pair_pos (Alignment *A, int start, int end, int start2, int end2,char *ptree, NT_node RT);
+Tree_sim* tree_scan_multiple_pos (int *poslist, int *wlist,int nl, Alignment *A, char *ptree, NT_node RT);
+NT_node aln2std_tree(Alignment *A, int ipara1, int ipara2, char *mode);
+
+NT_node tree_scan (Alignment *A,NT_node RT, char *pscan, char *ptree)
+{
+ int l, a,ax, c, cx, b;
+ char mode[100];
+ int start, w;
+ int nl, *poslist;
+ char posfile[100];
+
+ char *pcFileName = A->file[0];
+ char prefix[200] ={0};
+ int len = (strrchr(pcFileName,'.')?strrchr(pcFileName,'.')-pcFileName:strlen(pcFileName));
+ strncpy(prefix, pcFileName, len);
+
+ float *fascore;
+ char out_format[100];
+
+ char *score_csv_file = vcalloc(200, sizeof (char));
+ char *score_html_file = vcalloc(200, sizeof (char));
+ char *hit_matrix_file = vcalloc(200, sizeof (char));
+ char *hit_html_file = vcalloc(200, sizeof (char));
+ char *tree_file = vcalloc(200, sizeof (char));
+
+ sprintf(score_csv_file, "%s%s", prefix, ".score_csv");
+ sprintf(score_html_file, "%s%s", prefix, ".ts_html");
+ sprintf(hit_matrix_file, "%s%s", prefix, ".hit_matrix");
+ sprintf(hit_html_file, "%s%s", prefix, ".hit_html");
+ sprintf(tree_file, "%s%s", prefix, ".trees_txt");
+
+ if ( pscan && strstr ( pscan, "help"))
+ {
+ fprintf ( stdout, "\n+tree_scan| _W_ : Window size for the tree computation|STD size in norscan mode");
+ fprintf ( stdout, "\n+tree_scan| _MODE_ : Mode for the number of windows (single, double, list, scan, pairscan, norscan, hit, norhit)");
+ fprintf ( stdout, "\n+tree_scan| _MINW_ : Minimum Window size when using the scan mode (4)");
+ fprintf ( stdout, "\n+tree_scan| _OUTTREE_ : specify the format of outputing tree in every position (default: not ouput)");
+ myexit (EXIT_SUCCESS);
+ }
+
+ strget_param (pscan, "_W_", "5", "%d",&w);
+ strget_param (pscan, "_MODE_", "single", "%s",mode);
+ strget_param (pscan, "_MINW_", "1", "%d",&start);
+ strget_param (pscan, "_POSFILE_", "NO", "%s", posfile);
+ strget_param (pscan, "_OUTTREE_", "", "%s", &out_format);
+
+ if(strlen(out_format) > 1)
+ unlink(tree_file);
+
+ l=intlen (A->len_aln);
+
+ poslist=vcalloc ( A->len_aln, sizeof (int));
+ nl=0;
+ fascore = vcalloc(A->len_aln, sizeof (float));
+
+ if ( strm (posfile, "NO"))
+ {
+
+ for ( a=0; a< A->len_aln; a++)poslist[nl++]=a+1;
+ }
+ else
+ {
+ int *p;
+ p=file2pos_list (A,posfile);
+ poslist=pos2list (p, A->len_aln, &nl);
+ for (a=0; a<nl; a++)poslist[a]++;
+ vfree (p);
+ }
+
+//For tree hit
+ NT_node *TreeArray = vcalloc(nl, sizeof (NT_node));
+
+ if ( strm (mode, "woble"))
+ {
+ for (ax=0; ax<nl; ax++)
+ {
+ Tree_sim *TS;
+ int left=0, right=0;
+ a=poslist[ax];
+
+ TS=tree_scan_pos_woble (A,a,w,ptree, RT, &left, &right);
+ fprintf ( stdout, "P: %*d I: %*d %*d SIM: %6.2f\n", l,a,l,left,l,right,TS->uw);
+ vfree (TS);
+ }
+ }
+ else if ( strm (mode, "single"))
+ {
+ for (b=0,ax=0; ax<nl; ax++)
+ {
+
+ Tree_sim *TS;
+ int pstart, pend;
+
+ a=poslist[ax];
+
+ pstart=a-b;
+ pend=a+b;
+
+ if (pstart<1 || pstart>A->len_aln)continue;
+ if (pend<1 || pend>A->len_aln)continue;
+ TS=tree_scan_pos (A, pstart,pend, ptree, RT);
+ fprintf ( stdout, "P: %*d I: %*d %*d SIM: %6.2f L: %2d\n", l,a,l,pstart,l,pend,TS->uw, (w*2)+1);
+ vfree (TS);
+ }
+ }
+ else if (strm (mode, "scan")||strm (mode, "hit"))
+ {
+ FILE *fp_ts;
+ fp_ts=vfopen (score_csv_file, "w");
+ fprintf ( fp_ts, "Position,Win_Beg,Win_End,Similarity,Win_Len\n");
+ for ( ax=0; ax<nl; ax++)
+ {
+ float best_score=0;
+ int best_pos=0, best_w=0, best_start, best_end;
+
+ a=poslist[ax];
+ best_pos = best_start = best_end = a;
+ for (b=start; b<=w; b++)
+ {
+ Tree_sim *TS;
+ int pstart, pend;
+ pstart=a-b;
+ pend=a+b;
+
+ if (pstart<1 || pstart>A->len_aln)continue;
+ if (pend<1 || pend>A->len_aln)continue;
+ TS=tree_scan_pos (A, pstart,pend, ptree, RT);
+ if (TS->uw>=best_score)
+ {best_score=TS->uw;best_w=b;best_start=pstart; best_end=pend;}
+ vfree (TS);
+ }
+ fprintf (fp_ts, "%*d,%*d,%*d,%6.2f,%2d\n", l,best_pos, l,best_start, l,best_end, best_score,(best_w*2)+1);
+ fascore[ax]=(float)best_score;
+ if(strlen(out_format) > 1)
+ vfclose (print_tree (aln2std_tree(A, best_start, best_end, mode), out_format, vfopen (tree_file, "a+")));
+ if(strm (mode, "hit"))
+ TreeArray[ax] = aln2std_tree(A, best_start, best_end, mode);
+ }
+ vfclose(fp_ts);
+ }
+//tree scan by using normal distribution window
+//or
+//generate hit matrix
+ else if ( strm (mode, "norscan")||strm (mode, "norhit"))
+ {
+ FILE *fp_ts;
+ ptree=vcalloc(100, sizeof (char));
+ fp_ts=vfopen (score_csv_file, "w");
+ fprintf ( fp_ts, "Position,Similarity,STD_Len\n");
+ for ( ax=0; ax<nl; ax++)
+ {
+ float best_score=DBL_MIN;
+ int best_STD = start;
+ a=poslist[ax];
+ for (b=start; b<=w; b++)
+ {
+ Tree_sim *TS;
+ sprintf ( ptree, "+aln2tree _COMPARE_nordisaln__STD_%d__CENTER_%d_", b, a); //should be used a or ax
+ TS=tree_scan_pos (A, 1, nl, ptree, RT);
+ if (TS->uw>=best_score)
+ {best_score=TS->uw;best_STD=b;}
+ vfree (TS);
+ }
+ fascore[ax]=best_score;
+ fprintf ( fp_ts, "%*d,%6.2f,%d\n", l,a, fascore[ax], best_STD);
+ if(strlen(out_format) > 1)
+ vfclose (print_tree (aln2std_tree(A, best_STD, a, mode), out_format, vfopen (tree_file, "a+")));
+ if(strm (mode, "norhit"))
+ TreeArray[ax] = aln2std_tree(A, best_STD, a, mode);
+ }
+ vfclose(fp_ts);
+ }
+//generate hit matrix
+ if (strm (mode, "hit")||strm (mode, "norhit"))
+ {
+//Compute the pair score of tree scan segqtion
+ fprintf (stdout, "[STRAT] Calculate the hit matrix of the tree scan\n");
+ float **ffpHitScoreMatrix;
+
+ ffpHitScoreMatrix=vcalloc (nl, sizeof (float*));
+ int i, j;
+ for(i = 0; i < nl; i++)
+ ffpHitScoreMatrix[i]=vcalloc (nl-i, sizeof (float));
+
+ fprintf (stdout, "Process positions\n", i);
+ for(i = 0; i < nl; i++)
+ {
+ fprintf (stdout, "%d, ", i);
+ for(j = i; j < nl; j++)
+ {
+ Tree_sim *TS;
+ TS=tree_cmp (TreeArray[i], TreeArray[j]);
+ ffpHitScoreMatrix[i][j-i] = TS->uw;
+ vfree (TS);
+ }
+ }
+ vfree(TreeArray);
+ fprintf (stdout, "\n");
+ output_hit_matrix(hit_matrix_file, ffpHitScoreMatrix, nl);
+ fprintf (stdout, "[END]Calculate the hit matrix of the tree scan\n");
+
+//Output Hit Score into color html
+ output_hit_color_html (A, ffpHitScoreMatrix, nl, hit_html_file);
+ vfree(ffpHitScoreMatrix);
+ }
+ else if ( strm (mode, "pairscan"))
+ {
+ int d, set;
+
+ for ( ax=0; ax<nl; ax++)
+ {
+ float best_score=0;
+ int best_pos=0, best_w=0, best_w2=0, best_start, best_end, best_pos2=0, best_start2, best_end2;
+
+ Tree_sim *TS;
+ int pstart, pend, p2start, p2end;
+ a=poslist[ax];
+ for ( cx=0; cx<nl; cx++)
+ {
+ set=0;
+ c=poslist[cx];
+ for (d=start; d<=w; d++)
+ {
+ for (b=start; b<=w; b++)
+ {
+ pstart=a-b;
+ pend=a+b;
+ p2start=c-d;
+ p2end=c+d;
+ if (pstart<1 || pstart>A->len_aln)continue;
+ if (pend<1 || pend>A->len_aln)continue;
+ if (p2start<1 || p2start>A->len_aln)continue;
+ if (p2end<1 || p2end>A->len_aln)continue;
+ if (pstart<=p2start && pend>=p2start) continue;
+ if (pstart<=p2end && pend>=p2end) continue;
+ TS=tree_scan_pair_pos (A, pstart,pend,p2start, p2end, ptree, RT);
+
+ if (TS->uw>=best_score){best_score=TS->uw; best_pos=a;best_w=b;best_start=pstart; best_end=pend; best_pos2=c, best_w2=d, best_start2=p2start, best_end2=p2end;set=1;}
+ vfree (TS);
+ }
+ }
+ if (set)fprintf ( stdout, "P1: %*d I1: %*d %*d P2: %*d I2: %*d %*d SIM: %6.2f L: %2d\n", l,best_pos, l,best_start, l,best_end, l, best_pos2, l, best_start2, l, best_end2, best_score,(best_w*2)+1 );
+
+ set=0;
+ }
+ }
+ }
+ else if ( strm (mode, "multiplescan"))
+ {
+ int n, **wlist, best_pos;
+ float best_score;
+ Tree_sim *TS;
+ wlist=generate_array_int_list (nl*2,start, w,1, &n, NULL);
+ HERE ("Scan %d Possibilities", n);
+
+ for (best_score=best_pos=0,a=0; a<n; a++)
+ {
+ TS=tree_scan_multiple_pos (poslist,wlist[a],nl, A, ptree, RT);
+ if (TS && TS->uw>best_score)
+ {
+ best_score=TS->uw;
+ fprintf ( stdout, "\n");
+ for (b=0; b<nl; b++)
+ {
+ fprintf ( stdout, "[%3d %3d %3d]", poslist[b], wlist[a][b*2], wlist[a][b*2+1]);
+ }
+ fprintf ( stdout, " SCORE: %.2f", best_score);
+ }
+ if (TS)vfree (TS);
+ }
+ }
+
+//Output Tree Scan core into color html
+ normalizeScore(fascore, nl);
+ Alignment *ST;
+ Sequence *S;
+ int i, r1;
+
+ S=A->S;
+ ST=copy_aln (A, NULL);
+ for (a=0; a<ST->nseq; a++)
+ {
+ i=name_is_in_list (ST->name[a],S->name, S->nseq, 100);
+ if ( i!=-1)
+ {
+ for (b=0; b<ST->len_aln; b++)
+ {
+ r1=ST->seq_al[a][b];
+ if ( r1!='-')
+ r1 = (int)fascore[b] + 48;
+ ST->seq_al[a][b]=r1;
+ }
+ }
+ }
+ output_color_html ( A, ST, score_html_file);
+
+//free memory
+ free_aln(ST);
+ vfree(fascore);
+ vfree(score_csv_file);
+ vfree(score_html_file);
+ vfree(hit_matrix_file);
+ vfree(hit_html_file);
+
+ myexit(EXIT_SUCCESS);return NULL;
+}
+
+NT_node aln2std_tree(Alignment *A, int ipara1, int ipara2, char *mode)
+{
+ Alignment *B;
+ NT_node T;
+ char *cpSet = vcalloc(100, sizeof (char));
+
+ if(strm (mode, "norhit"))
+ {
+ B=extract_aln (A, 1, A->len_aln);
+ sprintf ( cpSet, "+aln2tree _COMPARE_nordisaln__STD_%d__CENTER_%d_", ipara1, ipara2);
+ }
+ else
+ B=extract_aln (A, ipara1, ipara2);
+
+ T=compute_std_tree (B, (cpSet)?1:0, (cpSet)?&cpSet:NULL);
+ free_aln(B);
+ return T;
+}
+
+Tree_sim* tree_scan_multiple_pos (int *poslist, int *wlist,int nl, Alignment *A, char *ptree, NT_node RT)
+{
+ static Alignment *B;
+ static int *pos;
+ int a, b, n, s, p, left, right;
+ Tree_sim *TS;
+ NT_node T=NULL;
+
+
+ //poslist positions come [1..n]
+ vfree(pos);
+ free_aln (B);
+
+ pos=vcalloc ( A->len_aln+1, sizeof (int));
+ B=copy_aln (A, NULL);
+
+ for (a=0; a<nl; a++)
+ {
+ p=poslist[a]-1;
+ left =wlist[2*a];
+ right=wlist[2*a+1];
+ for (b=p-left; b<=p+right; b++)
+ {
+ if (b<1 ||b>A->len_aln) return NULL;
+ else pos[b]++;
+
+ if (pos[b]>1) return NULL;
+ }
+ }
+
+ for (s=0; s<A->nseq; s++)
+ {
+ for (n=0,a=1; a<=A->len_aln; a++)
+ {
+ if (pos[a])B->seq_al[s][n++]=A->seq_al[s][a-1];
+ }
+ }
+
+ B->len_aln=n;
+ for (s=0; s<A->nseq; s++)B->seq_al[s][B->len_aln]='\0';
+
+ T=compute_std_tree (B, (ptree)?1:0, (ptree)?&ptree:NULL);
+
+ TS=tree_cmp (T, RT);
+
+ free_tree(T);
+ return TS;
+ }
+
+Tree_sim* tree_scan_pair_pos (Alignment *A, int start, int end, int start2, int end2,char *ptree, NT_node RT)
+ {
+ Tree_sim *TS;
+ Alignment *B,*B1, *B2;
+ NT_node T=NULL;
+ int a;
+
+
+ B=copy_aln (A, NULL);
+ B1=extract_aln (A,start,end);
+ B2=extract_aln (A,start2, end2);
+
+
+ for ( a=0; a< B->nseq;a++)
+ sprintf (B->seq_al[a], "%s%s", B1->seq_al[a], B2->seq_al[a]);
+ B->len_aln=strlen (B->seq_al[0]);
+
+ T=compute_std_tree (B, (ptree)?1:0, (ptree)?&ptree:NULL);
+ TS=tree_cmp (T, RT);
+
+ free_tree(T);
+ free_aln (B);free_aln(B1); free_aln(B2);
+
+ return TS;
+ }
+
+Tree_sim* tree_scan_pos (Alignment *A, int start, int end, char *ptree, NT_node RT)
+ {
+ Tree_sim *TS;
+ Alignment *B;
+ NT_node T;
+
+ if ( start<1 || start>A->len_aln) return NULL;
+ if ( end<1 || end>A->len_aln) return NULL;
+
+ B=extract_aln (A,start,end);
+ T=compute_std_tree (B, (ptree)?1:0, (ptree)?&ptree:NULL);
+ TS=tree_cmp (T, RT);
+ free_tree(T);free_aln (B);
+ return TS;
+ }
+Tree_sim* tree_scan_pos_woble (Alignment *A, int center, int max, char *ptree, NT_node RT, int *br, int *bl )
+ {
+ Tree_sim *TS,*BTS;
+
+
+ int left, right;
+ float best_score=0;
+ int start, end;
+
+ br[0]=bl[0]=0;
+ BTS=vcalloc (1, sizeof (Tree_sim));
+
+ for (left=0; left<max; left++)
+ for (right=0; right<max; right++)
+ {
+ start=center-left;
+ end=center+right;
+ TS=tree_scan_pos (A, start, end, ptree, RT);
+
+ if (TS && TS->uw >best_score)
+ {
+ best_score=TS->uw;
+ BTS[0]=TS[0];
+ br[0]=right; bl[0]=left;
+ vfree(TS);
+ }
+ }
+ return BTS;
+ }
+
+Tree_sim* tree_cmp( NT_node T1, NT_node T2)
+{
+ Sequence *S1, *S2, *S;
+ int n;
+ int a;
+
+ Tree_sim *TS1, *TS2;
+
+ if ( tree_contains_duplicates (T1))
+ {
+ display_tree_duplicates (T1);
+ printf_exit (EXIT_FAILURE, stderr, "\nFirst Tree Contains Duplicated Sequences [main_compare_trees][FATAL:%s]", PROGRAM);
+
+ }
+ else if ( tree_contains_duplicates (T2))
+ {
+ display_tree_duplicates (T2);
+ printf_exit (EXIT_FAILURE, stderr, "\nSecond Tree Contains Duplicated Sequences [main_compare_trees]");
+
+ }
+
+ //Identify the commom Sequence Set
+ S1=tree2seq(T1, NULL);
+ S2=tree2seq(T2, NULL);
+
+ S=trim_seq ( S1, S2);
+
+ if ( S->nseq<=2)
+ {
+ fprintf ( stderr, "\nERROR: Your two do not have enough common leaf to be compared [FATAL:PROGRAM]");
+ }
+
+ //Prune the trees and recode the subtree list
+ T1=prune_tree (T1, S);
+ T1=recode_tree(T1, S);
+
+ T2=prune_tree (T2, S);
+ T2=recode_tree(T2, S);
+
+ TS1=vcalloc (1, sizeof (Tree_sim));
+ TS2=vcalloc (1, sizeof (Tree_sim));
+
+ new_compare_trees ( T1, T2, S->nseq, TS1);
+ new_compare_trees ( T2, T1, S->nseq, TS2);
+
+
+ TS1->n=tree2nnode (T1);
+ TS1->nseq=S->nseq;
+
+ TS2->n=tree2nnode (T2);
+ /*if (TS1->n !=TS2->n)
+ printf_exit (EXIT_FAILURE, stderr,"\nERROR: Different number of Nodes in the two provided trees after prunning [FATAL: %s]", PROGRAM);
+ */
+
+ free_sequence (S, -1);
+ free_sequence (S1, -1);
+ free_sequence (S2, -1);
+
+ TS1->uw=(TS1->uw+TS2->uw)*100/(TS1->max_uw+TS2->max_uw);
+ TS1->w=(TS1->w+TS2->w)*100/(TS1->max_w+TS2->max_w);
+ TS1->d=(TS1->d+TS2->d)*100/(TS1->max_d+TS2->max_d);
+ TS1->rf=(TS1->rf+TS2->rf)/2;
+ vfree (TS2);
+ return TS1;
+}
+int print_node_list (NT_node T, Sequence *RS)
+{
+ NT_node *L;
+ Sequence *S;
+ int *nlseq2;
+
+ S=tree2seq(T, NULL);
+ L=tree2node_list (T, NULL);
+ if (!RS)RS=S;
+
+ nlseq2=vcalloc ( RS->nseq, sizeof (int));
+ while (L[0])
+ {
+ int d,b;
+ d=MIN(((L[0])->nseq), (S->nseq-(L[0])->nseq));
+ fprintf ( stdout, "Bootstrap: %5.2f Depth: %5d Splits: ", (L[0])->bootstrap, d);
+
+ for (b=0; b<RS->nseq; b++)nlseq2[b]='-';
+ for (b=0; b<S->nseq; b++)
+ {
+ int p;
+ p=name_is_in_list (S->name[b], RS->name, RS->nseq, 100);
+ if (p!=-1)nlseq2[p]=(L[0])->lseq2[b]+'0';
+ }
+ for (b=0; b<RS->nseq; b++) fprintf ( stdout, "%c", nlseq2[b]);
+ fprintf (stdout, "\n");
+ L++;
+ }
+ return 1;
+}
+NT_node main_compare_trees_list ( NT_node RT, Sequence *S, FILE *fp)
+{
+ Tree_sim *T;
+ NT_node *TL;
+ Sequence *RS;
+ int a;
+
+ T=vcalloc (1, sizeof (Tree_sim));
+ RS=tree2seq(RT, NULL);
+
+ TL=read_tree_list (S);
+
+ reset_boot_tree (RT,0.0001);
+ for (a=0; a<S->nseq; a++)
+ {
+ TL[a]=prune_tree(TL[a],RS);
+ TL[a]=recode_tree (TL[a],RS);
+ new_compare_trees (RT, TL[a], RS->nseq,T);
+ }
+
+ vfree (T);
+ return RT;
+}
+NT_node main_compare_trees ( NT_node T1, NT_node T2, FILE *fp)
+{
+ Tree_sim *T;
+
+ T=tree_cmp (T1, T2);
+ fprintf ( fp, "\n#tree_cmp|T: %.f W: %.2f L: %.2f RF: %d N: %d S: %d", T->uw, T->w, T->d, T->rf, T->n, T->nseq);
+ fprintf ( fp, "\n#tree_cmp_def|T: ratio of identical nodes");
+ fprintf ( fp, "\n#tree_cmp_def|W: ratio of identical nodes weighted with the min Nseq below node");
+ fprintf ( fp, "\n#tree_cmp_def|L: average branch length similarity");
+ fprintf ( fp, "\n#tree_cmp_def|RF: Robinson and Foulds");
+ fprintf ( fp, "\n#tree_cmp_def|N: number of Nodes in T1 [unrooted]");
+ fprintf ( fp, "\n#tree_cmp_def|S: number of Sequences in T1\n");
+
+ vfree (T);
+ return T1;
+}
+
+int new_compare_trees ( NT_node T1, NT_node T2, int nseq, Tree_sim *TS)
+{
+ int n=0;
+ NT_node N;
+ float t1, t2;
+
+ if (!T1 || !T2) return 0;
+
+ n+=new_compare_trees (T1->left, T2, nseq,TS);
+ n+=new_compare_trees (T1->right, T2, nseq,TS);
+
+ //Exclude arbitrary splits (dist==0)
+ if ((T1->dist==0) && !(T1->parent))return n;
+
+ N=new_search_split (T1, T2, nseq);
+ t1=FABS(T1->dist);
+ t2=(N)?FABS(N->dist):0;
+ TS->max_d+=MAX(t1, t2);
+
+ if (!N)TS->rf++;
+ if (T1->nseq>1)
+ {
+ int w;
+ w=MIN((nseq-T1->nseq),T1->nseq);
+ TS->max_uw++;
+ TS->max_w+=w;
+
+ if (N)
+ {
+ TS->uw++;
+ TS->w+=w;
+ TS->d+=MIN(t1, t2);
+ T1->bootstrap++;
+ //T1->dist=T1->nseq;
+ }
+ else
+ {
+ //T1->dist=T1->nseq*-1;
+ ;
+ }
+ }
+ else
+ {
+ TS->d+=MIN(t1, t2);
+ //T1->dist=1;
+ }
+ return ++n;
+}
+NT_node new_search_split (NT_node T, NT_node B, int nseq)
+{
+ NT_node N;
+ if (!T || !B) return NULL;
+ else if ( new_compare_split (T->lseq2, B->lseq2, nseq)==1)return B;
+ else if ( (N=new_search_split (T, B->right, nseq)))return N;
+ else return new_search_split (T, B->left, nseq);
+}
+int new_compare_split ( int *b1, int *b2, int n)
+{
+ int a, flag;
+
+ for (flag=1, a=0; a<n; a++)
+ if (b1[a]!=b2[a])flag=0;
+ if (flag) return flag;
+
+ for (flag=1, a=0; a<n; a++)
+ if (b1[a]==b2[a])flag=0;
+ return flag;
+}
+
+float compare_trees ( NT_node T1, NT_node T2, int nseq,int mode)
+{
+ /*search each branch of T1 in T2*/
+ float n=0;
+
+
+ if ( !T1 || !T2)return 0;
+
+ if (getenv4debug("DEBUG_TREE_COMPARE"))display_node (T1, "\nNODE ", nseq);
+
+ if (T1->parent && T1->nseq>1)n+=search_node ( T1, T2, nseq, mode);
+
+ n+=compare_trees ( T1->left, T2, nseq, mode);
+ n+=compare_trees ( T1->right, T2, nseq, mode);
+
+ return n;
+}
+
+float search_node ( NT_node B, NT_node T, int nseq, int mode)
+{
+ int n=0;
+ if ( !B || !T) return -1;
+ if (getenv4debug("DEBUG_TREE_COMPARE"))display_node ( T, "\n\t", nseq);
+
+ n=compare_node ( B->lseq2, T->lseq2, nseq );
+
+ if ( n==1)
+ {
+ if (getenv4debug("DEBUG_TREE_COMPARE"))fprintf ( stderr, "[1][%d]", (int)evaluate_node_similarity ( B, T, nseq, mode));
+ if (mode==RECODE)B->dist=B->leaf;
+ return evaluate_node_similarity ( B, T, nseq, mode);
+ }
+ else if ( n==-1)
+ {
+ if (getenv4debug("DEBUG_TREE_COMPARE"))fprintf ( stderr, "[-1]");
+ if (mode==RECODE)B->dist=-B->leaf;
+ return 0;
+ }
+ else
+ {
+ if (getenv4debug("DEBUG_TREE_COMPARE"))fprintf ( stderr, "[0]");
+ n=search_node ( B, T->left, nseq, mode);
+ if ( n>0) return n;
+ n=search_node ( B, T->right, nseq, mode);
+ if ( n>0) return n;
+ n=search_node ( B, T->bot, nseq, mode);
+ if ( n>0) return n;
+ }
+ return n;
+}
+
+float evaluate_node_similarity ( NT_node B, NT_node T, int nseq, int mode)
+{
+int a, c;
+
+ if ( mode==TOPOLOGY || mode ==RECODE)
+ {
+ for ( a=0; a< nseq; a++)
+ if ( B->lseq2[a]!=T->lseq2[a]) return 0;
+ return 1;
+ }
+ else if ( mode == WEIGHTED)
+ {
+ for (c=0, a=0; a< nseq; a++)
+ {
+ if ( B->lseq2[a]!=T->lseq2[a]) return 0;
+ else c+=B->lseq2[a];
+ }
+ return (float)(MIN(c,nseq));
+ }
+ else if ( mode == LENGTH )
+ {
+ float d1, d2;
+
+ for (c=0, a=0; a< nseq; a++)
+ {
+ if ( B->lseq2[a]!=T->lseq2[a]) return 0;
+ }
+ d1=FABS((B->dist-T->dist));
+ d2=MAX(B->dist, T->dist);
+ return (d2>0)?(d1*100)/d2:0;
+ }
+ else
+ {
+ return 0;
+ }
+}
+int compare_node ( int *b1, int *b2, int nseq)
+{
+ int n1, n2;
+
+ n1=compare_node1 ( b1, b2, nseq);
+ /*fprintf ( stderr, "[%d]", n1);*/
+ if ( n1==1) return 1;
+
+ n2=compare_node2 ( b1, b2, nseq);
+ /* fprintf ( stderr, "[%d]", n2);*/
+ if ( n2==1)return 1;
+ else if ( n2==-1 && n1==-1) return -1;
+ else return 0;
+}
+int compare_node1 ( int *b1, int *b2, int n)
+{
+ int a;
+ int l1, l2;
+ int r=1;
+ for ( a=0; a< n; a++)
+ {
+ l1=b1[a];
+ l2=b2[a];
+ if ( l1==1 && l2==0) return -1;
+ if ( l1!=l2)r=0;
+ }
+ return r;
+}
+int compare_node2 ( int *b1, int *b2, int n)
+{
+ int a;
+ int l1, l2;
+ int r=1;
+
+ for ( a=0; a< n; a++)
+ {
+ l1=1-b1[a];
+ l2=b2[a];
+ if ( l1==1 && l2==0) return -1;
+ if ( l1!=l2) r=0;
+ }
+ return r;
+}
+void display_node (NT_node N, char *string,int nseq)
+{
+ int a;
+ fprintf ( stderr, "%s", string);
+ for (a=0; a< nseq; a++)fprintf ( stderr, "%d", N->lseq2[a]);
+}
+
+
+/*********************************************************************/
+/* */
+/* FJ_tree Computation */
+/* */
+/* */
+/*********************************************************************/
+
+
+NT_node tree_compute ( Alignment *A, int n, char ** arg_list)
+{
+ if (n==0 || strm (arg_list[0], "cw"))
+ {
+ return compute_cw_tree (A);
+ }
+ else if ( strm (arg_list[0], "fj"))
+ {
+ return compute_fj_tree ( NULL, A, (n>=1)?atoi(arg_list[1]):8, (n>=2)?arg_list[2]:NULL);
+ }
+
+ else if ( ( strm (arg_list[0], "nj")))
+ {
+ return compute_std_tree (A, n, arg_list);
+ }
+ else
+ return compute_std_tree (A, n, arg_list);
+}
+
+NT_node compute_std_tree (Alignment *A, int n, char **arg_list)
+{
+ return compute_std_tree_2 (A, NULL, list2string (arg_list, n));
+}
+NT_node compute_std_tree_2 (Alignment *A, int **s, char *cl)
+{
+
+ NT_node T, **BT=NULL;
+ char *tree_name;
+
+ char matrix[100];
+ char score [100];
+ char compare[100];
+ char tmode[100];
+ int free_s=0;
+
+ tree_name =vtmpnam (NULL);
+
+ if (strstr (cl, "help"))
+ {
+ fprintf ( stdout, "\n+aln2tree| _MATRIX_ : matrix used for the comparison (idmat, sarmat, pam250mt..)\n");
+ fprintf ( stdout, "\n+aln2tree| _SCORE_ : score mode used for the distance (sim, raw)\n");
+ fprintf ( stdout, "\n+aln2tree| _COMPARE_: comparison mode (aln, ktup, align, nordisaln)\n");
+ fprintf ( stdout, "\n+aln2tree| _TMODE_ : tree mode (nj, upgma)\n");
+ myexit (EXIT_SUCCESS);
+ }
+
+
+ //matrix: idmat, ktup,sarmat, sarmat2
+ strget_param (cl, "_MATRIX_", "idmat", "%s",matrix);
+
+ //score: sim, raw
+ strget_param (cl, "_SCORE_", "sim", "%s",score);
+
+ //compare: aln, ktup, align
+ strget_param (cl, "_COMPARE_", "aln", "%s",compare);
+
+ //compare: aln, ktup, align
+ strget_param (cl, "_TMODE_", "nj", "%s",tmode);
+
+ int STD, CENTER;
+ if ( strm (compare, "nordisaln"))
+ {
+ strget_param (cl, "_STD_", "1", "%d", &STD);
+ strget_param (cl, "_CENTER_", "5", "%d", &CENTER);
+ }
+ //Use external msa2tree methods
+ if ( strm (tmode, "cw"))
+ {
+ free_int (s, -1);
+ return compute_cw_tree (A);
+ }
+
+ //compute distance matrix if needed
+ if ( !s)
+ {
+ free_s=1;
+ if ( strm (compare, "ktup"))
+ {
+ ungap_array (A->seq_al, A->nseq);
+ s=get_sim_aln_array ( A,cl);
+ }
+ else if ( strm ( compare, "aln"))
+ {
+ if (strm (score, "sim"))
+ s=get_sim_aln_array(A, matrix);
+ else if ( strm (score, "raw"))
+ {
+ s=get_raw_sim_aln_array (A,matrix);
+ }
+ }
+ else if ( strm ( compare, "nordisaln"))
+ {
+ s=get_sim_aln_array_normal_distribution(A, matrix, &STD, &CENTER);
+ }
+ s=sim_array2dist_array(s, 100);
+ }
+
+ //Compute the tree
+ if (strm (tmode, "nj"))
+ {
+
+ BT=int_dist2nj_tree (s, A->name, A->nseq, tree_name);
+ T=main_read_tree (tree_name);
+ free_read_tree(BT);
+ }
+ else if (strm (tmode, "upgma"))
+ {
+ BT=int_dist2upgma_tree (s,A, A->nseq, tree_name);
+ T=main_read_tree (tree_name);
+ free_read_tree(BT);
+ }
+
+ if ( strm ( cl, "dpa"))
+ {
+ s=dist_array2sim_array(s, 100);
+ T=code_dpa_tree (T,s);
+ }
+
+ if (free_s)free_int (s, -1);
+ return T;
+}
+
+NT_node similarities_file2tree (char *mat)
+{
+ int **s;
+ Alignment *A;
+ char *tree_name;
+ NT_node T;
+
+
+
+ tree_name =vtmpnam (NULL);
+
+ s=input_similarities (mat,NULL, NULL);
+
+
+ A=similarities_file2aln(mat);
+ s=sim_array2dist_array(s, 100);
+
+
+ int_dist2nj_tree (s, A->name, A->nseq, tree_name);
+ T=main_read_tree(tree_name);
+ free_int (s, -1);
+ return T;
+}
+
+NT_node compute_cw_tree (Alignment *A)
+{
+ char *tmp1, *tmp2, tmp3[1000];
+
+ tmp1=vtmpnam (NULL);
+ tmp2=vtmpnam (NULL);
+
+ sprintf ( tmp3, "%s.ph", tmp1);
+ output_clustal_aln (tmp1, A);
+ printf_system ("clustalw -infile=%s -tree -newtree=%s %s ", tmp1,tmp3, TO_NULL_DEVICE);
+ printf_system("mv %s %s", tmp3, tmp2);
+
+ return main_read_tree(tmp2);
+}
+
+NT_node compute_fj_tree (NT_node T, Alignment *A, int limit, char *mode)
+{
+ static int in_fj_tree;
+ if (!in_fj_tree)fprintf ( stderr, "\nComputation of an NJ tree using conserved positions\n");
+
+ in_fj_tree++;
+ if (T && T->leaf<=2);
+ else
+ {
+ T=aln2fj_tree(T,A,limit, mode);
+ T->right=compute_fj_tree ( T->right, A, limit, mode);
+ T->left=compute_fj_tree ( T->left, A, limit, mode);
+ }
+ in_fj_tree--;
+ return T;
+}
+
+
+NT_node aln2fj_tree(NT_node T, Alignment *A, int limit_in, char *mode)
+{
+ NT_node NT;
+ Sequence *S=NULL;
+ Alignment *subA=NULL;
+ int fraction_gap;
+ int l, limit;
+
+ if (T)
+ S=tree2seq (T,NULL);
+ else
+ S=aln2seq (A);
+
+
+ l=0;
+ for ( fraction_gap=100; fraction_gap<=100 && l<1; fraction_gap+=10)
+ for ( limit=limit_in; limit>0 && l<1; limit--)
+ {
+ fprintf ( stderr, "\n%d %d", limit, fraction_gap);
+ free_aln (subA);
+ subA=extract_sub_aln2 (A,S->nseq,S->name);
+ subA=filter_aln4tree (subA,limit,fraction_gap, mode);
+ l=subA->len_aln;
+ }
+
+ /* while ( subA->len_aln<1)
+ {
+ subA=extract_sub_aln2 (A,S->nseq,S->name);
+ subA=filter_aln4tree (subA,limit,fraction_gap,mode);
+ free_aln (subA);
+ subA=extract_sub_aln2 (A,S->nseq,S->name);
+ subA=filter_aln4tree (subA,--limit,fraction_gap, mode);
+ }
+ */
+ NT=aln2tree (subA);
+ NT=tree2fj_tree (NT);
+
+ NT=realloc_tree (NT,A->nseq);
+ fprintf ( stderr, "Limit:%d Gap: %d Columns: %4d Left: %4d Right %4d BL:%4.2f\n",limit,fraction_gap, subA->len_aln, (NT->right)->leaf,(NT->left)->leaf, (NT->left)->dist+(NT->right)->dist);
+
+ if ( T)
+ {
+ NT->dist=T->dist;
+ NT->parent=T->parent;
+ }
+ free_tree(T);
+ free_aln (subA);
+ free_sequence (S, -1);
+ return NT;
+}
+
+Alignment * filter_aln4tree (Alignment *A, int n,int fraction_gap,char *mode)
+{
+ char *aln_file;
+ char *ungaped_aln_file;
+ char *scored_aln_file;
+ char *filtered_aln_file;
+
+ aln_file=vtmpnam(NULL);
+ ungaped_aln_file=vtmpnam (NULL);
+ scored_aln_file=vtmpnam (NULL);
+ scored_aln_file=vtmpnam(NULL);
+ filtered_aln_file=vtmpnam(NULL);
+
+
+
+ output_clustal_aln (aln_file, A);
+ /* 1: remove columns with too many gaps*/
+ printf_system ("t_coffee -other_pg seq_reformat -in %s -action +rm_gap %d -output clustalw > %s", aln_file,fraction_gap, ungaped_aln_file);
+
+ /* 2: evaluate the alignment*/
+
+ printf_system ("t_coffee -other_pg seq_reformat -in %s -action +evaluate %s -output clustalw > %s", ungaped_aln_file,(mode)?mode:"categories", scored_aln_file);
+
+
+ /*3 extract the high scoring columns*/
+ printf_system("t_coffee -other_pg seq_reformat -in %s -struc_in %s -struc_in_f number_aln -action +use_cons +keep '[%d-8]' +rm_gap -output clustalw > %s", ungaped_aln_file, scored_aln_file,n, filtered_aln_file);
+
+
+ free_aln (A);
+
+ A=main_read_aln ( filtered_aln_file, NULL);
+ print_aln (A);
+
+ return A;
+}
+
+NT_node tree2fj_tree (NT_node T)
+{
+ NT_node L;
+
+ return T;
+
+ L=find_longest_branch (T, NULL);
+ T=reroot_tree (T, L);
+ return T;
+}
+
+
+/*********************************************************************/
+/* */
+/* Tree Filters and MAnipulation */
+/* */
+/* */
+/*********************************************************************/
+int tree2star_nodes (NT_node R, int n_max)
+{
+ if ( !R) return 0;
+ else if (!R->left && !R->right)
+ {
+ if (n_max>=1)R->dist=0;
+ return 1;
+ }
+ else
+ {
+ int n=0;
+
+ n+=tree2star_nodes (R->right, n_max);
+ n+=tree2star_nodes (R->left, n_max);
+
+ if (n<n_max)R->dist=0;
+ return n;
+ }
+}
+
+NT_node aln2tree (Alignment *A)
+{
+ NT_node **T=NULL;
+
+
+ T=make_nj_tree (A, NULL, 0, 0, A->seq_al, A->name, A->nseq, NULL, NULL);
+ tree2nleaf (T[3][0]);
+
+ return T[3][0];
+}
+NT_node realloc_tree ( NT_node R, int n)
+{
+ if ( !R)return R;
+ R->right=realloc_tree (R->right,n);
+ R->left=realloc_tree (R->left,n);
+ R->bot=realloc_tree (R->bot,n);
+
+ R->lseq=vrealloc (R->lseq, n*sizeof (int));
+ R->lseq2=vrealloc (R->lseq2, n*sizeof (int));
+ return R;
+}
+
+NT_node reset_boot_tree ( NT_node R, int n)
+{
+ if ( !R)return R;
+ R->right=reset_boot_tree (R->right,n);
+ R->left=reset_boot_tree (R->left,n);
+ R->bot=reset_boot_tree (R->bot,n);
+ R->bootstrap=(float)n;
+
+ return R;
+}
+NT_node tree_dist2normalized_tree_dist ( NT_node R, float max)
+{
+ if (!R)return R;
+ else
+ {
+ tree_dist2normalized_tree_dist ( R->right, max);
+ tree_dist2normalized_tree_dist ( R->left, max);
+ R->bootstrap=(int)((R->dist*100)/max);
+ }
+ return R;
+}
+NT_node reset_dist_tree ( NT_node R, float n)
+{
+ if ( !R)return R;
+ R->right=reset_dist_tree (R->right,n);
+ R->left=reset_dist_tree (R->left,n);
+ R->bot=reset_dist_tree (R->bot,n);
+
+ if (R->parent && !(R->parent)->parent && !(R->parent)->bot)R->dist=n/2;
+ else R->dist=n;
+
+ return R;
+}
+
+
+NT_node* free_treelist (NT_node *L)
+{
+ int n=0;
+ while (L[n])free_tree (L[n++]);
+ vfree (L);
+ return NULL;
+}
+NT_node free_tree ( NT_node R)
+{
+ if ( !R)return R;
+ R->right=free_tree (R->right);
+ R->left=free_tree (R->left);
+ R->bot=free_tree (R->bot);
+ free_tree_node (R);
+ return R;
+}
+
+NT_node free_tree_node ( NT_node R)
+{
+ if (!R)return NULL;
+
+ vfree (R->seqal);
+ vfree (R->idist);
+ vfree (R->ldist);
+ vfree (R->file);
+ vfree ( R->name);
+ vfree ( R->lseq); vfree ( R->lseq2);
+ vfree (R);
+ return NULL;
+}
+
+NT_node rename_seq_in_tree ( NT_node R, char ***list)
+{
+ if ( !R || !list) return R;
+
+ if ( R->leaf!=1)
+ {
+ R->right=rename_seq_in_tree (R->right, list);
+ R->left=rename_seq_in_tree (R->left, list);
+ R->bot=rename_seq_in_tree (R->bot, list);
+ }
+ else
+ {
+ int n=0;
+ while ( list[n][0][0])
+ {
+ if ( strm (list[n][0], R->name))sprintf (R->name, "%s",list[n][1]);
+ n++;
+ }
+ }
+ return R;
+}
+Sequence * tree2seq (NT_node R, Sequence *S)
+{
+
+ if ( !R)return S;
+ if ( !S)
+ {
+ S=declare_sequence (10, 10, tree2nseq (R));
+ S->nseq=0;
+ }
+
+ if (R->leaf==1)
+ {
+ sprintf ( S->name[S->nseq++], "%s", R->name);
+ }
+ else
+ {
+ S=tree2seq (R->left, S);
+ S=tree2seq (R->right, S);
+ }
+ return S;
+}
+
+NT_node balance_tree (NT_node T)
+{
+ static int **list;
+ NT_node NL[3];
+
+ if ( !T) return T;
+ else if ( T->leaf<=2)return T;
+ else
+ {
+ if (!list)list=declare_int (3, 2);
+
+ NL[0]=T->left;
+ NL[1]=T->right;
+ NL[2]=T->bot;
+
+ list[0][0]=(T->left)?(T->left)->leaf:0;
+ list[0][1]=0;
+ list[1][0]=(T->right)?(T->right)->leaf:0;
+ list[1][1]=1;
+ list[2][0]=(T->bot)?(T->bot)->leaf:0;
+ list[2][1]=2;
+
+ sort_int (list,2,0,0,2);
+
+ T->left=NL[list[2][1]];
+ T->right=NL[list[1][1]];
+ T->bot=NL[list[0][1]];
+
+ T->left=balance_tree (T->left);
+ T->right=balance_tree (T->right);
+ T->bot=balance_tree (T->bot);
+ return T;
+ }
+}
+FILE * display_tree (NT_node R, int nseq, FILE *fp)
+{
+ int a;
+
+ if ( !R);
+ else
+ {
+ /*
+ if ( R->nseq==1)fprintf (stderr,"\n[%s] ", R->name);
+ else fprintf ( stderr, "\n[%d Node] ",R->nseq);
+ for ( a=0; a< R->nseq; a++) fprintf ( stderr, "[%d]", R->lseq[a]);
+ */
+ fprintf (fp, "\n %10s N ", R->name);
+ for ( a=0; a< nseq; a++)fprintf (fp, "%d", R->lseq2[a]);
+ fprintf (fp, "\n %10s D ", R->name);
+ for ( a=0; a< nseq; a++)fprintf (fp, "%d", R->idist[a]);
+
+
+ if (R->leaf==1) fprintf (fp, " %s", R->name);
+ fprintf (fp, " :%.4f", R->dist);
+ HERE ("\nGo Left");fp=display_tree (R->left, nseq, fp);
+ HERE ("\nGo Right");fp=display_tree (R->right, nseq, fp);
+ HERE ("\nGo Bot");fp=display_tree (R->bot, nseq, fp);
+ }
+ return fp;
+}
+int tree2nnode_unresolved (NT_node R, int *l)
+{
+ if ( !R)return 0;
+ else if (R->leaf && R->dist==0){return 1;}
+ else
+ {
+ int n=0;
+ n+=tree2nnode_unresolved (R->right, l);
+ n+=tree2nnode_unresolved (R->left, l);
+ if (R->dist==0)
+ {
+ return n;
+ }
+ else
+ {
+ if (n)l[n]++;
+ return 0;
+ }
+ }
+
+}
+
+int tree2nnode ( NT_node R)
+{
+ int n;
+ if ( !R)n=0;
+ else if ( R->leaf==1){R->node=1;n=1;}
+ else
+ {
+ n=1;
+ n+=tree2nnode (R->right);
+ n+=tree2nnode (R->left);
+ n+=tree2nnode (R->bot);
+ R->node=n;
+ }
+ return n;
+}
+int tree2nleaf (NT_node R)
+{
+ if ( !R)return 0;
+ else if (R->leaf==1){return 1;}
+ else if (R->right==NULL && R->left==NULL && R->bot==NULL){R->leaf=1; return 1;}
+ else
+ {
+ int n=0;
+ n+=tree2nleaf (R->right);
+ n+=tree2nleaf (R->left);
+ n+=tree2nleaf (R->bot);
+
+ R->leaf=n;
+ return n;
+ }
+}
+
+int tree2nseq ( NT_node R)
+{
+ return tree2nleaf(R);
+}
+
+int tree_file2nseq (char *fname)
+{
+ FILE *fp;
+ char *string;
+ int p, a, b, c, n;
+
+ string=vcalloc (count_n_char_in_file(fname)+1, sizeof (char));
+
+ fp=vfopen (fname, "r");
+ n=0;
+ while ( (c=fgetc(fp))!=EOF){if (c=='(' || c==')' || c==',' || c==';') string[n++]=c;}
+ vfclose (fp);string[n]='\0';
+
+ for (n=0, p=1; p<strlen (string); p++)
+ {
+ a=string[p-1];
+ b=string[p];
+
+ if ( a=='(' && b==',')n++;
+ else if ( a==',' && b==')')n++;
+ else if ( a==',' && b==',')n++;
+ }
+ if ( getenv4debug("DEBUG_TREE"))fprintf (stderr, "\n[DEBUG_TREE:tree_file2nseq]%s", string);
+ vfree (string);
+ return n;
+}
+
+
+void clear_tree ( NT_node R)
+{
+ if (!R) return;
+
+ R->visited=0;
+
+ if ( R->leaf==1);
+ else
+ {
+ clear_tree ( R->right);
+ clear_tree ( R->left);
+ clear_tree ( R->bot);
+ }
+}
+int display_leaf_below_node (NT_node T, FILE *fp)
+{
+ int n=0;
+ if ( !T)return 0;
+
+ if ( T->leaf==1)
+ {
+ fprintf (fp, " %s", T->name);
+ return 1;
+ }
+ else
+ {
+ n+=display_leaf_below_node ( T->right, fp);
+ n+=display_leaf_below_node ( T->left, fp);
+ return n;
+ }
+}
+int display_leaf ( NT_node T, FILE *fp)
+{
+ int n=0;
+ if ( !T)return 0;
+ else if ( T->visited)return 0;
+ else T->visited=1;
+
+ if ( T->leaf==1)
+ {
+ fprintf (fp, " %s", T->name);
+ return 1;
+ }
+ else
+ {
+ n+=display_leaf ( T->right, fp);
+ n+=display_leaf ( T->left, fp);
+ n+=display_leaf ( T->bot, fp);
+ return n;
+ }
+}
+
+
+
+
+NT_node find_longest_branch ( NT_node T, NT_node L)
+ {
+
+ if ( !L || T->dist>L->dist)
+ {
+
+ L=T;
+ }
+
+ if ( T->leaf==1)return L;
+ else
+ {
+ L=find_longest_branch ( T->right, L);
+ L=find_longest_branch ( T->left, L);
+ return L;
+ }
+ }
+int node2side (NT_node N);
+int test_print (NT_node T);
+NT_node straighten_node (NT_node N);
+NT_node EMPTY;
+NT_node Previous;
+NT_node reroot_tree ( NT_node TREE, NT_node Right)
+{
+ /*ReRoots the tree between Node R and its parent*/
+ NT_node NR;
+ int n1, n2;
+
+ if (!EMPTY)EMPTY=vcalloc (1, sizeof (NT_node));
+ if ( !Right->parent)return Right;
+
+ TREE=unroot_tree (TREE);
+ if (Right->parent==NULL && Right->bot)
+ Right=Right->bot;
+
+ n1=tree2nleaf (TREE);
+
+ NR=declare_tree_node(TREE->maxnseq);
+
+ NR->right=Right;
+ NR->left=Right->parent;
+ Right->parent=NR;
+
+ Right->dist=Right->dist/2;
+
+ if ((NR->left)->right==Right)(NR->left)->right=EMPTY;
+ else if ( (NR->left)->left==Right) (NR->left)->left=EMPTY;
+
+ Previous=NULL;
+
+
+ NR->left=straighten_node (NR->left);
+
+
+
+ (NR->left)->parent=NR;
+ (NR->left)->dist=Right->dist;
+
+
+
+ n2=tree2nleaf(NR);
+
+ if ( n1!=n2){fprintf ( stderr, "\n%d %d", n1, n2);myexit (EXIT_FAILURE);}
+ return NR;
+}
+
+NT_node straighten_node ( NT_node N)
+{
+ NT_node Child;
+
+
+ if ( N->parent)
+ {
+ if (N->right==EMPTY)N->right=N->parent;
+ else if ( N->left==EMPTY) N->left=N->parent;
+
+ Child=N->parent;
+ if (Child->right==N)
+ {
+ Child->right=EMPTY;
+ }
+ else if (Child->left==N)
+ {
+ Child->left=EMPTY;
+ }
+
+ Previous=N;
+ Child=straighten_node (Child);
+ Child->parent=N;
+ Child->dist=N->dist;
+ return N;
+ }
+ else if ( N->bot && N->bot!=Previous)
+ {
+ if ( N->right==EMPTY)N->right=N->bot;
+ else if ( N->left==EMPTY)N->left=N->bot;
+
+ N->bot=NULL;
+ return N;
+ }
+ else
+ {
+ N->bot=NULL;
+ return N;
+ }
+}
+int test_print (NT_node T)
+{
+ if ( !T)
+ {
+ fprintf ( stderr, "\nEMPTY");
+ }
+ else if ( !T->left && !T->right)
+ {
+ fprintf ( stderr, "\n%s",T->name);
+ }
+ else
+ {
+ fprintf ( stderr, "\nGoing Right");
+ test_print (T->right);
+ fprintf ( stderr, "\nGoing Left");
+ test_print (T->left);
+ }
+ return 1;
+}
+int node2side (NT_node C)
+{
+ if ( !C->parent) return UNKNOWN;
+ else if ( (C->parent)->left==C)return LEFT;
+ else if ( (C->parent)->right==C)return RIGHT;
+ else return UNKNOWN;
+}
+NT_node straighten_tree ( NT_node P, NT_node C, float new_dist)
+{
+ float dist;
+
+ if ( C==NULL)return NULL;
+
+
+ dist=C->dist;
+ C->dist=new_dist;
+ C->bot=NULL;
+
+ if (C->left && C->right)
+ {
+ C->parent=P;
+ }
+ else if (!C->left)
+ {
+ C->left=C->parent;
+ C->parent=P;
+ }
+
+ if ( C->parent==P);
+ else if ( C->left==NULL && C->right==NULL)
+ {
+ C->parent=P;
+ }
+ else if ( C->right==P)
+ {
+ C->right=C->parent;
+ C->parent=P;
+
+ C=straighten_tree(C, C->right, dist);
+ }
+ else if ( C->left==P)
+ {
+ C->left=C->parent;
+ C->parent=P;
+ C=straighten_tree (C, C->left, dist);
+ }
+ else if ( C->parent==NULL)
+ {
+ C->parent=P;
+ }
+
+ return C;
+}
+
+
+NT_node unroot_tree ( NT_node T)
+{
+
+ if (!T || T->visited) return T;
+ else T->visited=1;
+
+ if (T->parent==NULL)
+ {
+
+ (T->right)->dist=(T->left)->dist=(T->right)->dist+(T->left)->dist;
+ (T->right)->parent=T->left;
+ (T->left)->parent=T->right;
+ T=T->left;
+ T->leaf=0;
+ vfree (T->parent);
+ }
+ else
+ {
+ T->parent=unroot_tree (T->parent);
+ T->right=unroot_tree (T->right);
+ T->left=unroot_tree (T->left);
+ }
+ T->visited=0;
+ return T;
+}
+
+FILE * print_tree_list ( NT_node *T, char *format,FILE *fp)
+{
+ int a=0;
+ while ( T[a])
+ {
+ fp=print_tree (T[a], format, fp);
+ a++;
+ }
+ return fp;
+}
+char * tree2string (NT_node T)
+{
+ if (!T) return NULL;
+ else
+ {
+ static char *f;
+ FILE *fp;
+
+ if (!f)f=vtmpnam (NULL);
+ fp=vfopen (f, "w");
+ print_tree (T, "newick", fp);
+ vfclose (fp);
+ return file2string (f);
+ }
+}
+char * tree2file (NT_node T, char *name, char *mode)
+{
+ if (!name)name=vtmpnam (NULL);
+ string2file (name, mode, tree2string(T));
+ return name;
+}
+FILE * print_tree ( NT_node T, char *format,FILE *fp)
+{
+ Sequence *S;
+
+ tree2nleaf(T);
+ S=tree2seq(T, NULL);
+
+ recode_tree (T, S);
+
+ free_sequence (S, -1);
+ if ( format && strm (format, "binary"))
+ fp=display_tree ( T,S->nseq, fp);
+ else if ( ! format || strm2 (format, "newick_tree","newick"))
+ {
+ /*T=balance_tree (T);*/
+ fp=rec_print_tree (T, fp);
+ fprintf ( fp, ";\n");
+ }
+ else
+ {
+ fprintf ( stderr, "\nERROR: %s is an unknown tree format [FATAL:%s]\n", format, PROGRAM);
+ myexit (EXIT_FAILURE);
+ }
+ return fp;
+}
+int print_newick_tree ( NT_node T, char *name)
+{
+ FILE *fp;
+ fp=vfopen (name, "w");
+ fp=rec_print_tree (T,fp);
+ fprintf (fp, ";\n");
+ vfclose (fp);
+ return 1;
+}
+FILE * rec_print_tree ( NT_node T, FILE *fp)
+{
+
+
+
+ if (!T)return fp;
+
+ if ( T->isseq)
+ {
+ fprintf ( fp, " %s:%.5f",T->name, T->dist);
+ }
+ else
+ {
+ if (T->left && T->right)
+ {
+ fprintf ( fp, "(");fp=rec_print_tree ( T->left, fp);
+ fprintf ( fp, ",");fp=rec_print_tree ( T->right, fp);
+ fprintf ( fp, ")");
+ if (T->parent || T->dist)
+ {
+ if ( T->bootstrap!=0)fprintf (fp, " %d", (int)T->bootstrap);
+ fprintf (fp, ":%.5f", T->dist);
+ }
+ }
+ else if (T->left)fp=rec_print_tree (T->left, fp);
+ else if (T->right)fp=rec_print_tree(T->right, fp);
+ }
+
+ return fp;
+}
+
+
+
+
+
+/*********************************************************************/
+/* */
+/* Tree Functions */
+/* */
+/* */
+/*********************************************************************/
+
+int ** make_sub_tree_list ( NT_node **T, int nseq, int n_node)
+ {
+
+
+/*This function produces a list of all the sub trees*/
+
+
+/* /A */
+/* -* */
+/* \ /B */
+/* \ / */
+/* ---* */
+/* \ */
+/* *--C */
+/* \ */
+/* \D */
+
+/* Contains 4 i_nodes */
+/* 8 nodes (internal nodes +leaves) */
+/* 8 sub trees: */
+/* ABCD */
+/* 1111 */
+/* 0111 */
+/* 1000 */
+/* 0100 */
+/* 0011 */
+/* 0001 */
+/* 0010 */
+
+ int **sub_tree_list;
+ int a, n=0;
+
+
+ if (T)
+ {
+ sub_tree_list=declare_int ( (n_node), nseq);
+ make_all_sub_tree_list (T[3][0],sub_tree_list, &n);
+
+ }
+ else
+ {
+ sub_tree_list=declare_int (nseq, nseq);
+ for ( a=0; a< nseq; a++)sub_tree_list[a][a]=1;
+ }
+
+ return sub_tree_list;
+ }
+
+void make_all_sub_tree_list ( NT_node N, int **list, int *n)
+ {
+ make_one_sub_tree_list (N, list[n[0]++]);
+ if (N->leaf!=1)
+ {
+ make_all_sub_tree_list (N->left , list, n);
+ make_all_sub_tree_list (N->right, list, n);
+ }
+ return;
+ }
+
+void make_one_sub_tree_list ( NT_node T,int *list)
+ {
+ if (T->leaf==1)
+ {
+
+ list[T->seq]=1;
+ }
+ else
+ {
+ make_one_sub_tree_list(T->left , list);
+ make_one_sub_tree_list(T->right, list);
+ }
+ return;
+ }
+
+
+NT_node old_main_read_tree(char *treefile)
+{
+ /*Reads a tree w/o needing the sequence file*/
+ NT_node **T;
+ T=simple_read_tree (treefile);
+ return T[3][0];
+}
+
+
+
+NT_node** simple_read_tree(char *treefile)
+{
+ int tot_node=0;
+ NT_node **T;
+ T=read_tree ( treefile, &tot_node,tree_file2nseq (treefile),NULL);
+ return T;
+}
+
+void free_read_tree ( NT_node **BT)
+{
+ int a, s;
+
+
+ if (!BT) return;
+
+ for (s=0,a=0; a<3; a++)
+ {
+ vfree (BT[a]);
+ }
+ free_tree (BT[3][0]);
+ vfree (BT);
+ return;
+}
+
+NT_node** read_tree(char *treefile, int *tot_node,int nseq, char **seq_names)
+ {
+
+ /*The Tree Root is in the TREE[3][0]...*/
+ /*TREE[0][ntot]--> pointer to each node and leave*/
+ char ch;
+ int a,b;
+
+ FILE *fp;
+ int nseq_read = 0;
+ int nnodes = 0;/*Number of Internal Nodes*/
+ int ntotal = 0;/*Number of Internal Nodes + Number of Leaves*/
+ int flag;
+ int c_seq;
+ NT_node **lu_ptr;
+ NT_node seq_tree, root,p;
+
+
+ tot_nseq=nseq;
+ rooted_tree=distance_tree=TRUE;
+
+ fp = vfopen(treefile, "r");
+ fp=skip_space(fp);
+ ch = (char)getc(fp);
+ if (ch != '(')
+ {
+ fprintf(stderr, "Error: Wrong format in tree file %s\n", treefile);
+ myexit (EXIT_FAILURE);
+ }
+ rewind(fp);
+
+
+ lu_ptr=(NT_node **)vcalloc(4,sizeof(NT_node*));
+ lu_ptr[0] = (NT_node *)vcalloc(10*nseq,sizeof(NT_node));
+ lu_ptr[1] = (NT_node *)vcalloc(10*nseq,sizeof(NT_node));
+ lu_ptr[2] = (NT_node *)vcalloc(10*nseq,sizeof(NT_node));
+ lu_ptr[3] =(NT_node *) vcalloc(1,sizeof(NT_node));
+
+ seq_tree =(NT_node) declare_tree_node(nseq);
+
+ set_info(seq_tree, NULL, 0, " ", 0.0, 0);
+
+
+ fp=create_tree(seq_tree,NULL,&nseq_read, &ntotal, &nnodes, lu_ptr, fp);
+ fclose (fp);
+
+
+ if (nseq != tot_nseq)
+ {
+ fprintf(stderr," Error: tree not compatible with alignment (%d sequences in alignment and %d in tree\n", nseq,nseq_read);
+ myexit (EXIT_FAILURE);
+ }
+
+ if (distance_tree == FALSE)
+ {
+ if (rooted_tree == FALSE)
+ {
+ fprintf(stderr,"Error: input tree is unrooted and has no distances, cannot align sequences\n");
+ myexit (EXIT_FAILURE);
+ }
+ }
+
+ if (rooted_tree == FALSE)
+ {
+ root = reroot(seq_tree, nseq,ntotal,nnodes, lu_ptr);
+ lu_ptr[1][nnodes++]=lu_ptr[0][ntotal++]=root;
+
+ }
+ else
+ {
+ root = seq_tree;
+ }
+
+ lu_ptr[3][0]=root;
+ tot_node[0]=nnodes;
+
+
+
+ for ( a=0; a< ntotal; a++)
+ {
+ (lu_ptr[0][a])->isseq=(lu_ptr[0][a])->leaf;
+ (lu_ptr[0][a])->dp=(lu_ptr[0][a])->dist;
+ }
+
+
+ for ( a=0; a< nseq; a++)
+ {
+ if (!seq_names)
+ {
+ flag=1;
+ (lu_ptr[2][a])->order=(lu_ptr[2][a])->seq=a;
+ }
+ else
+ {
+ for ( flag=0,b=0; b<nseq; b++)
+ {
+ if ( strncmp ( (lu_ptr[2][a])->name, seq_names[b], MAXNAMES)==0)
+ {
+ flag=1;
+
+ (lu_ptr[2][a])->order=(lu_ptr[2][a])->seq=b;
+ /*vfree ( (lu_ptr[2][a])->name);*/
+ sprintf ((lu_ptr[2][a])->name, "%s", seq_names[b]);
+ }
+ }
+ }
+ /*
+ if ( flag==0 && (lu_ptr[0][a])->leaf==1)
+ {
+ fprintf ( stderr, "\n%s* not in tree",(lu_ptr[2][a])->name);
+ for ( a=0; a< ntotal; a++)
+ {
+ fprintf ( stderr, "\n%d %s",(lu_ptr[2][a])->leaf, (lu_ptr[2][a])->name);
+ }
+ }
+ */
+ }
+
+ if (seq_names)
+ {
+ int tnseq;
+ char *s;
+ char **tree_names;
+ int fail_flag=0;
+ tnseq=tree_file2nseq(treefile);
+ tree_names=vcalloc ( tnseq, sizeof (char*));
+ for (a=0; a<tnseq; a++)
+ {
+ s=(lu_ptr[2][a])->name;
+ tree_names[a]=s;
+ if ( name_is_in_list(s, seq_names, nseq, MAXNAMES+1)==-1)
+ {
+ fprintf (stderr, "\nERROR: Sequence %s in the tree [%s] is not in the alignment[FATAL:%s]\n", s, treefile, PROGRAM);
+ fail_flag=1;
+ }
+ }
+ for (a=0; a<nseq; a++)
+ {
+ s=seq_names[a];
+ if ( name_is_in_list(s, tree_names, nseq, MAXNAMES+1)==-1)
+ {
+ fprintf (stderr, "\nERROR: Sequence %s in the sequences is not in the tree [%s][FATAL:%s]\n", s, treefile, PROGRAM);
+ fail_flag=1;
+ }
+ }
+ vfree (tree_names);
+ if ( fail_flag==1)myexit (EXIT_FAILURE);
+ }
+
+ for ( a=0; a< nseq; a++)
+ {
+ p=lu_ptr[2][a];
+ c_seq=p->seq;
+
+ while ( p!=NULL)
+ {
+ p->lseq[p->nseq]=c_seq;
+ p->nseq++;
+ p=p->parent;
+ }
+ }
+
+
+ return lu_ptr;
+ }
+
+FILE * create_linear_tree ( char **name, int n, FILE *fp)
+{
+
+ if (!name || n==0 ||!fp) return NULL;
+
+
+ if (n==2)
+ fprintf ( fp, "(%s,%s);",name[0],name[1]);
+ else if ( n==3)
+ fprintf ( fp, "((%s,%s),%s);",name[0],name[1], name[2]);
+ else
+ {
+ int a;
+ for (a=0; a<n-2; a++)fprintf (fp, "(");
+ fprintf (fp, "%s, %s),", name[0], name[1]);
+ for ( a=2; a<n-2; a++)fprintf ( fp, "%s),",name[a]);
+ fprintf ( fp, "%s,%s);\n", name[n-2], name[n-1]);
+ }
+ return fp;
+}
+FILE * create_tree(NT_node ptree, NT_node parent,int *nseq,int *ntotal,int *nnodes,NT_node **lu, FILE *fp)
+ {
+
+ int i, type;
+ float dist=0;
+ float bootstrap=0;
+ char *name;
+ int ch;
+
+
+ name=vcalloc ( MAXNAMES+1, sizeof (char));
+ sprintf ( name, " ");
+ fp=skip_space(fp);
+ ch = (char)getc(fp);
+
+ if (ch == '(')
+ {
+ type = NODE;
+ name[0] = '\0';
+ lu[0][ntotal[0]] = lu[1][nnodes[0]] = ptree;
+ ntotal[0]++;
+ nnodes[0]++;
+ create_tree_node(ptree, parent);
+ fp=create_tree(ptree->left, ptree, nseq,ntotal,nnodes,lu,fp);
+ ch = (char)getc(fp);
+ if ( ch == ',')
+ {
+ fp=create_tree(ptree->right, ptree,nseq,ntotal,nnodes,lu,fp);
+ ch = (char)getc(fp);
+ if ( ch == ',')
+ {
+
+ ptree = insert_tree_node(ptree);
+ lu[0][ntotal[0]] = lu[1][nnodes[0]] = ptree;
+ ntotal[0]++;
+ nnodes[0]++;
+ fp=create_tree(ptree->right, ptree,nseq,ntotal,nnodes,lu,fp);
+ rooted_tree = FALSE;
+ if ( getenv4debug ( "DEBUG_TREE")){fprintf ( stderr, "\n[DEBUG_TREE:create_tree] Unrooted Tree");}
+ }
+ }
+
+ fp=skip_space(fp);
+ ch = (char)getc(fp);
+ }
+ else
+ {
+ type=LEAF;
+ lu[0][ntotal[0]] = lu[2][nseq[0]] = ptree;
+ ntotal[0]++;
+ nseq[0]++;
+ name[0] = ch;
+ i=1;
+ ch = (char)getc(fp);
+ if ( name[0]=='\'')
+ {
+ /*This protects names that are between single quotes*/
+ while ( ch!='\'')
+ {
+ if (i < MAXNAMES) name[i++] = ch;
+ ch = (char)getc(fp);
+ }
+ if (i < MAXNAMES) name[i++] = ch;
+ while ((ch != ':') && (ch != ',') && (ch != ')'))ch = (char)getc(fp);
+ }
+ else
+ {
+ while ((ch != ':') && (ch != ',') && (ch != ')'))
+ {
+ if (i < MAXNAMES) name[i++] = ch;
+ ch = (char)getc(fp);
+ }
+ }
+
+ name[i] = '\0';
+
+ if ( i>=(MAXNAMES+1)){fprintf (stderr, "\nName is too long");myexit (EXIT_FAILURE);}
+ if (ch != ':' && !isdigit(ch))
+ {
+ /*distance_tree = FALSE*/;
+ }
+ }
+ if (ch == ':')
+ {
+ fp=skip_space(fp);
+ fscanf(fp,"%f",&dist);
+ fp=skip_space(fp);
+ bootstrap=0;
+ }
+ /*Tree with Bootstrap information*/
+ else if (isdigit (ch))
+ {
+ ungetc(ch,fp);
+ fscanf(fp,"%f",&bootstrap);
+ if ( fscanf(fp,":%f",&dist)==1);
+ else dist=0;
+ fp=skip_space(fp);
+ }
+ else
+ {
+ ungetc ( ch, fp);
+ skip_space(fp);
+ }
+
+ set_info(ptree, parent, type, name, dist, bootstrap);
+
+
+ vfree (name);
+ return fp;
+ }
+
+NT_node declare_tree_node (int nseq)
+ {
+ NT_node p;
+
+ p= (NT_node)vcalloc (1, sizeof ( Treenode));
+ p->left = NULL;
+ p->right = NULL;
+ p->parent = NULL;
+ p->dist = 0.0;
+ p->leaf = 0;
+ p->order = 0;
+ p->maxnseq=nseq;
+ p->name=(char*)vcalloc (MAXNAMES+1,sizeof (char));
+ p->name[0]='\0';
+ p->lseq=(int*)vcalloc ( nseq, sizeof (int));
+ return p;
+
+ }
+
+void set_info(NT_node p, NT_node parent, int pleaf, char *pname, float pdist, float bootstrap)
+ {
+ p->parent = parent;
+ p->leaf = pleaf;
+ p->dist = pdist;
+ p->bootstrap=bootstrap;
+ p->order = 0;
+
+
+ sprintf (p->name, "%s", pname);
+
+ if (pleaf ==1)
+ {
+ p->left = NULL;
+ p->right = NULL;
+ }
+
+ }
+NT_node insert_tree_node(NT_node pptr)
+ {
+
+ NT_node newnode;
+
+ newnode = declare_tree_node( pptr->maxnseq);
+ create_tree_node(newnode, pptr->parent);
+
+ newnode->left = pptr;
+ pptr->parent = newnode;
+
+ set_info(newnode, pptr->parent, 0, "", 0.0, 0);
+
+ return(newnode);
+ }
+
+void create_tree_node(NT_node pptr, NT_node parent)
+ {
+ pptr->parent = parent;
+ pptr->left =declare_tree_node(pptr->maxnseq) ;
+ (pptr->left)->parent=pptr;
+
+ pptr->right =declare_tree_node(pptr->maxnseq) ;
+ (pptr->right)->parent=pptr;
+ }
+
+FILE * skip_space(FILE *fp)
+ {
+ int c;
+
+ do
+ c = getc(fp);
+ while(isspace(c));
+ if ( c==EOF)
+ {
+ fprintf ( stderr, "\nEOF");
+ myexit (EXIT_FAILURE);
+ }
+ ungetc(c, fp);
+ return fp;
+ }
+
+
+NT_node reroot(NT_node ptree, int nseq, int ntotal, int nnodes, NT_node **lu)
+ {
+ NT_node p, rootnode, rootptr;
+ float diff, mindiff=0, mindepth = 1.0, maxdist;
+ int i;
+ int first = TRUE;
+
+
+
+ rootptr = ptree;
+
+ for (i=0; i<ntotal; i++)
+ {
+ p = lu[0][i];
+ if (p->parent == NULL)
+ diff = calc_root_mean(p, &maxdist, nseq, lu);
+ else
+ diff = calc_mean(p, &maxdist, nseq, lu);
+
+ if ((diff == 0) || ((diff > 0) && (diff < 2 * p->dist)))
+ {
+ if ((maxdist < mindepth) || (first == TRUE))
+ {
+ first = FALSE;
+ rootptr = p;
+ mindepth = maxdist;
+ mindiff = diff;
+ }
+ }
+
+ }
+ if (rootptr == ptree)
+ {
+ mindiff = rootptr->left->dist + rootptr->right->dist;
+ rootptr = rootptr->right;
+ }
+
+ rootnode = insert_root(rootptr, mindiff);
+ diff = calc_root_mean(rootnode, &maxdist, nseq, lu);
+ return(rootnode);
+ }
+
+
+float calc_root_mean(NT_node root, float *maxdist, int nseq, NT_node **lu)
+ {
+ float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff;
+ NT_node p;
+ int i;
+ int nl, nr;
+ int direction;
+
+
+ dist = (*maxdist) = 0;
+ nl = nr = 0;
+ for (i=0; i< nseq; i++)
+ {
+ p = lu[2][i];
+ dist = 0.0;
+ while (p->parent != root)
+ {
+ dist += p->dist;
+ p = p->parent;
+ }
+ if (p == root->left) direction = LEFT;
+ else direction = RIGHT;
+ dist += p->dist;
+
+ if (direction == LEFT)
+ {
+ lsum += dist;
+ nl++;
+ }
+ else
+ {
+ rsum += dist;
+ nr++;
+ }
+
+ if (dist > (*maxdist)) *maxdist = dist;
+ }
+
+ lmean = lsum / nl;
+ rmean = rsum / nr;
+
+ diff = lmean - rmean;
+ return(diff);
+ }
+
+float calc_mean(NT_node nptr, float *maxdist, int nseq,NT_node **lu)
+ {
+ float dist , lsum = 0.0, rsum = 0.0, lmean,rmean,diff;
+ NT_node p, *path2root;
+ float *dist2node;
+ int depth = 0, i,j , n;
+ int nl , nr;
+ int direction, found;
+
+
+ path2root = (NT_node *)vcalloc(nseq,sizeof(Treenode));
+ dist2node = (float *)vcalloc(nseq,sizeof(float));
+
+ depth = (*maxdist) = dist = 0;
+ nl = nr = 0;
+ p = nptr;
+ while (p != NULL)
+ {
+ path2root[depth] = p;
+ dist += p->dist;
+ dist2node[depth] = dist;
+ p = p->parent;
+ depth++;
+ }
+
+/***************************************************************************
+ *nl = *nr = 0;
+ for each leaf, determine whether the leaf is left or right of the node.
+ (RIGHT = descendant, LEFT = not descendant)
+****************************************************************************/
+ for (i=0; i< nseq; i++)
+ {
+ p = lu[2][i];
+ if (p == nptr)
+ {
+ direction = RIGHT;
+ dist = 0.0;
+ }
+ else
+ {
+ direction = LEFT;
+ dist = 0.0;
+
+ found = FALSE;
+ n = 0;
+ while ((found == FALSE) && (p->parent != NULL))
+ {
+ for (j=0; j< depth; j++)
+ if (p->parent == path2root[j])
+ {
+ found = TRUE;
+ n = j;
+ }
+ dist += p->dist;
+ p = p->parent;
+ }
+
+ if (p == nptr) direction = RIGHT;
+
+ }
+ if (direction == LEFT)
+ {
+ lsum += dist;
+ lsum += dist2node[n-1];
+ nl++;
+ }
+ else
+ {
+ rsum += dist;
+ nr++;
+ }
+
+ if (dist > (*maxdist)) *maxdist = dist;
+ }
+
+ vfree(dist2node);
+ vfree(path2root);
+
+
+
+ if ( nl==0 || nr==0)
+ {
+ myexit (EXIT_FAILURE);
+ }
+ lmean = lsum / nl;
+ rmean = rsum / nr;
+
+ diff = lmean - rmean;
+ return(diff);
+}
+
+NT_node insert_root(NT_node p, float diff)
+{
+ NT_node newp, prev, q, t;
+ float dist, prevdist,td;
+
+
+ newp = declare_tree_node( p->maxnseq);
+ t = p->parent;
+
+
+ prevdist = t->dist;
+ p->parent = newp;
+
+ dist = p->dist;
+
+ p->dist = diff / 2;
+ if (p->dist < 0.0) p->dist = 0.0;
+ if (p->dist > dist) p->dist = dist;
+
+ t->dist = dist - p->dist;
+
+ newp->left = t;
+ newp->right = p;
+ newp->parent = NULL;
+ newp->dist = 0.0;
+ newp->leaf = NODE;
+
+ if (t->left == p) t->left = t->parent;
+ else t->right = t->parent;
+
+ prev = t;
+ q = t->parent;
+
+ t->parent = newp;
+
+ while (q != NULL)
+ {
+ if (q->left == prev)
+ {
+ q->left = q->parent;
+ q->parent = prev;
+ td = q->dist;
+ q->dist = prevdist;
+ prevdist = td;
+ prev = q;
+ q = q->left;
+ }
+ else
+ {
+ q->right = q->parent;
+ q->parent = prev;
+ td = q->dist;
+ q->dist = prevdist;
+ prevdist = td;
+ prev = q;
+ q = q->right;
+ }
+ }
+
+/*
+ remove the old root node
+*/
+ q = prev;
+ if (q->left == NULL)
+ {
+ dist = q->dist;
+ q = q->right;
+ q->dist += dist;
+ q->parent = prev->parent;
+ if (prev->parent->left == prev)
+ prev->parent->left = q;
+ else
+ prev->parent->right = q;
+ prev->right = NULL;
+ }
+ else
+ {
+ dist = q->dist;
+ q = q->left;
+ q->dist += dist;
+ q->parent = prev->parent;
+ if (prev->parent->left == prev)
+ prev->parent->left = q;
+ else
+ prev->parent->right = q;
+ prev->left = NULL;
+ }
+
+ return(newp);
+}
+
+
+
+
+/*********************************************************************/
+/* */
+/* TrimTC3 */
+/* */
+/* */
+/*********************************************************************/
+
+int *aln2seq_chain (Alignment *A, int **sim,int seq1, int seq2, int limit, int max_chain);
+
+Alignment *seq2seq_chain (Alignment *A,Alignment*T, char *arg)
+{
+ int **sim=NULL;
+ int *buf=NULL, *seq2keep, *list, *tname;
+ int a, b, c, nl;
+ int sim_limit;
+ int min_sim=15;
+ int max_chain=20;
+
+ /*Estimate Similarity within the incoming sequences*/
+ sim=seq2comp_mat (aln2seq(A), "blosum62mt", "sim2");
+
+ /*Read and store the list of sequences to keep*/
+ seq2keep=vcalloc (A->nseq, sizeof (int));
+ tname=vcalloc (T->nseq, sizeof (int));
+ for ( a=0; a< T->nseq; a++)
+ {
+ tname[a]=name_is_in_list ( T->name[a], A->name, A->nseq, 100);
+ if (tname[a]>=0)seq2keep[tname[a]]=1;
+ }
+
+ /*Consider Every Pair of Sequences within the list of sequences to keep*/
+
+ fprintf ( stderr, "\n");
+ for ( a=0; a< T->nseq-1; a++)
+ {
+ if (tname[a]<0) continue;
+ for ( b=a+1;b<T->nseq; b++)
+ {
+
+ if (tname[b]<0) continue;
+
+ buf=NULL;sim_limit=90;
+ while (!buf && sim_limit>min_sim)
+ {
+ buf=aln2seq_chain ( A, sim,tname[a],tname[b],sim_limit, max_chain);
+ sim_limit-=5;
+ }
+
+ if ( buf)
+ {
+ for (c=0; c< A->nseq; c++)seq2keep[c]+=buf[c];
+ vfree (buf);
+ }
+ else
+ {
+ fprintf ( stderr, "\n#Could Not Find any Intermediate sequence [MAx chain %d MinID %d\n", max_chain, min_sim);
+ }
+ }
+ }
+
+ list=vcalloc (A->nseq, sizeof (int));
+ for ( nl=0,a=0; a< A->nseq; a++)
+ if ( seq2keep[a])
+ list[nl++]=a;
+
+ A=extract_sub_aln (A, nl, list);
+
+ free_int (sim, -1);
+ vfree (list);
+ return A;
+}
+int max_explore=10000000;/*Limits the number of explorations that tends to increase when id is small*/
+int n_explore;
+
+int *aln2seq_chain (Alignment *A, int **sim, int seq1, int seq2, int limit, int max_chain)
+{
+ int *used;
+ int **chain;
+ char output1[10000];
+ char output2[10000];
+ int a;
+ int *list;
+ int n, nseq=0;
+
+
+ output1[0]=output2[0]='\0';
+ used=vcalloc (A->nseq, sizeof(int));
+ used[seq1]=1;
+
+ if (find_seq_chain ( A, sim,used,seq1,seq1, seq2,1,limit, max_chain, &nseq))
+ {
+ list=vcalloc (A->nseq, sizeof (int));
+ chain=declare_int (A->nseq, 2);
+ for (n=0, a=0; a< A->nseq; a++)
+ {
+ if ( used[a])
+ {
+ chain[n][0]=used[a];
+ chain[n][1]=a;
+ list[used[a]-1]=a;n++;
+ }
+ }
+
+ sprintf ( output2, "#%s %s N: %d Lower: %d Sim: %d DELTA: %d\n", A->name[list[0]], A->name[list[n-1]],n, limit,sim[list[0]][list[n-1]],limit-sim[list[0]][list[n-1]]);strcat (output1, output2);
+
+ sort_int ( chain, 2, 0, 0, n-1);
+ sprintf ( output2, "#");strcat(output1, output2);
+
+ for ( a=0; a< n-1; a++)
+ {
+ sprintf (output2, "%s -->%d -->", A->name[chain[a][1]],sim[chain[a][1]][chain[a+1][1]]);strcat ( output1, output2);
+ }
+ sprintf ( output2, "%s\n", A->name[chain[n-1][1]]);strcat (output1, output2);
+
+ free_int (chain, -1);
+ vfree (list);
+ }
+ else
+ {
+ vfree (used);
+ used=NULL;
+ }
+ /* fprintf ( stdout, "%s", output1);*/
+ fprintf ( stderr, "%s", output1);
+ n_explore=0;
+ return used;
+}
+static int ***pw_sim;
+int find_seq_chain (Alignment *A, int **sim,int *used,int seq0,int seq1, int seq2,int chain_length, int limit, int max_chain, int *nseq)
+{
+ int a,b, seq, seq_sim;
+
+ n_explore++;
+ if ( n_explore>=max_explore)
+ {
+ return 0;
+ }
+ if (!pw_sim)
+ {
+ pw_sim=declare_arrayN(3, sizeof (int), A->nseq, A->nseq, 3);
+ for ( a=0; a< A->nseq; a++)
+ {
+ for ( b=0; b<A->nseq; b++)
+ {
+ pw_sim[a][b][0]=b;
+ pw_sim[a][b][1]=sim[a][b];
+ pw_sim[a][b][2]=sim[b][seq2];
+ }
+ sort_int_inv ( pw_sim[a],3, 1, 0, A->nseq-1);
+ }
+ }
+
+ if ( chain_length>max_chain)return 0;
+ else if ( sim[seq1][seq2]>=limit)
+ {
+ used[seq2]=chain_length+1;
+ nseq[0]++;
+ return 1;
+ }
+ else
+ {
+ int delta_seq2;
+ for ( a=0; a< A->nseq; a++)
+ {
+ seq=pw_sim[seq1][a][0];
+ seq_sim=pw_sim[seq1][a][1];
+ delta_seq2=pw_sim[seq1][a][2]-sim[seq1][seq2];
+
+
+
+ if ( used[seq])continue;
+ else if ( seq_sim<limit)continue;
+ else
+ {
+ used[seq]=chain_length+1;
+ nseq[0]++;
+ if ( find_seq_chain( A, sim,used,seq0,seq, seq2, chain_length+1,limit, max_chain, nseq))
+ {return 1;}
+ else
+ used[seq]=0;
+ }
+ }
+ return 0;
+ }
+ return 0;
+}
+
+
+//
+//
+//
+//
+/*********************************************************************/
+/* */
+/* New Tree Parsing */
+/* */
+/*********************************************************************/
+NT_node new_insert_node (NT_node T);
+int scan_name_and_dist ( FILE *fp, char *name, float *dist);
+
+
+
+
+NT_node check_tree (NT_node T);
+NT_node main_read_tree (char *treefile)
+{
+ FILE *fp;
+ Sequence *S;
+
+ NT_node T;
+
+
+
+
+ fp=vfopen (remove_charset_from_file (treefile, " \t\n\r"), "r");
+ T=new_get_node (NULL,fp);
+ vfclose (fp);
+
+ S=tree2seq(T, NULL);
+
+ T=recode_tree(T, S);
+ free_sequence (S,S->nseq);
+ vfree (T->file);
+ T->file=vcalloc ( strlen (treefile)+1, sizeof (char));
+ sprintf ( T->file, "%s", treefile);
+ return T;
+}
+
+//This function codes the tree into lseq and lseq2
+//lseq: list of the N->nseq child sequences of the node
+//lsseq2:Array of size Nseq, with lseq[a]=1 if sequence a is child of node N
+static int node_index;
+NT_node index_tree_node (NT_node T)
+{
+ if (!T)return T;
+ if (!T->parent){node_index=tree2nseq (T)+1;}
+
+ index_tree_node(T->left);
+ index_tree_node(T->right);
+
+ if (!T->left && !T->right)T->index=T->lseq[0]+1;
+ else T->index=node_index++;
+ return T;
+}
+
+
+
+NT_node simple_recode_tree (NT_node T, int nseq)
+{
+
+ //recodes atree wher the leafs are already coded
+ if (!T) return T;
+
+
+
+
+ T->nseq=0;
+
+ if ( T->isseq)
+ {
+
+ ;
+
+ }
+ else
+ {
+ NT_node R,L;
+ int a;
+ vfree (T->lseq); T->lseq=vcalloc (nseq, sizeof (int));
+ vfree (T->lseq2); T->lseq2=vcalloc (nseq, sizeof (int));
+ vfree (T->idist); T->idist=vcalloc (nseq, sizeof (int));
+ vfree (T->ldist); T->ldist=vcalloc (nseq, sizeof (int));
+
+ R=simple_recode_tree (T->left,nseq);
+
+ L=simple_recode_tree (T->right,nseq);
+
+ if (R)for (a=0; a<R->nseq; a++)
+ {
+ T->lseq2[R->lseq[a]]=1;
+ }
+
+ if (L)for (a=0; a<L->nseq; a++)
+ {
+ T->lseq2[L->lseq[a]]=1;
+ }
+
+ for (a=0; a<nseq; a++)
+ {
+ if (T->lseq2[a])T->lseq[T->nseq++]=a;
+ if (T->lseq2[a])T->idist[a]=(!R)?0:R->idist[a]+((!L)?0:L->idist[a])+1;
+ if (T->lseq2[a])T->ldist[a]=(!R)?0:R->ldist[a]+((!L)?0:L->ldist[a])+(int)(T->dist*10000);
+ }
+ }
+ return T;
+}
+
+NT_node recode_tree (NT_node T, Sequence *S)
+{
+
+
+ if (!T) return T;
+
+
+ vfree (T->lseq); T->lseq=vcalloc (S->nseq, sizeof (int));
+ vfree (T->lseq2); T->lseq2=vcalloc (S->nseq, sizeof (int));
+ vfree (T->idist); T->idist=vcalloc (S->nseq, sizeof (int));
+ vfree (T->ldist); T->ldist=vcalloc (S->nseq, sizeof (int));
+ T->nseq=0;
+
+ if ( T->isseq)
+ {
+
+ int i;
+ i=name_is_in_list (T->name, S->name, S->nseq, -1);
+
+ if (i!=-1)
+ {
+ T->lseq[T->nseq++]=i;
+ T->lseq2[i]=1;
+ T->idist[i]=1;
+ T->ldist[i]=(int)(T->dist*10000);;
+ }
+ else
+ {
+ printf_exit ( EXIT_FAILURE, stderr, "\nERROR: Sequence %s is in the Tree but Not in the Sequence dataset [code_lseq][FATAL:%s]", T->name, PROGRAM);
+ }
+
+ }
+ else
+ {
+ NT_node R,L;
+ int a;
+
+ R=recode_tree (T->left, S);
+
+ L=recode_tree (T->right, S);
+
+ if (R)
+ for (a=0; a<R->nseq; a++)
+ {
+ T->lseq2[R->lseq[a]]=1;
+ }
+
+ if (L)for (a=0; a<L->nseq; a++)
+ {
+ T->lseq2[L->lseq[a]]=1;
+ }
+
+ for (a=0; a<S->nseq; a++)
+ {
+ //don't count the root
+ int d;
+
+ if ( !(T->parent) || !(T->parent)->parent)d=0;
+ else if ( T->dist==0)d=0;
+ else d=1;
+
+ if (T->lseq2[a])T->lseq[T->nseq++]=a;
+ if (T->lseq2[a])T->idist[a]=(!R)?0:(R->idist[a]+((!L)?0:L->idist[a])+d);
+ if (T->lseq2[a])T->ldist[a]=(!R)?0:R->ldist[a]+((!L)?0:L->ldist[a])+(int)(T->dist*10000);
+
+ }
+ }
+ return T;
+}
+int tree2split_list (NT_node T, int ns,int **sl, int* n)
+{
+ if (!T) return 0;
+ if (!sl) return 0;
+
+ tree2split_list (T->right, ns, sl, n);
+ tree2split_list (T->left , ns, sl, n);
+
+ if (!T->right) return 1;
+ else if (T->parent && !(T->parent)->parent)return 1;
+ else if ( T->dist==0)return 1;
+ else
+ {
+ int t=0,t2=0, c=0, a;
+
+ for (a=0; a< ns; a++)
+ {
+ t2+=(a+1)*T->lseq2[a];
+ t+=T->lseq2[a];
+ }
+
+ if (t2==0) HERE ("0");
+ c=(t>(ns-t))?1:0;
+ sl[n[0]][ns]=t2;//Hash value for quick comparison;
+
+ for (a=0; a< ns; a++)sl[n[0]][a]=(c==0)?T->lseq2[a]:(1-T->lseq2[a]);
+ n[0]++;
+
+ }
+ return 1;
+}
+
+NT_node display_splits (NT_node T,Sequence *S, FILE *fp)
+{
+ int a;
+ if (!T) return T;
+
+ if (!S)S=tree2seq (T,NULL);
+
+ display_splits (T->right,S, fp);
+ display_splits (T->left, S, fp);
+
+
+
+ if (!T->right);
+ else if (T->parent && !(T->parent)->parent);
+ else
+ {
+ int t=0;
+ for (a=0; a< S->nseq; a++)
+ {
+ fprintf (fp, "%d", T->lseq2[a]);
+ t+=T->lseq2[a];
+ }
+
+ fprintf ( fp, " %5d \n", MIN(t,((S->nseq)-t)));
+ }
+ return T;
+}
+NT_node display_leaf_nb (NT_node T, int n, FILE *fp, char * name)
+{
+ int a;
+ if (!T) return T;
+
+
+ display_leaf_nb (T->right, n, fp, name);
+ display_leaf_nb (T->left, n, fp, name);
+
+
+ if (!T->isseq);
+ else
+ {
+ NT_node P;
+
+ P=T->parent;
+ fprintf (fp, "%s ", T->name);
+ for (a=0; a< n; a++)fprintf (fp, "%d", P->lseq2[a]);
+ fprintf ( fp," %s\n", name);
+ }
+ return T;
+}
+static int root4dc;
+NT_node display_code (NT_node T, int n, FILE *fp)
+{
+ int a, debug=0, t=0;
+ if (!T) return T;
+
+ if (!T->parent)
+ root4dc=0;
+
+
+
+ if (!T->parent && debug) fprintf ( fp, "\nDISPLAY TREE: START");
+ display_code (T->right, n, fp);
+ display_code (T->left, n, fp);
+
+ fprintf ( fp, "\n");
+ if (!T->parent) return T;
+ else if ( !(T->parent)->parent && root4dc==1)return T;
+ else if ( !(T->parent)->parent && root4dc==0)root4dc=1;
+
+ for (a=0; a< n; a++)
+ t+=T->lseq2[a];
+ if ( t<=n/2)
+ for (a=0; a< n; a++)fprintf (fp, "%d", T->lseq2[a]);
+ else
+ for (a=0; a< n; a++)fprintf (fp, "%d", 1-T->lseq2[a]);
+ if (T->isseq && debug)fprintf (fp, "%s", T->name);
+
+ if (!T->parent && debug) fprintf (fp, "\nDISPLAY TREE: FINISHED");
+ return T;
+}
+NT_node display_dist (NT_node T, int n, FILE *fp)
+{
+ int a;
+ if (!T) return T;
+
+ if (!T->parent)
+ root4dc=0;
+
+ display_dist (T->right, n, fp);
+ display_dist (T->left, n, fp);
+
+ fprintf ( stdout, "\n");
+ for ( a=0; a< n; a++)
+ fprintf ( stdout, " %2d ", T->idist[a]);
+ fprintf ( stdout, "\n");
+
+ return T;
+}
+
+NT_node check_tree (NT_node T)
+{
+ if (T) HERE("CHECK %s", T->name);
+ if (!T)
+ {
+ HERE ("ERROR: Empty Group");
+ }
+
+ else if (T->isseq)return T;
+ else
+ {
+ HERE ("R");
+ check_tree (T->right);
+ HERE ("L");
+ check_tree (T->left);
+ return NULL;
+ }
+ return 0;}
+
+NT_node new_reroot_tree( NT_node T)
+{
+ T=unroot_tree (T);
+ return T;
+}
+NT_node new_get_node (NT_node T, FILE *fp)
+{
+ NT_node NN;
+ int c;
+ static int n;
+
+ c=fgetc (fp);
+ if (!T)T=declare_tree_node (100);
+
+
+ if ( c==';')
+ {
+
+ if (!T->right)T=T->left;
+ else if (!T->left)T=T->right;
+ vfree (T->parent);T->parent=NULL;
+ return T;
+ }
+ else if ( c==')')
+ {
+ --n;
+ scan_name_and_dist (fp, T->name, &T->dist);
+ if (T->name && T->name [0])T->bootstrap=atof (T->name);
+ return new_get_node (T->parent, fp);
+ }
+ else if ( c==',')
+ {
+ return new_get_node (T, fp);
+ }
+ else
+ {
+ NN=new_insert_node (T);
+
+ if ( c=='(')
+ {
+ ++n;
+ return new_get_node (NN, fp);
+ }
+ else
+ {
+ ungetc (c, fp);
+ scan_name_and_dist (fp, NN->name, &NN->dist);
+
+ NN->leaf=1;
+ NN->isseq=1;
+ return new_get_node (T, fp);
+ }
+ }
+}
+int scan_name_and_dist ( FILE *fp, char *name, float *dist)
+{
+ int a, c;
+ char number [1000];
+
+ a=0;
+ c=fgetc (fp);ungetc (c, fp);
+
+
+ if ( c==';')return 0;
+
+ while ((c=fgetc(fp))!=':' && c!=EOF && c!=')' && c!=';' && c!=',')
+ {
+ name[a++]=c;
+ }
+ name [a]='\0';
+
+ if ( c!=':')
+ {
+ ungetc (c, fp);
+ dist[0]=FLT_MIN;
+ return 1;
+ }
+ a=0;
+ while (isdigit((c=fgetc(fp))) || c=='.' || c=='-')
+ {
+ number[a++]=c;
+ }
+
+ ungetc (c, fp);
+ number[a]='\0';
+
+ dist[0]=atof (number);
+
+ return 2;
+}
+NT_node new_insert_node (NT_node T)
+{
+ NT_node NN;
+
+
+ NN=new_declare_tree_node ();
+ NN->parent=T;
+ if (!T)
+ {
+ return NN;
+ }
+ else if (T->left==NULL)
+ {
+ T->left=NN;
+
+ }
+ else if ( T->right==NULL)
+ {
+ T->right=NN;
+ }
+ else
+ {
+ NT_node NN2;
+ NN2=new_declare_tree_node ();
+ NN2->left=T->left;
+ NN2->right=T->right;
+ NN2->parent=T;
+
+ T->left=NN2;
+ T->right=NN;
+ }
+
+ /*
+
+ else
+ {
+ NN->right=T->right;
+ (T->right)->parent=NN;
+
+ NN->parent=T;
+ T->right=NN;
+ NN->left=new_declare_tree_node ();
+ (NN->left)->parent=NN;
+ return NN->left;
+ }
+ */
+ /*
+ This caused a crash when internal undefined nodes, removed 19/02/08
+ else
+ {
+ NT_node P;
+ NN->right=T;
+ P=NN->parent=T->parent;
+ T->parent=NN;
+
+ if (P && P->right==T)P->right=NN;
+ else if ( P && P->left==T)P->left=NN;
+
+ NN->left=new_declare_tree_node ();
+ (NN->left)->parent=NN;
+ return NN->left;
+ }
+ */
+ return NN;
+}
+
+NT_node new_declare_tree_node ()
+{
+ NT_node p;
+ static int node_index;
+ p= (NT_node)vcalloc (1, sizeof ( Treenode));
+ p->left = NULL;
+ p->right = NULL;
+ p->parent = NULL;
+ p->dist = 0.0;
+ p->leaf = 0;
+ p->order = 0;
+ p->index=++node_index;
+ p->maxnseq=1000;
+ p->name=(char*)vcalloc (MAXNAMES+1,sizeof (char));
+ p->name[0]='\0';
+ return p;
+
+ }
+int new_display_tree (NT_node T, int n)
+{
+ int in;
+
+ in=n;
+
+
+ if ( T->parent)fprintf (stdout, "\nNode %d: has parents)", in);
+ else fprintf (stdout, "\nNode %d: NO parents)", in);
+
+ if ( T->right)
+ {
+ fprintf (stdout, "\nNode %d has Right Child", in);
+ n=new_display_tree (T->right, n+1);
+ }
+ else fprintf ( stdout, "\nNode %d No Right\n", in);
+
+ if ( T->left)
+ {
+ fprintf (stdout, "\nNode %d has Left Child", in);
+ n=new_display_tree (T->left, n+1);
+ }
+ else fprintf ( stdout, "\nNode %d No Left\n", in);
+
+ if ( T->bot)
+ {
+ fprintf (stdout, "\nNode %d has Bot Child", in);
+ n=new_display_tree (T->bot, n+1);
+ }
+ else fprintf ( stdout, "\nNode %d No Bot\n", in);
+
+
+ if (T->isseq)
+ {
+ fprintf (stdout, "\nNode %d is %s", in, T->name);
+ return in;
+ }
+ else return 0;}
+int display_tree_duplicates (NT_node T)
+{
+ static Sequence *S;
+ static int *dup;
+ int a, b;
+
+ free_sequence (S, -1);
+ vfree (dup);
+
+ S=tree2seq (T, NULL);
+ dup=vcalloc ( S->nseq, sizeof (int));
+
+ for (a=0; a< S->nseq-1; a++)
+ for ( b=a+1; b<S->nseq; b++)
+ {
+ if ( strm (S->name[a], S->name[b]))
+ {
+ dup[a]++;
+ }
+ }
+ for (a=0; a< S->nseq-1; a++)
+ for ( b=a+1; b<S->nseq; b++)
+ {
+ if ( strm (S->name[a], S->name[b]) && dup[a])
+ {
+ fprintf ( stderr, "\nSequence %s is duplicated %d Times in the tree", S->name[a], dup[a]);
+ dup[a]=0;
+ }
+ }
+ return 0;
+}
+int tree_contains_duplicates (NT_node T)
+{
+ static Sequence *S;
+ int a, b;
+
+ free_sequence (S, -1);
+
+ S=tree2seq (T, NULL);
+ for (a=0; a< S->nseq-1; a++)
+ for ( b=a+1; b<S->nseq; b++)
+ {
+ if ( strm (S->name[a], S->name[b]))return 1;
+ }
+ return 0;
+}
+
+float display_avg_bootstrap ( NT_node T)
+{
+ float tot;
+ int n;
+
+ tot=tree2tot_dist (T, BOOTSTRAP);
+ n=tree2n_branches (T, BOOTSTRAP);
+ fprintf ( stdout, "\nAVERAGE BOOTSRAP: %.3f on %d Branches\n", (n>0)?tot/n:0, n);
+ return (n>0)?tot/n:0;
+}
+
+
+int tree2n_branches(NT_node T, int mode)
+{
+ int n=0;
+
+ if (!T) return 0;
+ if (!T->parent);
+ else if ((T->isseq && mode !=BOOTSTRAP) || !T->isseq)
+ {
+ n++;
+ }
+ n+=tree2n_branches(T->right, mode);
+ n+=tree2n_branches(T->left, mode);
+
+ return n;
+}
+
+float tree2tot_dist ( NT_node T, int mode)
+{
+ float t=0;
+
+
+ if ( !T)return 0;
+
+ if ( !T->parent);
+ else if ((T->isseq && mode !=BOOTSTRAP) || !T->isseq)
+ {
+ if ( mode == BOOTSTRAP && T->bootstrap!=0)t+=T->bootstrap;
+ else t+=T->dist;
+ }
+
+ t+=tree2tot_dist(T->right, mode);
+ t+=tree2tot_dist(T->left, mode);
+ return t;
+}
+
+//This function displays all the sequences within the tree sorted by node label
+int cmp_tree_array ( const void *vp, const void *vq);
+int node_sort ( char *name, NT_node T)
+{
+ NT_node N;
+ int nseq;
+ int **array, a;
+ Sequence *S;
+ while (T->parent)T=T->parent;
+
+ nseq=tree2nseq (T);
+ array=declare_int (nseq, 2);
+ N=tree2node (name, T);
+
+ if (N==NULL)printf_exit (EXIT_FAILURE, stderr, "ERROR: %s is not in the tree [FATAL:%s]\n", name, PROGRAM);
+ array=display_tree_from_node (N,0,0, array);
+ qsort ( array, nseq, sizeof (int**), cmp_tree_array);
+ S=tree2seq(T, NULL);
+ for (a=0; a<nseq; a++)
+ fprintf ( stdout, ">%s %d %d\n", S->name[array[a][0]], array[a][1], array[a][2]);
+ myexit (EXIT_SUCCESS);
+}
+
+NT_node tree2root ( NT_node R)
+{
+ if (R)while (R->parent)R=R->parent;
+ return R;
+}
+
+NT_node tree2node (char *name, NT_node T)
+{
+ NT_node T1, T2;
+ if ( !T) return T;
+ else if (T->leaf && strm (T->name, name)) return T;
+ else
+ {
+
+ T1=tree2node ( name, T->right);
+ T2=tree2node ( name, T->left);
+ return (T1>T2)?T1:T2;
+ }
+
+}
+static int ni;
+NT_node * tree2node_list (NT_node T, NT_node *L)
+{
+
+
+ if (!T) return NULL;
+ if (!L) {ni=0;L=vcalloc (tree2nnode(T)+1, sizeof (NT_node));}
+ tree2node_list (T->left, L);
+ tree2node_list (T->right, L);
+ L[ni++]=T;
+ return L;
+}
+
+
+
+int ** display_tree_from_node (NT_node T, int up, int down, int **array)
+{
+
+ if (!T || T->visited)return array;
+
+ T->visited=1;
+ if (T->isseq)
+ {
+ array[T->lseq[0]][0]=T->lseq[0];
+ array[T->lseq[0]][1]=up;
+ array[T->lseq[0]][2]=down;
+
+ }
+ else
+ {
+ array=display_tree_from_node ( T->left ,up, down+1, array);
+ array=display_tree_from_node ( T->right,up, down+1, array);
+ }
+ array=display_tree_from_node ( T->parent,up+1, 0, array);
+ T->visited=0;
+ return array;
+
+}
+
+int cmp_tree_array ( const void *p, const void *q)
+{
+ const int **vp=(const int**)p;
+ const int **vq=(const int**)q;
+ if (vp[0][1]>vq[0][1])return 1;
+ else if ( vp[0][1]<vq[0][1]) return -1;
+ else if ( vp[0][2]>vq[0][2]) return 1;
+ else if ( vp[0][2]<vq[0][2]) return -1;
+ else return 0;
+}
+
+NT_node * read_tree_list (Sequence *S)
+{
+ NT_node *T;
+ int a;
+
+ T=vcalloc ( S->nseq+1, sizeof (NT_node));
+
+ for ( a=0; a<S->nseq; a++)
+ {
+ char *fname;
+ if (S->seq && S->seq[a] && strlen (S->seq[a])<2)
+ fname=S->name[a];
+ else
+ string2file ((fname=vtmpnam(NULL)), "w", S->seq[a]);
+
+ T[a]=main_read_tree (fname);
+ T[a]->file=vcalloc (strlen (S->name[a])+1, sizeof (char));
+ sprintf (T[a]->file, "%s", S->name[a]);
+ }
+ return T;
+}
+
+int treelist2dmat ( Sequence *S)
+{
+ NT_node *T;
+ int n=0, a, b;
+ float v;
+ Sequence *TS;
+
+
+
+ n=S->nseq;
+ T=read_tree_list (S);
+ TS=tree2seq(T[0], NULL);
+ fprintf (stdout, "\n%d", S->nseq);
+ for (a=0; a<n; a++)
+ {
+ fprintf ( stdout,"\n%-10s ", S->name[a]);
+ for ( b=0; b<n; b++)
+ {
+ v=100-simple_tree_cmp (T[a], T[b], TS, 1);
+ fprintf ( stdout, "%.4f ", v);
+ }
+
+ }
+
+ myexit (EXIT_SUCCESS);
+ return 0;
+}
+
+int treelist2leafgroup ( Sequence *S, Sequence *TS, char *taxon)
+{
+ NT_node *T;
+ int n=0,nseq, a, c,s;
+
+ int *used;
+
+ char *split_file, *sorted_split_file;
+ char *buf=NULL;
+ FILE *fp;
+ char *name, *fname, *group, *ref_group, *list;
+
+
+
+ T=read_tree_list (S);
+ if (!TS)TS=tree2seq(T[0], NULL);
+
+ name=vcalloc (1000, sizeof (char));
+ fname=vcalloc (1000, sizeof (char));
+ group=vcalloc (TS->nseq*10, sizeof (char));
+ ref_group=vcalloc (TS->nseq*10, sizeof (char));
+ list=vcalloc (100*S->nseq, sizeof (char));
+ split_file=vtmpnam (NULL);
+ sorted_split_file =vtmpnam (NULL);
+
+ n=S->nseq;
+ used=vcalloc (n, sizeof (int));
+
+ T=read_tree_list (S);
+ if (!TS)TS=tree2seq(T[0], NULL);
+ nseq=TS->nseq;
+ fp=vfopen (split_file, "w");
+
+ for ( a=0; a< S->nseq; a++)
+ {
+
+ T[a]=prune_tree (T[a], TS);
+ T[a]=recode_tree (T[a], TS);
+ display_leaf_nb (T[a], TS->nseq,fp, S->name[a]);
+ }
+ vfclose (fp);
+
+
+ for (s=0; s< TS->nseq; s++)
+ {
+ int i;
+
+
+ if (taxon && !(strm (taxon, TS->name[s]) ))continue;
+ else
+ printf_system ( "cat %s | grep %s| sort > %s::IGNORE_FAILURE::", split_file,TS->name[s], sorted_split_file);
+
+ vfopen (sorted_split_file, "r");
+ ref_group[0]=group[0]='\0';
+
+ while ( (c=fgetc (fp))!=EOF)
+ {
+
+ ungetc (c, fp);
+ buf=vfgets (buf, fp);
+ sscanf (buf, "%s %s %s\n", name, group, fname);
+
+ if ( !ref_group[0]|| !strm (group, ref_group))
+ {
+ if (ref_group[0])
+
+ {fprintf (stdout, "%s %6.2f %s",name, (((float)n*100)/(float)S->nseq), ref_group);
+ for (i=0,a=0; a<nseq; a++)
+ if (ref_group[a]=='1' && a!=s)fprintf (stdout, " %s ", TS->name[a]);
+ fprintf ( stdout, "\n");
+ fprintf (stdout, "\nLIST: %s\n", list);
+ }
+ list[0]='\0';
+ sprintf ( ref_group, "%s", group);
+ list=strcatf (list, " %s", fname);
+ n=1;
+ }
+ else
+ {
+
+ list=strcatf (list, " %s", fname);
+ n++;
+ }
+ }
+
+ fprintf (stdout, "%s %6.2f %s",name, (((float)n*100)/(float)S->nseq), group);
+ for (i=0,a=0; a<nseq; a++)
+ if (group[a]=='1' && a!=s)fprintf (stdout, " %s ", TS->name[a]);
+ fprintf (stdout, "\nLIST %s\n", list);
+ fprintf ( stdout, "\n");
+ vfclose (fp);
+ }
+
+ myexit (0);
+}
+int count_tree_groups( Sequence *LIST, char *group_file)
+{
+ NT_node *T;
+ Sequence *S;
+ int a, b, c,n, w, wo, ng=0;
+ int **list, ***rlist, **blist;
+ char ***l;
+ int *gs;
+
+
+
+ T=read_tree_list (LIST);
+ S=tree2seq(T[0], NULL);
+ for ( a=0; a< LIST->nseq; a++)
+ {
+ T[a]=prune_tree (T[a], S);
+ T[a]=recode_tree (T[a], S);
+ }
+
+
+
+ gs=vcalloc (2, sizeof (int));
+ list=declare_int (LIST->nseq*S->nseq*2, S->nseq+1);
+
+ blist=declare_int (2, S->nseq+1);
+ for ( n=0, a=0; a< LIST->nseq; a++)
+ {
+ int n2=0;
+ tree2split_list (T[a], S->nseq, list+n, &n2);
+ n+=n2;
+
+
+ for (b=0; b<n2; b++)
+ {
+ for ( c=0; c<S->nseq; c++)
+ list[n+b][c]=1-list[n-n2+b][c];
+ }
+ n+=n2;
+ }
+
+ if ( group_file)
+ {
+ rlist=declare_arrayN(3, sizeof (int), 2,LIST->nseq*S->nseq, S->nseq+1);
+ l=file2list (group_file, " ");
+
+ while (l[ng])
+ {
+ int i, b, g;
+ if (!strstr (l[ng][1], "group")){ng++;continue;}
+ g=(strm (l[ng][1], "group2"))?0:1;
+
+ for (b=2; b<atoi (l[ng][0]); b++)
+ {
+ if ((i=name_is_in_list(l[ng][b], S->name, S->nseq, 100))!=-1)rlist[g][gs[g]][i]=1;
+ }
+ gs[g]++;
+ ng++;
+ }
+ }
+ else
+ {
+ rlist=vcalloc ( 2, sizeof (int**));
+ rlist[1]=count_int_strings (list, n, S->nseq);
+ gs[1]=read_array_size_new (rlist[1]);
+
+ rlist[0]=declare_int (S->nseq, S->nseq);
+ gs[0]=S->nseq;
+ for ( a=0; a<S->nseq; a++)rlist[0][a][a]=1;
+ }
+
+
+ for (wo=w=0,a=0; a<gs[0]; a++)
+ {
+ for (c=0; c< gs[1]; c++)
+ {
+ wo=w=0;
+ for (b=0; b<S->nseq; b++)
+ {
+ blist[0][b]=blist[1][b]=rlist[1][c][b];
+ blist[0][b]=(rlist[0][a][b]==1)?1:blist[0][b]; //WITH GROUP 1
+ blist[1][b]=(rlist[0][a][b]==1)?0:blist[1][b]; //wiTHOUT gROUP 1
+
+ }
+ for (b=0; b<n; b++)
+ {
+ int x1, x2;
+ x1=(memcmp (blist[0], list[b], sizeof (int)*S->nseq)==0)?1:0;
+ w+=x1;
+ x2=(memcmp (blist[1], list[b], sizeof (int)*S->nseq)==0)?1:0;
+ wo+=x2;
+
+ }
+ fprintf ( stdout, "\n%d ", MIN(wo, w));
+ fprintf ( stdout, "(");
+ for (b=0; b<S->nseq; b++)if (rlist[1][c][b])fprintf ( stdout, "%s ",S->name[b]);
+ fprintf ( stdout, ") +/- (");
+ for (b=0; b<S->nseq; b++)if (rlist[0][a][b])fprintf ( stdout, "%s ",S->name[b]);
+
+ fprintf (stdout , ") + %d - %d Delta %d", w, wo, FABS((wo-w)));
+ }
+ }
+ myexit (0);
+}
+Split * print_split ( int n, int **list, Sequence *LIST, Sequence *S, char *buf, char *file);
+
+NT_node split2tree ( NT_node RT,Sequence *LIST, char *param)
+{
+ Split **S;
+ Alignment *A;
+ S=count_splits (RT, LIST, param);
+ A=seq2aln ((S[0])->S,NULL, KEEP_GAP);
+
+ return split2upgma_tree (S,A, A->nseq, "no");
+}
+
+Split** count_splits( NT_node RT,Sequence *LIST, char *param)
+{
+ NT_node *T, OrderT;
+ Sequence *S=NULL;
+ int a, b, c, d, n1, n2;
+ int **list1, **list2;
+ Split **SL;
+ int nb, tlist;
+ char *main_buf;
+ char *in=NULL,*in2=NULL, *out=NULL, order[100], filter[100];
+ FILE *fp, *fp2;
+ static char *def_param;
+ char *cache=NULL;
+ //+count_splits _NB_x_FILTER_<file>
+ //_<file is a fasta file containing the list of species to keep>
+ if (!def_param)def_param=vcalloc ( 10, sizeof (char));
+
+
+
+ if (!param)param=def_param;
+
+
+ strget_param (param, "_NB_", "0", "%d", &nb);
+ strget_param (param, "_TLIST_", "0", "%d", &tlist);
+ strget_param (param, "_ORDER_", "NO", "%s", order);
+ strget_param (param, "_FILTER_", "NO", "%s", filter);
+
+ fprintf ( stderr, "\nREAD TREE LIST [%d Trees...", LIST->nseq);
+ T=read_tree_list (LIST);
+ fprintf ( stderr, "..]");
+
+ if ( !(strm (order, "NO")))
+ {
+ if (is_newick (order))
+ {
+ OrderT=main_read_tree (order);
+ }
+ else
+ {
+ S=main_read_seq (order);
+ }
+ }
+ else
+ {
+ OrderT=(RT)?RT:T[0];
+ }
+ fprintf ( stderr, "\nTrees Ordered according to: %s", (strm (order, "NO"))?"First Tree":order);
+
+
+ if (!S)S=tree2seq(OrderT, NULL);
+
+ for (a=0; a<S->nseq; a++)
+ {
+ fprintf ( stdout, "\n#ORDER %15s : %3d", S->name[a], a+1);
+ }
+ if ( !strm (filter, "NO"))
+ {
+ Sequence *F;
+ int i;
+
+ F=main_read_seq (filter);
+ cache=vcalloc (S->nseq, sizeof (int));
+ for ( a=0; a<F->nseq; a++)
+ {
+ if ( (i=name_is_in_list (F->name[a], S->name, S->nseq, 100))!=-1)
+ cache[i]=1;
+ }
+ free_sequence (F, -1);
+ }
+
+ main_buf=vcalloc ( S->nseq*(STRING+1), sizeof(int));
+
+ list1=declare_int (S->nseq*3, S->nseq+1);
+ list2=declare_int (S->nseq*3, S->nseq+1);
+
+ for ( a=0; a< LIST->nseq; a++)
+ {
+ T[a]=prune_tree (T[a], S);
+ T[a]=recode_tree (T[a], S);
+ }
+
+
+
+ if (!RT)
+ {
+ char *buf;
+ int i,nl;
+
+ in=vtmpnam (NULL);in2=vtmpnam(NULL); out=vtmpnam (NULL);
+
+ fp=vfopen (in, "w");
+ fp2=vfopen (in2, "w");
+ for ( a=0; a< LIST->nseq; a++)
+ {
+ n2=0;
+ tree2split_list (T[a], S->nseq, list2, &n2);
+ for ( b=0; b<n2; b++)
+ {
+ for (c=0; c< S->nseq; c++)
+ {fprintf (fp, "%d", list2[b][c]);}
+ fprintf (fp, "\n");
+ for (c=0; c< S->nseq; c++)
+ {fprintf (fp, "%d", 1-list2[b][c]);}
+ fprintf (fp, "\n");
+
+ for (c=0; c< S->nseq; c++)
+ {fprintf (fp2, "%d", list2[b][c]);}
+ fprintf (fp2, " ");
+ for (c=0; c< S->nseq; c++)
+ {fprintf (fp2, "%d", 1-list2[b][c]);}
+ fprintf (fp2, " %s\n",LIST->name[a]);
+ }
+ }
+ vfclose (fp2);
+ vfclose (fp);
+
+ count_strings_in_file (in, out);
+ nl=count_n_line_in_file(out);
+ list1=declare_int (nl+1, S->nseq+2);
+
+ fp=vfopen (out, "r");
+ n1=0;
+ buf=vcalloc (measure_longest_line_in_file (out)+1, sizeof (char));
+ while ( fscanf (fp, "%s %d",buf, &i)==2)
+ {
+ for (a=0; a<S->nseq; a++)list1[n1][a]=buf[a]-'0';
+ list1[n1++][S->nseq+1]=i;
+ }
+ vfclose (fp);
+ vfree (buf);
+ }
+ else
+ {
+
+ RT=prune_tree (RT, S);
+ RT=recode_tree (RT, S);
+ n1=0;
+ tree2split_list (RT, S->nseq, list1,&n1);
+ for ( a=0; a< LIST->nseq; a++)
+ {
+ n2=0;
+ tree2split_list (T[a], S->nseq, list2, &n2);
+ for (b=0; b<n1; b++)
+ {
+ for ( c=0; c<n2; c++)
+ {
+ int di=0;
+ for (d=0; d<S->nseq; d++)
+ {
+ if (list1[b][d]!=list2[c][d])di++;
+ }
+ list1[b][S->nseq+1]+=(di==0 || di== S->nseq)?1:0;
+ }
+
+ }
+ }
+ }
+ SL=vcalloc ( n1+1, sizeof (Split*));
+
+ for (a=0; a<n1; a++)
+ {
+ int s1, s2;
+ int cont=1;
+ if (nb)fprintf ( stdout, "\nSPLIT: %d and its Neighborhood +/^- %d\n", a+1, nb);
+
+ if (cache)
+ for (b=0; b<S->nseq; b++)if (cache[b]!=list1[a][b])cont=0;
+ if (!cont) continue;
+
+ SL[a]=print_split (a, list1, LIST, S, main_buf, (tlist==1)?in2:NULL);
+ for (b=0; b<n1; b++)
+ {
+ if ( a==b)continue;
+ else
+ {
+
+ for (d=s1=s2=0,c=0; c<S->nseq; c++)
+ {
+ s1+=list1[b][c];
+ s2+=list1[a][c];
+ d+=(list1[a][c]!=list1[b][c])?1:0;
+ }
+
+ }
+ if (d<=nb &&((s1==s2)|| ((S->nseq-s1)==s2)))print_split (b, list1, LIST, S, main_buf, (tlist==1)?in2:NULL);
+ }
+ }
+ a=0;
+ vfree (cache);
+ return SL;
+}
+Split * declare_split (int nseq, int ntrees);
+Split* print_split ( int a, int **list1, Sequence *LIST, Sequence *S, char *buf, char *split_file)
+ {
+ int f1,t,b;
+ Split *SP=NULL;
+
+
+
+ SP=declare_split (S->nseq, LIST->nseq);
+
+ fprintf ( stdout, "\n>");
+ for (t=0,b=0; b<S->nseq; b++){fprintf ( stdout, "%d", list1[a][b]);t+=list1[a][b];SP->split[b]='0'+list1[a][b];}
+ fprintf ( stdout, " NumberSplit %5d SplitSize %5d Score %5.2f %s ", list1[a][S->nseq+1],t, (float)(list1[a][S->nseq+1]*100)/LIST->nseq, (buf)?buf:"");
+ SP->n= list1[a][S->nseq+1];
+ SP->score=(float)(list1[a][S->nseq+1]*100)/LIST->nseq;
+ SP->S=S;
+
+ for (f1=1,b=0; b< S->nseq; b++)
+ {
+
+ if (list1[a][b])
+ {
+ if (f1==1)fprintf ( stdout, "(");
+ else fprintf (stdout, ",");
+ f1=0;
+ fprintf ( stdout, "%s", S->name [b]);
+ }
+ }
+ fprintf ( stdout, ")");
+ if (split_file)
+ {
+ char *buf=NULL;
+ FILE *fp;
+
+ char c;
+ fp=vfopen (split_file, "r");
+ while ( (c=fgetc(fp))!=EOF)
+ {
+
+ c=ungetc (c, fp);
+ buf=vfgets (buf, fp);
+ if ( strstr (buf, SP->split))
+ {
+ char **list;
+ list=string2list (buf);
+ fprintf ( stdout, "\n\t%s %s", SP->split, list[3]);
+ free_char (list, -1);
+ }
+ }
+ vfclose (fp);
+ }
+
+ return SP;
+ }
+Split * declare_split (int nseq, int ntrees)
+{
+ Split *S;
+ S=vcalloc (1, sizeof (Split));
+ S->split=vcalloc ( nseq+1, sizeof (char));
+ return S;
+}
+int treelist2splits( Sequence *S, Sequence *TS)
+{
+ NT_node *T;
+ int n=0,nseq, a, c;
+
+ int *used;
+
+ char *split_file, *sorted_split_file;
+ char *buf=NULL, *ref_buf=NULL;
+ FILE *fp;
+
+ split_file=vtmpnam (NULL);
+ sorted_split_file =vtmpnam (NULL);
+
+ n=S->nseq;
+ used=vcalloc (n, sizeof (int));
+
+ T=read_tree_list (S);
+ if (!TS)TS=tree2seq(T[0], NULL);
+ nseq=TS->nseq;
+ fp=vfopen (split_file, "w");
+
+
+ for ( a=0; a< S->nseq; a++)
+ {
+
+ T[a]=prune_tree (T[a], TS);
+ T[a]=recode_tree (T[a], TS);
+ display_splits (T[a], TS,fp);
+ }
+
+ vfclose (fp);
+ printf_system ("cp %s split_file::IGNORE_FAILURE::", split_file);
+ printf_system ( "cat %s | grep 1| sort > %s::IGNORE_FAILURE::", split_file, sorted_split_file);
+
+ fp=vfopen (sorted_split_file, "r");
+ fprintf (stdout, "LEGEND: <#occurences> <coded split> <min group size> <(group1,)> <(group2,>\n");
+
+ for ( a=0; a<TS->nseq; a++)fprintf ( stdout, "SEQ_INDEX %d %s\n", a+1, TS->name[a]);
+ while ( (c=fgetc (fp))!=EOF)
+ {
+
+ ungetc (c, fp);
+ buf=vfgets (buf, fp);
+ buf [strlen(buf)-1]='\0';
+
+ if ( ref_buf==NULL)
+ {
+ ref_buf=vcalloc (strlen (buf)+1, sizeof (char));
+ sprintf ( ref_buf, "%s", buf);
+ n=1;
+ }
+ else if ( !strm (buf, ref_buf))
+ {
+ int i;
+ fprintf ( stdout, "SPLIT_COUNT %3d %s (", n, ref_buf);
+ for (i=0,a=0; a<nseq; a++)
+ if (ref_buf[a]=='1')
+ {
+ if (i==1)fprintf(stdout, ",");
+ fprintf (stdout, "%s", TS->name[a]);
+ i=1;
+ }
+ fprintf ( stdout, "),(");
+ for (i=0,a=0; a<nseq; a++)
+ if (ref_buf[a]=='0')
+ {
+ if (i==1) fprintf ( stdout, ",");
+ fprintf (stdout, "%s", TS->name[a]);
+ i=1;
+ }
+
+ fprintf (stdout, ")\n");
+ sprintf ( ref_buf, "%s", buf);
+ n=1;
+ }
+ else
+ {
+ n++;
+ }
+ }
+ vfclose (fp);
+
+
+ myexit (0);
+}
+
+int treelist2splits_old ( Sequence *S, Sequence *TS)
+{
+ NT_node *T;
+ int n=0,nseq, a,c;
+
+ int *used;
+
+ char *split_file, *sorted_split_file;
+ char *buf=NULL, *ref_buf=NULL;
+ FILE *fp;
+
+ split_file=vtmpnam (NULL);
+ sorted_split_file =vtmpnam (NULL);
+
+ n=S->nseq;
+ used=vcalloc (n, sizeof (int));
+
+ T=read_tree_list (S);
+ if (!TS)TS=tree2seq(T[0], NULL);
+ nseq=TS->nseq;
+ fp=vfopen (split_file, "w");
+
+ for ( a=0; a< S->nseq; a++)
+ {
+
+ T[a]=prune_tree (T[a], TS);
+ T[a]=recode_tree (T[a], TS);
+ display_leaf_nb (T[a], TS->nseq,fp, S->name[a]);
+ }
+ vfclose (fp);
+ printf_system ("cp %s split_file::IGNORE_FAILURE::", split_file);myexit (0);
+
+ printf_system ( "cat %s | grep 1| sort > %s::IGNORE_FAILURE::", split_file, sorted_split_file);
+
+ vfopen (sorted_split_file, "r");
+
+ while ( (c=fgetc (fp))!=EOF)
+ {
+
+ ungetc (c, fp);
+ buf=vfgets (buf, fp);
+ buf [strlen(buf)-1]='\0';
+
+ if ( ref_buf==NULL)
+ {
+ ref_buf=vcalloc (strlen (buf)+1, sizeof (char));
+ sprintf ( ref_buf, "%s", buf);
+ n=1;
+ }
+ else if ( !strm (buf, ref_buf))
+ {
+ int i;
+ fprintf ( stdout, "%3d %s(", n, ref_buf);
+ for (i=0,a=0; a<nseq; a++)
+ if (ref_buf[a]=='1')
+ {
+ if (i==1)fprintf(stdout, ",");
+ fprintf (stdout, "%s", TS->name[a]);
+ i=1;
+ }
+ fprintf ( stdout, "),(");
+ for (i=0,a=0; a<nseq; a++)
+ if (ref_buf[a]=='0')
+ {
+ if (i==1) fprintf ( stdout, ",");
+ fprintf (stdout, "%s", TS->name[a]);
+ i=1;
+ }
+
+ fprintf (stdout, ")\n");
+ sprintf ( ref_buf, "%s", buf);
+ n=1;
+ }
+ else
+ {
+ n++;
+ }
+ }
+ vfclose (fp);
+
+
+ myexit (0);
+}
+
+NT_node *treelist2prune_treelist (Sequence *S, Sequence *TS, FILE *out)
+{
+ NT_node *T;
+ int a, b, c;
+
+ T=read_tree_list (S);
+ T=vrealloc (T, (S->nseq+1)*sizeof (NT_node));
+ for (b=0,a=0; a<S->nseq; a++)
+ {
+ T[a]=prune_tree (T[a], TS);
+ if (tree2nleaf(T[a])<TS->nseq)
+ {
+ ;
+ }
+ else
+ {
+ char *s;
+ T[b]=T[a];
+ T[b]=recode_tree (T[b], TS);
+ sprintf ( S->name[b], "%s", S->name[a]);
+ s=tree2string (T[a]);
+ S->seq[b]=vrealloc (S->seq[b], (strlen (s)+1)*sizeof (char));
+ sprintf (S->seq[b], "%s",s);
+ sprintf (S->seq_comment[b], " NSPECIES: %d", TS->nseq);
+ vfree (s);
+
+ b++;
+ }
+
+ }
+
+ S->nseq=b;
+ T[S->nseq]=NULL;
+
+ if (out)
+ {
+ for (a=0; a<S->nseq; a++)
+ {
+ print_tree (T[a], "newick", out);
+ }
+ }
+ return T;
+}
+int** treelist2lti2 ( Sequence *S, Sequence *TS, int ngb, FILE *out);
+int treelist2frame (Sequence *S, Sequence *TS)
+{
+ int n, a, b, c,d, **r, **order;
+ Sequence *temp;
+
+ temp=duplicate_sequence (S);
+ order= treelist2lti (temp, TS,0,stdout);
+
+ TS=reorder_seq_2 (TS, order, 0, TS->nseq);
+ n=TS->nseq;
+
+ for (a=3; a<n; a++)
+ {
+ NT_node tree;
+
+ TS->nseq=a+1;
+ temp=duplicate_sequence (S);
+ r=treelist2groups (temp,TS, NULL, NULL);
+ fprintf ( stdout, "\n>Tree_%d [%d %%]\n ", a+1,r[0][1]);
+ tree=main_read_tree (temp->name[r[0][0]]);
+ tree=prune_tree (tree, TS);
+ print_tree (tree, "newick",stdout);
+
+ free_int (r, -1);
+ free_sequence (temp,-1);
+ }
+ myexit (EXIT_SUCCESS);
+}
+int** treelist2lti2 ( Sequence *S, Sequence *TS, int ngb, FILE *out)
+{
+ NT_node *T;
+ int a,b, c, d, ****dist, i;
+ int **score, **order;
+
+ score=declare_int (TS->nseq, 3);
+ order=declare_int (TS->nseq, 2);
+ vsrand (0);
+
+ for (a=0; a<50; a++)
+ {
+ Sequence *seq, *trees;
+ int **r;
+ trees=duplicate_sequence (S);
+ seq=duplicate_sequence (TS);
+ for (b=0; b<TS->nseq; b++){order[b][0]=b;order[b][1]=rand()%10000;}
+ sort_int (order, 2, 1, 0, TS->nseq-1);
+ seq=reorder_seq_2(seq, order, 0,5);
+ r=treelist2groups (trees,seq, NULL, NULL);
+
+ for (b=0; b<5; b++)
+ {
+ score[order[b][0]][1]+=r[0][1];
+ score[order[b][0]][2]++;
+ }
+ HERE ("Score=%d", r[0][1]);
+ free_int (r, -1);
+ free_sequence (seq, -1);
+ free_sequence (trees, -1);
+
+ }
+
+ for ( a=0; a< TS->nseq; a++)
+ {
+ score[a][0]=a;
+ HERE ("%s => %d [%d]",TS->name[a], score[a][1]/score[a][2], score[a][2]);
+ score[a][1]/=(score[a][2])?score[a][2]:1;
+ }
+ sort_int_inv (score, 3, 1, 0, TS->nseq-1);
+
+ return score;
+}
+
+
+int** treelist2lti ( Sequence *S, Sequence *TS, int ngb, FILE *out)
+{
+ NT_node *T;
+ int a,b, c, d, ****dist, i;
+ float score0=0, score1=0;
+ int **result;
+
+
+ i=S->nseq;
+ T=treelist2prune_treelist (S, TS,NULL);
+
+ if (!ngb)ngb=TS->nseq*2;
+ dist=vcalloc ( S->nseq, sizeof (int****));
+ result=declare_int (TS->nseq, 2);
+ for (a=0; a<TS->nseq; a++)
+ {
+ float score_seq=0;
+ float n_seq=0;
+ for (b=0; b<TS->nseq;b++)
+ {
+ float score_pair=0;
+ float n_pair=0;
+ for (c=0; c<S->nseq; c++)
+ {
+ if (!dist[c])dist[c]=tree2dist(T[c], TS, NULL);
+ for (d=0; d<S->nseq; d++)
+ {
+ float score, d1, d2;
+
+ if (!dist[d])dist[d]=tree2dist(T[d], TS, NULL);
+ d1=dist[c][0][a][b];
+ d2=dist[d][0][a][b];
+ score=FABS((d1-d2));
+ if (d1>ngb || d2>ngb);
+ else
+ {
+ score_seq+=score;
+ score_pair+=score;
+ n_seq++;
+ n_pair++;
+ }
+ // if (d1 && d2) HERE ("%d %d", (int)d1, (int)d2);
+ }
+ }
+ score_pair=(score_pair*100)/(float)n_pair;
+ if (out)fprintf ( stdout, "\n>%-20s %-20s LTI: %7.3f [Kept %d Trees Out of %d] ", TS->name[a],TS->name[b], score_pair, S->nseq,i);
+ }
+
+ score_seq=(score_seq*100)/n_seq;
+ result[a][0]=a;
+ result[a][1]=(int)(100*score_seq);
+ if (out)fprintf ( stdout, "\n>%-20s %-20s LTI: %7.3f [Kept %d Trees Out of %d] ", TS->name[a],"*", score_seq, S->nseq, i);
+ }
+ sort_int (result,2,1,0, TS->nseq-1);
+ return result;
+}
+
+
+int ***tree2dist (NT_node T, Sequence *S, int ***d)
+{
+ int *l0, *r0,*l1, *r1, a, b;
+
+
+ if (!T) return d;
+ if (!S)S=tree2seq(T, NULL);
+ if (!d)
+ {
+ d=declare_arrayN (3, sizeof (float),2, S->nseq, S->nseq);
+ T=prune_tree(T, S);
+ T=recode_tree (T, S);
+ }
+
+ if (!T->left)return d;
+ if (!T->right) return d;
+
+ l0=(T->left)->idist;
+ r0=(T->right)->idist;
+
+ l1=(T->left)->ldist;
+ r1=(T->right)->ldist;
+
+
+
+ for (a=0; a< S->nseq; a++)
+ for (b=0; b<S->nseq; b++)
+ {
+ if (l0[a]>0 && r0[b]>0)d[0][a][b]=d[0][b][a]=l0[a]+r0[b];
+ if (l0[a]>0 && r0[b]>0)d[1][a][b]=d[1][b][a]=l1[a]+r1[b];
+ }
+
+ d=tree2dist (T->left, S, d);
+ d=tree2dist (T->right, S, d);
+
+
+ return d;
+}
+
+
+
+int **tree2dist_split ( NT_node T, Sequence *S, int **dist)
+{
+
+ FILE *fp;
+ int a, b, c, n=0;
+ char *buf=NULL, **list=NULL, *split_file;
+
+
+ if (!S)S=tree2seq(T, NULL);
+
+ T=prune_tree (T, S);
+ T=recode_tree (T, S);
+
+ split_file=vtmpnam (NULL);
+ fp=vfopen (split_file, "w");
+ display_code (T, S->nseq,fp);
+ vfclose (fp);
+
+ list=declare_char (2*S->nseq, S->nseq+1);
+ fp=vfopen (split_file, "r");
+
+ while ((buf=vfgets (buf,fp))!=NULL)
+ {
+ if (buf[0]=='1' || buf[0]=='0')sprintf (list[n++], "%s", buf);
+ }
+ vfclose (fp);
+ dist=declare_int ( S->nseq, S->nseq);
+ for (a=0; a< S->nseq; a++)
+ for ( b=0; b<S->nseq; b++)
+ for (c=0; c<n; c++)
+ if (list[c][a]!=list[c][b])dist[a][b]++;
+
+
+ return dist;
+}
+
+int** treelist2groups (Sequence *S, Sequence *TS, char *star_node, FILE *out)
+ {
+ NT_node *T;
+ int a, b, tot, n,i;
+ int v;
+ int *used;
+ int ntop;
+ int nsn;
+ int cov=100;
+ int **results;
+
+
+ i=S->nseq;
+ T=treelist2prune_treelist (S, TS,NULL);
+ nsn=(star_node)?atoi(star_node):0;
+
+ results=declare_int (S->nseq+1, 2);
+
+ if (nsn)
+ {
+ for (a=0; a< S->nseq; a++)tree2star_nodes(T[a],nsn);
+ }
+
+ used=vcalloc (S->nseq, sizeof (int));
+ for (ntop=0,a=0; a<S->nseq; a++)
+ {
+
+ if (used[a]==0)
+ {
+ ntop++;
+ if (out)fprintf ( out, "\nTree %s:",S->name[a]);
+ used[a]=1;
+ }
+ else continue;
+ tot=1;
+ for ( b=0; b<S->nseq; b++)
+ {
+ v=0;
+
+ v=(int)simple_tree_cmp (T[a], T[b], TS, 1);
+ if ( v==100)
+ {
+ used[b]=1;
+ used[a]++;
+ if (out)fprintf (stdout," %s ", S->name[b]);
+ tot++;
+ }
+ }
+
+ if (out)fprintf ( stdout, "__ N=%d\n", tot-1);
+ }
+
+
+ for (n=0,a=0; a<S->nseq; a++)
+ {
+ if ( used[a]>1)
+ {
+ if (out)fprintf ( out, "\n>%-15s %4d %6.2f TOPOLOGY_LIST\n", S->name[a], used[a]-1, (float)(((float)used[a]-1)*100/(float)S->nseq));
+ if (out)print_tree (T[a], "newick_tree", out);
+ results[n][0]=a;
+ results[n][1]=((used[a]-1)*100)/i;
+ n++;
+ }
+ }
+
+ for (a=0; a<S->nseq; a++) free_tree(T[a]);
+ vfree (T);
+
+ if (out)fprintf ( stdout, "\nTotal Number of different topologies: %d\n", ntop);
+ results[n][0]=-1;
+ sort_int_inv (results,2,1,0, n-1);
+ for (a=0; a<S->nseq; a++) free_tree(T[a]);
+ vfree (T);
+ return results;
+ }
+float simple_tree_cmp (NT_node T1, NT_node T2,Sequence *S, int mode)
+{
+ Tree_sim *TS1, *TS2;
+ float t, w, l, n;
+
+ TS1=vcalloc (1, sizeof (Tree_sim));
+ TS2=vcalloc (1, sizeof (Tree_sim));
+
+
+ T1=recode_tree(T1, S);
+ T2=recode_tree(T2, S);
+
+ n=new_compare_trees ( T1, T2, S->nseq, TS1);
+ new_compare_trees ( T2, T1, S->nseq, TS2);
+
+
+
+ t=(TS1->uw+TS2->uw)*100/(TS1->max_uw+TS2->max_uw);
+ w=(TS1->w+TS2->w)*100/(TS1->max_w+TS2->max_w);
+ l=(TS1->d+TS2->d)*100/(TS1->max_d+TS2->max_d);
+
+ vfree (TS1); vfree (TS2);
+ if ( mode ==1)return t;
+ else if (mode ==2) return w;
+ else return l;
+}
+int treelist2n (NT_node *L)
+{
+ int n=0;
+ while (L[n])n++;
+ return n;
+}
+int **treelist2avg_treecmp (NT_node *L, char *file)
+{
+ int a, b, n;
+ int **score;
+
+ if (file) L=read_tree_list (main_read_seq(file));
+ n=treelist2n (L);
+
+ score=declare_int (n, 2);
+ for (a=0; a<n; a++)score[a][0]=a;
+
+ for (a=0; a<n-1; a++)
+ {
+ output_completion (stderr,a,n,1, "Tree Cmp");
+ for (b=a+1; b<n; b++)
+ {
+ Tree_sim *ts;
+ ts=tree_cmp (L[a],L[b]);
+ score[a][1]+=ts->uw;
+ score[b][1]+=ts->uw;
+ vfree (ts);
+ }
+ }
+ sort_int_inv (score, 2, 1, 0, n-1);
+ if (file)free_treelist(L);
+ return score;
+}
+
+int treelist_file2consense (char *tree_file, char *outtree, char *outfile)
+{
+ static char *command;
+ static char *tmp_outtree;
+ static char *tmp_outfile;
+ FILE *fp;
+ int flag1=0;
+ int flag2=0;
+
+ if (!command)
+ {
+ command=vtmpnam (NULL);
+ tmp_outtree=vtmpnam (NULL);
+ tmp_outfile=vtmpnam (NULL);
+ }
+ if (!check_program_is_installed ("consense",NULL,NULL,"www.phylip.com",NO_REPORT))return 0;
+
+ fp=vfopen (command, "w");fprintf ( fp, "%s\nY\n", tree_file);fclose (fp);
+ if ( check_file_exists ("outtree")){flag1=1;printf_system ("mv outtree %s::IGNORE_FAILURE::", tmp_outtree);}
+ if ( check_file_exists ("outfile")){flag2=1;printf_system ("mv outfile %s::IGNORE_FAILURE::", tmp_outfile);}
+ printf_system ("consense <%s > /dev/null 2>/dev/null::IGNORE_FAILURE::", command);
+
+ if ( outtree)printf_system ("mv outtree %s::IGNORE_FAILURE::", outtree);
+ remove ("outtree");
+ if ( outfile)printf_system ("mv outfile %s::IGNORE_FAILURE::", outfile);
+ remove ("outfile");
+ if (flag1)printf_system ("mv %s outtree::IGNORE_FAILURE::", tmp_outtree);
+ if (flag2)printf_system ("mv %s outfile::IGNORE_FAILURE::", tmp_outfile);
+ return 1;
+}
+
+
+NT_node treelist2filtered_bootstrap ( NT_node *L,char *file, int **score, float t)
+{
+ NT_node BT, *L2;
+ int n,a;
+
+ if (t==1 || t==0 || !score)return treelist2bootstrap (L, file);
+
+ if (file)L=read_tree_list (main_read_seq(file));
+
+ n=treelist2n(L)*t;
+
+ if (n==0) return NULL;
+
+ L2=vcalloc ( n+1, sizeof (NT_node));
+ for (a=0; a<n; a++)
+ L2[a]=L[score[a][0]];
+
+ BT=treelist2bootstrap (L2, NULL);
+
+ vfree (L2);
+ if (file)free_treelist(L);
+ return BT;
+}
+
+NT_node treelist2bootstrap ( NT_node *L, char *file)
+{
+ char *outfile;
+ NT_node T;
+ FILE *fp;
+
+ if (!file)
+ {
+ file=vtmpnam (NULL);
+ vfclose (print_tree_list (L,"newick", vfopen (file, "w")));
+ }
+
+ outfile=vtmpnam (NULL);
+
+ printf_system( "msa2bootstrap.pl -i %s -o %s -input tree >/dev/null 2>/dev/null", file, outfile);
+
+ T=main_read_tree (outfile);
+ T=tree_dist2normalized_tree_dist (T,treelist2n(L));
+
+
+ return T;
+}
+
+
+
+Sequence * treelist2seq (Sequence *S)
+{
+ int a, b, c, n, i;
+ char **name;
+ NT_node *T;
+ Sequence *TS;
+ char *fname;
+ FILE *fp;
+
+ name=vcalloc (1, sizeof (char*));
+ fp=vfopen ((fname=vtmpnam (NULL)), "w");
+
+ T=read_tree_list (S);
+ for (n=0,a=0; a< S->nseq; a++)
+ {
+ TS=tree2seq(T[a], NULL);
+ for (b=0; b<TS->nseq; b++)
+ {
+ if ( (i=name_is_in_list (TS->name[b], name, n, 100))==-1)
+ {
+ name[n]=vcalloc (100, sizeof (int));
+ sprintf ( name[n], "%s", TS->name[b]);
+ n++;
+ name=vrealloc (name, (n+1)*sizeof (char*));
+ fprintf ( fp, ">%s\n", TS->name[b]);
+ }
+ }
+ free_sequence(TS, TS->nseq);
+ free_tree (T[a]);
+ }
+
+ vfclose (fp);
+ vfree (T);
+ return get_fasta_sequence (fname, NULL);
+}
+
+
+Sequence * treelist2sub_seq ( Sequence *S, int f)
+{
+ NT_node *T;
+ int a,b,c, s, i, n, maxnseq, tot;
+ int **count, **grid;
+ char *fname;
+ Sequence *FS, *TS;
+ FILE *fp;
+ if (!f)return treelist2seq(S);
+
+
+ //keep as many taxons as possible so that f% of the trees are kept
+ //1: count the frequency of each taxon
+
+ FS=treelist2seq (S);
+ maxnseq=FS->nseq;
+
+ count=declare_int (maxnseq, 3);
+ grid=declare_int (S->nseq,maxnseq+1);
+ T=read_tree_list (S);
+
+
+
+ for (a=0; a<FS->nseq; a++){count[a][0]=a;count[a][2]=1;}
+ for (n=0,a=0; a< S->nseq; a++)
+ {
+ TS=tree2seq(T[a], NULL);
+ for (b=0; b<TS->nseq; b++)
+ {
+ i=name_is_in_list (TS->name[b], FS->name, FS->nseq, 100);
+ if ( i==-1){myexit (EXIT_FAILURE);}
+ count[i][1]++;
+ grid[a][i]=1;
+ }
+ free_sequence(TS, TS->nseq);
+ free_tree (T[a]);
+ }
+ vfree (T);
+ sort_int ( count,3,1, 0, maxnseq-1);
+
+ for (a=0; a<maxnseq; a++)
+ {
+ count[a][2]=0;
+ for ( b=0; b< S->nseq; b++)grid[b][maxnseq]=1;//prepare to keep everything
+ for ( tot=S->nseq, b=0; b< S->nseq; b++)
+ {
+ for (c=0; c<maxnseq; c++)
+ {
+ s=count[c][0];
+ if (count[c][2] && !grid[b][s])
+ {
+ grid[b][maxnseq]=0;
+ tot--;
+ break;
+ }
+ }
+ }
+ tot=(tot*100)/S->nseq;
+ if ( tot>=f)break;
+ }
+ if (tot<f)return NULL;
+
+ fname=vtmpnam (NULL);
+ fp=vfopen (fname, "w");
+ for (a=0; a<maxnseq; a++)
+ {
+ if (count[a][2])
+ {
+ fprintf ( fp, ">%s LIMIT: %d %%\n", FS->name[count[a][0]], f);
+
+ }
+ }
+ vfclose (fp);
+ free_int (grid, -1); free_int (count, -1);
+ free_sequence (FS, FS->nseq);
+
+ return get_fasta_sequence (fname, NULL);
+}
+/******************************COPYRIGHT NOTICE*******************************/
+/*© Centro de Regulacio Genomica */
+/*and */
+/*Cedric Notredame */
+/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
+/*All rights reserved.*/
+/*This file is part of T-COFFEE.*/
+/**/
+/* T-COFFEE is free software; you can redistribute it and/or modify*/
+/* it under the terms of the GNU General Public License as published by*/
+/* the Free Software Foundation; either version 2 of the License, or*/
+/* (at your option) any later version.*/
+/**/
+/* T-COFFEE is distributed in the hope that it will be useful,*/
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
+/* GNU General Public License for more details.*/
+/**/
+/* You should have received a copy of the GNU General Public License*/
+/* along with Foobar; if not, write to the Free Software*/
+/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
+/*............................................... |*/
+/* If you need some more information*/
+/* cedric.notredame@europe.com*/
+/*............................................... |*/
+/**/
+/**/
+/* */
+/******************************COPYRIGHT NOTICE*******************************/