X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=sources%2Foc%2Foc.c;fp=sources%2Foc%2Foc.c;h=3c497c294f0eb46091c942b7eeae7dd0f2ca9d9d;hb=81362e35a140cd040e948b921053e74267f8a6e3;hp=0000000000000000000000000000000000000000;hpb=2cf032f4b987ba747c04159965aed78e3820d942;p=jpred.git diff --git a/sources/oc/oc.c b/sources/oc/oc.c new file mode 100644 index 0000000..3c497c2 --- /dev/null +++ b/sources/oc/oc.c @@ -0,0 +1,1667 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* for regex's */ +#include +#include + +#include "gjutil.h" +#include "oc.h" + +/*#define VERY_SMALL DBL_MIN*/ +#define VERY_SMALL -DBL_MAX +#define VERY_BIG DBL_MAX +#define MAX_ID_LEN 30 + +/* Postscript plotting constants */ +#define POINT 72 +#define X_OFFSET 40 +#define Y_OFFSET 100 + +#define DEBUG + +float MAXside = 11.0 * POINT; /* Max dimensions of paper */ +float MINside = 8.0 * POINT; +float WTreeFraction = 0.9; /* Fraction of max taken up by Tree */ +float HTreeFraction = 0.9; /* (Calculated after subtracting offsets) */ +float TreeExtendFac = 0.05; /* Extension of tree beyond min/max value factor */ +double base_val = 1000.0; +float IdOffsetFactor = 0.01; +float NumberOffsetFactor = 0.05; +float NumberFontSize = 12; +float IdFontSize = 12; +float TitleFontSize = 12; + +/**************************************************************************** + Program oc: + + General purpose cluster analysis program. Written with flexibility + in mind rather than all out efficiency. + + Copyright(c) G. J. Barton (1993,1997,2002) + + Author: G. J. Barton + School of Life Sciences + University of Dundee + Dow St. + Dundee DD1 5EH + Scotland + + email:geoff@compbio.dundee.ac.uk + + Example input file: + +7 +First +Second +Third +Fourth +Fifth +Sixth +Seventh +100.0 +100.0 +50.0 +33.0 +25.0 +20.0 +100.0 +50.0 +50.0 +33.0 +33.0 +100.0 +33.0 +20.0 +25.0 +33.0 +20.0 +25.0 +100.0 +100.0 +100.0 + +11/8/93: Add single order option. This forces the clustering to only add +a single sequence at a time to the growing cluster. All other options are +equivalent to the full tree method. GJB. + +15/12/93: Correct bug in definition of VERY_SMALL ie now = -DBL_MAX. This +allows the complete and means linkage to work with negative numbers. + +04/08/2002:Update affiliation and also add gjnoc2 routine to the distribution. +----------------------------------------------------------------------------*/ + +int +main (int argc, char *argv[]) +{ + extern FILE *std_in, *std_out, *std_err; + extern double base_val; + int error; + int n; + int nclust; + int i, j, k, l; + int i1loc, j1loc; + int i2loc, j2loc; + int i3loc, j3loc; + double U1val, U2val, U3val, rval; + struct sc *clust; + double **arr; /* upper diagonal array structure */ + int *unclust; /* list of unclustered entities */ + int nunclust; /* number of unclustered entities */ + char **idents; /* array of idents for the entities */ + int *notparent; /* list of clusters that have no children */ + int nnotparent; /* number of clusters that have no children */ + int *inx; /* array of indexes - inx[i] is the re-ordered position */ + char *postfile; /* name of PostScript output file */ + + /* TIME INFORMATION */ + clock_t start_time, end_time, initial_time, final_time; + + /* options */ + int sim; /* similarity scores =1, distances = 2 */ + int type; /* single linkage = 1, complete linkage = 2, means = 3 */ + int option; /* use to feed the Grp_Grp function */ + int ps; + int dolog; /* flag to take logs on output to .ps file */ + double cut; + int do_cut; + int showid; + int amps_out; /* output tree file and order file for amps */ + int timeclus; /* set to 1 if timing output to std_err required */ + int single_order; + + /* amps files */ + char *amps_order_file; + char *amps_tree_file; + FILE *ford; + FILE *ftree; + FILE *fp; + + GJinitfile (); + + sim = 1; + type = 1; + ps = 0; /* default is no PostScript output */ + showid = 0; + do_cut = 0; + timeclus = 0; + amps_out = 0; + dolog = 0; + single_order = 0; + + start_time = clock (); + initial_time = start_time; + + if (argc > 1) + { + for (i = 1; i < argc; ++i) + { + if (strcmp (argv[i], "sim") == 0) + { + sim = 1; + base_val = VERY_BIG; + cut = VERY_SMALL; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "dis") == 0) + { + sim = 2; + base_val = VERY_SMALL; + cut = VERY_BIG; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "single") == 0) + { + type = 1; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "complete") == 0) + { + type = 2; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "means") == 0) + { + type = 3; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "order") == 0) + { + /* do a single order clustering rather than a full tree */ + single_order = 1; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "ps") == 0 + || strcmp (argv[i], "eps") == 0) + { + ++i; + ps = 1; + postfile = GJcat (2, argv[i], ".eps"); + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "log") == 0) + { + dolog = 1; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "cut") == 0) + { + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + ++i; + cut = atof (argv[i]); + do_cut = 1; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "amps") == 0) + { + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + amps_out = 1; + ++i; + if (single_order) + { + amps_order_file = GJcat (2, argv[i], ".ord"); + } + else + { + amps_tree_file = GJcat (2, argv[i], ".tree"); + amps_order_file = GJcat (2, argv[i], ".tord"); + } + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "id") == 0) + { + showid = 1; /* output idents rather than index */ + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "timeclus") == 0) + { + timeclus = 1; /* output idents rather than index */ + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "maxside") == 0) + { + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + ++i; + MAXside = atof (argv[i]) * POINT; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else if (strcmp (argv[i], "minside") == 0) + { + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + ++i; + MINside = atof (argv[i]) * POINT; + /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */ + } + else + { + fprintf (std_err, "Unrecognised Option: argv[%d]=%s\n", i, argv[i]); + } + } + } + else + { + fprintf (std_err, "Cluster analysis program\n\n"); + fprintf (std_err, "Author: G. J. Barton, University of Dundee, UK\n"); + fprintf (std_err, " http://www.compbio.dundee.ac.uk\n"); + fprintf (std_err, "Copyright: G. J. Barton (1993,1997,2002,2004)\n\n"); + fprintf (std_err, "Please cite: OC a cluster analysis program, Barton, G. J. 1993\n\n"); + fprintf (std_err, "Usage: oc \n\n"); + fprintf (std_err, "Version 2.1a (Apr. 2005) - Requires a file to be piped to standard input\n"); + fprintf (std_err, "Format: Line 1: Number (N) of entities to cluster (e.g. 10)\n"); + fprintf (std_err, "Format: Lines 2 to 2+N-1: Identifier codes for the entities (e.g. Entity1)\n"); + fprintf (std_err, "Format: N*(N-1)/2: Distances, or similarities - ie the upper diagonal\n\n"); + fprintf (std_err, "Options:\n"); + fprintf (std_err, "sim = similarity / dis = distances\n"); + fprintf (std_err, "method = single/complete/means\n"); + fprintf (std_err, "ps = plot out portrait dendrogram to \n"); + fprintf (std_err, "eps = plot out portrait dendrogram to \n"); + fprintf (std_err, "minside = define length of short (x) side of plot in inches \n"); + fprintf (std_err, "maxside = define length of long (y) side of plot in inches \n"); + fprintf (std_err, "log = take logs before calculation \n"); + fprintf (std_err, "cut = only show clusters above/below the cutoff\n"); + fprintf (std_err, "id = output identifier codes rather than indexes for entities\n"); + fprintf (std_err, "timeclus = output times to generate each cluster\n"); + fprintf (std_err, "amps = produce amps .tree and .tord files\n\n"); + fprintf (std_err, "See oc_manual.txt file for brief instructions\n"); + + exit (EXIT_SUCCESS); + } + + /* Read in the OC file */ + /* allocate and read in the upper diagonal array */ + fprintf (std_err, "Reading Upper Diagonal\n"); + n = read_num (std_in); + idents = read_idents (std_in, n); + arr = read_up_diag (std_in, n); + if (dolog) + { + up_diag_log (arr, n); + } + +/* write_up_diag(std_out,arr,n);*/ + fprintf (std_err, "Read: %d Entries\n", n); + /* File has been read in */ + + end_time = clock (); + fprintf (std_err, "CPU time: %f seconds\n", + ((float) (end_time - start_time)) / CLOCKS_PER_SEC); + start_time = end_time; + + /* holds index of unclustered entities */ + + fprintf (std_err, "Setting up unclust\n"); + unclust = (int *) GJmalloc (sizeof (int) * n); + for (i = 0; i < n; ++i) + { + unclust[i] = i; + } + nunclust = n; + + fprintf (std_err, "Setting up notparent\n"); + notparent = (int *) GJmalloc (sizeof (int) * n); + for (i = 0; i < n; ++i) + { + notparent[i] = 0; + } + nnotparent = 0; + + + + /* set up the clust array to hold results */ + /* there are always n-1 clusters formed - we will also store the nth (all entities) */ + fprintf (std_err, "Setting up clust\n"); + clust = (struct sc *) GJmalloc (sizeof (struct sc) * n); + end_time = clock (); + fprintf (std_err, "CPU time: %f seconds\n", + ((float) (end_time - start_time)) / CLOCKS_PER_SEC); + start_time = end_time; + + nclust = 0; + U1val = VERY_SMALL; + U2val = VERY_SMALL; + U3val = VERY_SMALL; + if (sim == 1) + { + if (type == 1) + { + fprintf (std_err, "Single linkage on similarity\n"); + option = 1; + } + else if (type == 2) + { + fprintf (std_err, "Complete linkage on similarity\n"); + option = 0; + } + else if (type == 3) + { + fprintf (std_err, "Means linkage on similarity\n"); + option = 2; + } + } + else + { + if (type == 1) + { + fprintf (std_err, "Single linkage on distance\n"); + option = 0; + } + else if (type == 2) + { + fprintf (std_err, "Complete linkage on distance\n"); + option = 1; + } + else if (type == 3) + { + fprintf (std_err, "Means linkage on distance\n"); + option = 2; + } + } + +/* implement max similarity single linkage first */ + + fprintf (std_err, "Doing Cluster Analysis...\n"); + while (nclust < (n - 1)) + { + if (sim == 1) + { + U1val = VERY_SMALL; + U2val = VERY_SMALL; + U3val = VERY_SMALL; + } + else + { + U1val = VERY_BIG; + U2val = VERY_BIG; + U3val = VERY_BIG; + } + if (nunclust > 0) + { +/* fprintf(std_err,"Option 1\n");*/ + if (!single_order || (single_order && nclust == 0)) + { + /* we've not clustered all entities into groups */ + /* find max of what is left */ + for (i = 0; i < nunclust - 1; ++i) + { + for (j = (i + 1); j < nunclust; ++j) + { + rval = val_A_B (arr, n, unclust[i], unclust[j]); + if (sim == 1) + { + if (rval > U1val) + { + U1val = rval; + i1loc = unclust[i]; + j1loc = unclust[j]; + } + } + else + { + if (rval < U1val) + { + U1val = rval; + i1loc = unclust[i]; + j1loc = unclust[j]; + } + } + } + } /* ... for(i=0... */ + } /* ...if(!single_order... */ + } + if (nnotparent > 0 && nunclust > 0) + { +/* fprintf(std_err,"Option 2\n");*/ + /* we have some clusters */ + /* so compare them to the unclustered entitites */ + for (i = 0; i < nunclust; ++i) + { + for (j = 0; j < nnotparent; ++j) + { + k = notparent[j]; + rval = + val_Grp_B (arr, n, clust[k].members, clust[k].n, + unclust[i], option); + if (sim == 1) + { + if (rval >= U2val) + { + /* need >= because want biggest previous cluster */ + U2val = rval; + i2loc = unclust[i]; + j2loc = k; + } + } + else + { + if (rval <= U2val) + { + /* need <= because want biggest previous cluster */ + U2val = rval; + i2loc = unclust[i]; + j2loc = k; + } + } + } + } + } + if (nnotparent > 1) + { +/* fprintf(std_err,"Option 3\n");*/ + /* compare clusters to clusters if there are more than 1 */ + for (i = 0; i < nnotparent - 1; ++i) + { + k = notparent[i]; + for (j = (i + 1); j < nnotparent; ++j) + { + l = notparent[j]; + rval = val_Grp_Grp (arr, n, clust[k].members, clust[k].n, + clust[l].members, clust[l].n, option); + if (sim == 1) + { + if (rval >= U3val) + { + U3val = rval; + i3loc = k; + j3loc = l; + } /* if(rval >= U3val... */ + } + else + { + if (rval <= U3val) + { + U3val = rval; + i3loc = k; + j3loc = l; + } /* if(rval >= U3val... */ + } + } /* if j=(i+1) ... */ + } /* for(i=0... */ + } + /* find which search gave the biggest (or smallest) value */ + if ((sim == 1 && U3val >= U1val && U3val >= U2val) || + (sim == 2 && U3val <= U1val && U3val <= U2val)) + { + /* joining of two clusters */ + clust[nclust].score = U3val; + clust[nclust].n = clust[i3loc].n + clust[j3loc].n; + clust[nclust].members = + (int *) GJmalloc (sizeof (int) * clust[nclust].n); + for (i = 0; i < clust[i3loc].n; ++i) + { + clust[nclust].members[i] = clust[i3loc].members[i]; + } + for (i = 0; i < clust[j3loc].n; ++i) + { + clust[nclust].members[i + clust[i3loc].n] = + clust[j3loc].members[i]; + } + clust[nclust].parentA = &clust[i3loc]; + clust[nclust].parentB = &clust[j3loc]; + clust[nclust].lab = 1; + remove_notparent (notparent, &nnotparent, i3loc); + sub_notparent (notparent, &nnotparent, j3loc, nclust); + } + else if ((sim == 1 && U2val >= U1val && U2val >= U3val) || + (sim == 2 && U2val <= U1val && U2val <= U3val)) + { + /* joining of a single entity to an existing cluster */ + clust[nclust].score = U2val; + clust[nclust].n = 1 + clust[j2loc].n; + clust[nclust].members = + (int *) GJmalloc (sizeof (int) * clust[nclust].n); + /* copy previous cluster list into this one - not strictly neccessary */ + /* but done to simplify cluster comparisons above */ + for (i = 1; i < clust[nclust].n; ++i) + { + clust[nclust].members[i] = clust[j2loc].members[i - 1]; + } + /* stick solitary cluster member on the beginning */ + clust[nclust].members[0] = i2loc; + /* put in parent pointers */ + clust[nclust].parentA = make_entity (i2loc, base_val); + clust[nclust].parentB = &clust[j2loc]; + clust[nclust].lab = 1; + sub_notparent (notparent, &nnotparent, j2loc, nclust); + remove_unclust (unclust, &nunclust, i2loc); + } + else if ((sim == 1 && U1val >= U2val && U1val >= U3val) || + (sim == 2 && U1val <= U2val && U1val <= U3val)) + { + /* unclustered pair were biggest */ + clust[nclust].score = U1val; + clust[nclust].n = 2; + clust[nclust].members = + (int *) GJmalloc (sizeof (int) * clust[nclust].n); + clust[nclust].members[0] = i1loc; + clust[nclust].members[1] = j1loc; + clust[nclust].parentA = make_entity (i1loc, base_val); + clust[nclust].parentB = make_entity (j1loc, base_val); + clust[nclust].lab = 1; + add_notparent (notparent, &nnotparent, nclust); + remove_unclust (unclust, &nunclust, i1loc); + remove_unclust (unclust, &nunclust, j1loc); + } + +/* fprintf(std_err,"Current cluster\n");*/ +/* show_entity(&clust[nclust],std_err);*/ + + if (timeclus) + { + fprintf (std_err, "Cluster: %d DONE! ", nclust); + end_time = clock (); + fprintf (std_err, "CPU time: %f seconds\n", + ((float) (end_time - start_time)) / CLOCKS_PER_SEC); + start_time = end_time; + } + + ++nclust; + } + /* now we should have all clusters stored in the clust array */ + /* set up the inx array */ + inx = (int *) GJmalloc (sizeof (int) * n); + for (i = 0; i < clust[nclust - 1].n; ++i) + { + inx[clust[nclust - 1].members[i]] = i; + } + + if (do_cut) + { + /* only output the biggest clusters that are above the cutoff */ + if (sim == 1) + { + for (i = (nclust - 1); i >= 0; --i) + { + if (clust[i].lab == 1) + { + /* only output clusters that aren't already output as part of something bigger */ + if (clust[i].score >= cut) + { + fprintf (std_out, "## %d %g %d\n", i, clust[i].score, + clust[i].n); + show_id_entity (&clust[i], idents, std_out); + mark_parents (&clust[i]); + } + } + } + } + else + { + for (i = (nclust - 1); i >= 0; --i) + { + if (clust[i].lab == 1) + { + /* only output clusters that aren't already output as part of something bigger */ + if (clust[i].score <= cut) + { + fprintf (std_out, "## %d %g %d\n", i, clust[i].score, + clust[i].n); + show_id_entity (&clust[i], idents, std_out); + mark_parents (&clust[i]); + } + } + } + } + /* now output the unclustered entities */ + fprintf (std_out, "\nUNCLUSTERED ENTITIES\n"); + write_unclustered (&clust[nclust - 1], idents, std_out); + + } + else + { + /* output all clusters from smallest to largest */ + if (amps_out) + { +/* fprintf(std_err,"amps_out not implemented yet\n");*/ + if (!single_order) + { + fprintf (std_err, + "Opening AMPS order and tree files for writing\n"); + ford = GJfopen (amps_order_file, "w", 1); + ftree = GJfopen (amps_tree_file, "w", 1); + } + else + { + fprintf (std_err, "Opening AMPS order file for writing\n"); + ford = GJfopen (amps_order_file, "w", 1); + } + /* output order file */ + if (single_order) + { + /* order file must be reversed */ + for (i = nclust; i > -1; --i) + { + fprintf (ford, "%11d%20.2f%10d\n", + clust[nclust - 1].members[i] + 1, 0.0, 0); + } + } + else + { + /* order file for use with the tree */ + for (i = 0; i < (nclust + 1); ++i) + { + fprintf (ford, "%11d%20.2f%10d\n", + clust[nclust - 1].members[i] + 1, 0.0, 0); + } + } + GJfclose (ford, 1); + if (!single_order) + { + /* output the tree_file */ + for (i = 0; i < nclust; ++i) + { + print_amps_cluster (clust[i].parentA, inx, ftree); + print_amps_cluster (clust[i].parentB, inx, ftree); + } + GJfclose (ftree, 1); + } + } + else + { + for (i = 0; i < nclust; ++i) + { + fprintf (std_out, "## %d %g %d\n", i, clust[i].score, + clust[i].n); + if (showid) + { + show_id_entity (&clust[i], idents, std_out); + } + else + { + show_entity (&clust[i], std_out); + } + + /* show_inx_entity(&clust[i],inx,std_out); */ + + /* fprintf(std_out,"Parents:\n"); + show_entity(clust[i].parentA,std_out); + show_entity(clust[i].parentB,std_out); + */ + } + } + } + + if (ps == 1) + { + fp = GJfopen (postfile, "w", 1); + draw_dendrogram (fp, clust, idents, inx, nclust, sim); + } + + final_time = clock (); + fprintf (std_err, "Total CPU time: %f seconds\n", + (float) (final_time - initial_time) / CLOCKS_PER_SEC); + exit (EXIT_SUCCESS); + +} + +/********************************************************************* +up_diag_log: take logs for the upper diagonal structure +-------------------------------------------------------------------*/ + +void +up_diag_log (double **arr, int n) +{ + int i, j; + + for (i = 0; i < (n - 1); ++i) + { + for (j = 0; j < (n - i - 1); ++j) + { + arr[i][j] = log (arr[i][j]); + } + } +} + +/********************************************************************** +write_up_diag: +writes out n, the number of entities, followed by +n*(n-1)/2 numbers that are the pair distances or similarities between +the n entities. + +-----------------------------------------------------------------------*/ +void +write_up_diag (FILE * infile, double **ret_val, int n) +{ + int i, j; + + fprintf (infile, "Number of Entities: %d\n", n); + for (i = 0; i < (n - 1); ++i) + { + for (j = 0; j < (n - i - 1); ++j) + { + fprintf (infile, " %f", ret_val[i][j]); + } + fprintf (infile, "\n"); + } +} + +/********************************************************************** +show_entity: + +print out the score, number of members and the members for +the entity +----------------------------------------------------------------------*/ +void +show_entity (struct sc *entity, FILE * out) +{ + int i; + + if (entity == NULL) + { + fprintf (out, "No entity\n"); + return; + } + + fprintf (out, "Entity Score: %g Number of members: %d\n", + entity->score, entity->n); + + for (i = 0; i < entity->n; ++i) + { + fprintf (out, " %d", entity->members[i]); + } + fprintf (out, "\n"); +} + +/********************************************************************** +show_id_entity: + +print out the score, number of members and the members for +the entity using the id codes rather than numerical positions +----------------------------------------------------------------------*/ +void +show_id_entity (struct sc *entity, char **id, FILE * out) +{ + int i; + + if (entity == NULL) + { + fprintf (out, "No entity\n"); + return; + } + +/* fprintf(out,"Entity Score: %f Number of members: %d\n", + entity->score,entity->n); +*/ + + for (i = 0; i < entity->n; ++i) + { + fprintf (out, " %s", id[entity->members[i]]); + } + fprintf (out, "\n"); +} + +/********************************************************************** +show_inx_entity: + +print out the score, number of members and the members for +the entity - followed by the index for the entity +----------------------------------------------------------------------*/ +void +show_inx_entity (struct sc *entity, int *inx, FILE * out) +{ + int i; + + if (entity == NULL) + { + fprintf (out, "No entity\n"); + return; + } + + fprintf (out, "Entity Score: %g Number of members: %d\n", + entity->score, entity->n); + for (i = 0; i < entity->n; ++i) + { + fprintf (out, " %d", entity->members[i]); + } + fprintf (out, "\n"); + for (i = 0; i < entity->n; ++i) + { + fprintf (out, " %d", inx[entity->members[i]]); + } + fprintf (out, "\n"); +} + +/********************************************************************** +print_amps_cluster: + +print out the cluster in AMPS tree file format. +ie 1x,20i5 Fortran format. + +inx is necessary to so that the identifying codes are re-ordered +appropriately for AMPS + +Need +1 cos AMPS uses sequence numbering from 1...N not 0...N-1 + +----------------------------------------------------------------------*/ +void +print_amps_cluster (struct sc *entity, int *inx, FILE * fp) +{ + int i; + int j; + + fprintf (fp, " %5d\n", entity->n); + + j = 0; + for (i = 0; i < entity->n; ++i) + { + if (j == 20) + { + j = 0; + } + if (j == 0) + { + fprintf (fp, " %5d", inx[entity->members[i]] + 1); + } + else + { + fprintf (fp, "%5d", inx[entity->members[i]] + 1); + } + if (j == 19) + { + fprintf (fp, "\n"); + } + ++j; + } + /* change from < to <= to avoid blunders at end of line */ + if (j <= 19) + fprintf (fp, "\n"); +} + +/****************************************************************** +make_entity: return pointer to malloc'd space for a struct sc + structure for a single entity. i. + base_val: The value form minimum distance, or maximum similarity. +-------------------------------------------------------------------*/ +struct sc * +make_entity (int i, double base_val) +{ + struct sc *ret_val; + ret_val = (struct sc *) GJmalloc (sizeof (struct sc)); + ret_val->score = base_val; + ret_val->n = 1; + ret_val->members = (int *) GJmalloc (sizeof (int)); + ret_val->members[0] = i; + ret_val->parentA = NULL; + ret_val->parentB = NULL; + ret_val->lab = 1; + return ret_val; +} + +/****************************************************************** +val_A_B: Given the upper diagonal array, return the value stored + in the notional arr[i][j]. + Where 0 <= i < n-1 + and i < j < n + +Note: This could/should be turned into a macro - without the bounds + check. +------------------------------------------------------------------ + +double val_A_B( + double **arr, + int n, + int i, + int j) + +{ + extern FILE *std_err; + if(!(i >= 0 && i < n-1 && i < j && j < n)){ + GJerror("Invalid request in val_A_B"); + fprintf(std_err,"i,j,n %d %d %d \n",i,j,n); + exit(-1); + } + + return arr[i][j-i-1]; +} +*/ +/******************************************************************* +val_Grp_B: Given upper diagonal array, + find min, max, or mean of comparison between the + entities in grp and j + choice = 0 = find min value of grp-grp comparison + 1 = find max ... + 2 = find mean ... + +Note: there is some duplication in the code to avoid testing choice + on every step in the loop +--------------------------------------------------------------------*/ +double +val_Grp_B (double **arr, /* upper diagonal array */ + int n, /* side of array */ + int *grpA, /* array containing entities of group */ + int ng, /* number of members in grpA */ + int j, /* entity to compare to grpA */ + int choice /* see above */ + ) +{ + int k; + double val; + double rval; + + if (choice == 0) + { + /* min case */ + val = VERY_BIG; + for (k = 0; k < ng; ++k) + { + if (grpA[k] < j) + { + rval = val_A_B (arr, n, grpA[k], j); + } + else + { + rval = val_A_B (arr, n, j, grpA[k]); + } + if (rval < val) + val = rval; + } + } + else if (choice == 1) + { + val = VERY_SMALL; + for (k = 0; k < ng; ++k) + { + if (grpA[k] < j) + { + rval = val_A_B (arr, n, grpA[k], j); + } + else + { + rval = val_A_B (arr, n, j, grpA[k]); + } + if (rval > val) + val = rval; + } + } + else if (choice == 2) + { + val = 0.0; + for (k = 0; k < ng; ++k) + { + if (grpA[k] < j) + { + rval = val_A_B (arr, n, grpA[k], j); + } + else + { + rval = val_A_B (arr, n, j, grpA[k]); + } + val += rval; + } + val /= ng; + } + return val; +} + +/******************************************************************* +val_Grp_Grp: Given upper diagonal array, + find min, max, or mean of comparison between the + entities in grpA and grpB + choice = 0 = find min value of grp-grp comparison + 1 = find max ... + 2 = find mean ... + +Note: there is some duplication in the code to avoid testing choice + on every step in the loop +--------------------------------------------------------------------*/ +double +val_Grp_Grp (double **arr, /* upper diagonal array */ + int n, /* side of array */ + int *grpA, /* array containing entities of group */ + int ngA, /* number of members in grpA */ + int *grpB, /* array containing entities of group B */ + int ngB, /* number of members in grpB */ + int choice /* see above */ + ) +{ + int k, l; + double val; + double rval; + + if (choice == 0) + { + /* min case */ + val = VERY_BIG; + for (k = 0; k < ngA; ++k) + { + for (l = 0; l < ngB; ++l) + { + if (grpA[k] < grpB[l]) + { + rval = val_A_B (arr, n, grpA[k], grpB[l]); + } + else + { + rval = val_A_B (arr, n, grpB[l], grpA[k]); + } + if (rval < val) + val = rval; + } + } + } + else if (choice == 1) + { + val = VERY_SMALL; + for (k = 0; k < ngA; ++k) + { + for (l = 0; l < ngB; ++l) + { + if (grpA[k] < grpB[l]) + { + rval = val_A_B (arr, n, grpA[k], grpB[l]); + } + else + { + rval = val_A_B (arr, n, grpB[l], grpA[k]); + } + if (rval > val) + val = rval; + } + } + } + else if (choice == 2) + { + val = 0.0; + for (k = 0; k < ngA; ++k) + { + for (l = 0; l < ngB; ++l) + { + if (grpA[k] < grpB[l]) + { + rval = val_A_B (arr, n, grpA[k], grpB[l]); + } + else + { + rval = val_A_B (arr, n, grpB[l], grpA[k]); + } + val += rval; + } + } + val /= (ngA * ngB); + } + return val; +} + +/***************************************************************** +remove_unclust: + +Take the array containing integers showing members that have +yet to be clustered and remove an element. nunclust is returned +reduced by one. +-----------------------------------------------------------------*/ +void +remove_unclust (int *unclust, int *nunclust, int val) +{ + int i, j, found; + i = 0; + found = 0; + + while (i < *nunclust) + { + if (unclust[i] == val) + { + ++i; + found = 1; + } + if (found) + { + j = i - 1; + } + else + { + j = i; + } + unclust[j] = unclust[i]; + ++i; + } + --(*nunclust); +} + +void +iprintarr (int *arr, int n, FILE * out) +{ + int i; + for (i = 0; i < n; ++i) + { + fprintf (out, " %d", arr[i]); + } + fprintf (out, "\n"); +} + +/****************************************************************** +no_share: return 1 if grpA and grpB have no common members + 0 if there are common members +----------------------------------------------------------------*/ + +int +no_share (int *grpA, int ngA, int *grpB, int ngB) +{ + int i, j; + + for (i = 0; i < ngA; ++i) + { + for (j = 0; j < ngB; ++j) + { + if (grpA[i] == grpB[j]) + return 0; + } + } + return 1; +} + +/***************************************************************** +sub_notparent: given the notparent list substitutes A by B +-----------------------------------------------------------------*/ +void +sub_notparent (int *np, int *n, int A, int B) +{ + int i; + for (i = 0; i < *n; ++i) + { + if (np[i] == A) + { + np[i] = B; + return; + } + } +} + +/***************************************************************** +remove_notparent: given the notparent list removes A from the list + +-----------------------------------------------------------------*/ +void +remove_notparent (int *np, int *n, int A) +{ + remove_unclust (np, n, A); +} + +/***************************************************************** +add_notparent: given the notparent list adds A to the list + +-----------------------------------------------------------------*/ +void +add_notparent (int *np, int *n, int A) +{ + np[*n] = A; + ++(*n); +} + +/************************************************************************** +Return a pointer to space for an upper diagonal square array stored rowwise +G. J. B. 16 May 1993 + +double precision array. +side = n; +--------------------------------------------------------------------------*/ +double ** +GJDudarr (int n) +{ + double **ret_val; + int i, nm1; + + nm1 = n - 1; + ret_val = (double **) GJmalloc (sizeof (double *) * nm1); + + + for (i = 0; i < nm1; ++i) + { + ret_val[i] = (double *) GJmalloc (sizeof (double) * (nm1 - i)); + } + + return ret_val; +} + +/*************************************************************************** +draw_dendrogram: + +Output the dendrogram to fout using PostScript commands + +---------------------------------------------------------------------------*/ +void +draw_dendrogram (FILE * fout, + struct sc *clust, char **idents, int *inx, int n, int sim) +{ + extern double base_val; + extern float MINside, MAXside; + extern float WTreeFraction, HTreeFraction, TreeExtendFac; + + float Twidth, Theight; /* x and y limits for the tree part of the plot */ + float Xoffset, Yoffset; /* x and y offset (points) */ + + int i; + double x1, x2, x3, y1, y2; + double Xrange, Yrange, Xmin, Xmax, Ymin, Ymax; /*,TXmax; */ +/* double Xscale,Yscale;*/ + double Xid, Yid; + + char *buff; + + buff = GJstrcreate (MAX_ID_LEN, NULL); + + Xoffset = X_OFFSET; + Yoffset = Y_OFFSET; + + Twidth = (MINside * WTreeFraction) - Xoffset; + Theight = (MAXside * HTreeFraction) - Yoffset; + + Xmin = VERY_BIG; + Xmax = VERY_SMALL; + Ymin = VERY_BIG; + Ymax = VERY_SMALL; + + /* find the min and max in X excluding the base_val */ + for (i = 0; i < n; ++i) + { + if (sim == 1) + { + if (clust[i].score < Xmin) + Xmin = clust[i].score; + if (clust[i].score != base_val && clust[i].score > Xmax) + Xmax = clust[i].score; + } + else + { + if (clust[i].score != base_val && clust[i].score < Xmin) + Xmin = clust[i].score; + if (clust[i].score > Xmax) + Xmax = clust[i].score; + } + } + /* set the max/min value slightly bigger/smaller than the actual max/min */ + + if (sim == 1) + Xmax *= (1.0 + TreeExtendFac); + if (sim == 2) + Xmin *= (1.0 - TreeExtendFac); + Xrange = Xmax - Xmin; + Ymin = 0.0; + Ymax = n; + Yrange = (double) Ymax - Ymin; + + /* write out the preamble */ + + fprintf (fout, "%%!PS-Adobe-2.0 EPSF-2.0\n"); + fprintf (fout, "%%%%Title: oc output file\n"); + fprintf (fout, + "%%%%Creator: oc by G. J. Barton: http://www.compbio.dundee.ac.uk\n"); + /* fprintf(fout,"%%%%Pages: 1\n"); */ + fprintf (fout, "%%%%DocumentFonts: Times-Roman\n"); + fprintf (fout, "%%%%BoundingBox: %d %d %d %d\n", (int) Xoffset, + (int) Yoffset, (int) MINside, (int) MAXside); + fprintf (fout, "%%%%EndComments\n"); + + fprintf (fout, "/Times-Roman findfont 12 scalefont setfont\n"); + + /* get the X position for id codes then write them out */ + if (sim == 1) + { + Xid = Xmax + (Xrange * IdOffsetFactor); + } + else + { + Xid = Xmin - (Xrange * IdOffsetFactor); + } + Xid = trans_x (Xid, sim, Xmin, Xmax, Xrange, Twidth, Xoffset); + + for (i = 0; i < (n + 1); ++i) + { + Yid = trans_y ((double) i, Ymin, Ymax, Yrange, Theight, Yoffset); + PSPutText ((float) Xid, (float) Yid, idents[clust[n - 1].members[i]], + fout); + } + + /* write min and max scores below the plot */ + x1 = Xmin; + x2 = Xmax; + y1 = Ymin - (Yrange * NumberOffsetFactor); + x1 = trans_x (x1, sim, Xmin, Xmax, Xrange, Twidth, Xoffset); + x2 = trans_x (x2, sim, Xmin, Xmax, Xrange, Twidth, Xoffset); + y1 = trans_y (y1, Ymin, Ymax, Yrange, Theight, Yoffset); + sprintf (buff, "%g", Xmin); + PSPutText ((float) x1, (float) y1, buff, fout); + sprintf (buff, "%g", Xmax); + PSPutText ((float) x2, (float) y1, buff, fout); + + for (i = 0; i < n; ++i) + { + float bob = 1. / 0.; + + x1 = clust[i].parentA->score; + x2 = clust[i].score; + x3 = clust[i].parentB->score; + if (x1 == base_val && sim == 1) + x1 = Xmax; + if (x1 == base_val && sim == 2) + x1 = Xmin; + if (x3 == base_val && sim == 1) + x3 = Xmax; + if (x3 == base_val && sim == 2) + x3 = Xmin; + y1 = get_mean (clust[i].parentA[0].members, inx, clust[i].parentA[0].n); + y2 = get_mean (clust[i].parentB[0].members, inx, clust[i].parentB[0].n); + /* transform x and y to fit on the page, then */ + x1 = trans_x (x1, sim, Xmin, Xmax, Xrange, Twidth, Xoffset); + x2 = trans_x (x2, sim, Xmin, Xmax, Xrange, Twidth, Xoffset); + + /* test for inf */ + if (bob == x2) + { + fprintf (stderr, "%e, %e, %e, %e, %e, %e, %e\n", x1, (double)sim, Xmin, + Xmax, Xrange, Twidth, Xoffset); + exit (EXIT_FAILURE); + } + + x3 = trans_x (x3, sim, Xmin, Xmax, Xrange, Twidth, Xoffset); + y1 = trans_y (y1, Ymin, Ymax, Yrange, Theight, Yoffset); + y2 = trans_y (y2, Ymin, Ymax, Yrange, Theight, Yoffset); + + draw_arch (fout, (float) x1, (float) x2, (float) x3, (float) y1, + (float) y2); + } + fprintf (fout, "showpage\n"); +} + +double +get_mean (int *arr, int *inx, int n) +{ + double sum; + int i; + sum = 0.0; + for (i = 0; i < n; ++i) + { + sum += (double) inx[arr[i]]; + } + return sum / n; +} + +void +draw_arch (FILE * fout, float x1, float x2, float x3, float y1, float y2) +{ + fprintf (fout, "%f %f moveto\n", x1, y1); + fprintf (fout, "%f %f lineto\n", x2, y1); + fprintf (fout, "%f %f lineto\n", x2, y2); + fprintf (fout, "%f %f lineto\n", x3, y2); + fprintf (fout, "stroke newpath\n"); +} + +double +trans_x (double X, + int sim, + double Xmin, double Xmax, double Xrange, float Twidth, float Xoffset) +{ + if (sim == 1) + { + return (((X - Xmin) / Xrange) * (double) Twidth) + Xoffset; + } + else + { + return (((Xmax - X) / Xrange) * (double) Twidth) + Xoffset; + } +} + +double +trans_y (double X, + double Xmin, double Xmax, double Xrange, float Twidth, float Xoffset) +{ + return (((X - Xmin) / Xrange) * (double) Twidth) + Xoffset; +} + +void +PSPutText (float x, float y, char *text, FILE * outf) +{ + fprintf (outf, " %f %f moveto (%s) show \n", x, y, text); +} + +/********************************************************************** +mark_parents: This works back up the tree setting the entity->lab +values to 0 for all parents. +Hopefully the recursion won't run out of space for the size of problem +we typically deal with... +----------------------------------------------------------------------*/ + +void +mark_parents (struct sc *entity) +{ + entity->lab = 0; + if (entity->parentA != NULL) + { + mark_parents (entity->parentA); + } + if (entity->parentB != NULL) + { + mark_parents (entity->parentB); + } +} + +/********************************************************************* +write_unclustered: + +Work back down the tree and print out the identifiers of entitites that +are not clustered into any group. If mark_parents has not been called, +then all entities will be output. Otherwise, only those that have +not been marked will be output. +--------------------------------------------------------------------*/ + +void +write_unclustered (struct sc *entity, char **idents, FILE * fout) +{ + if (entity->lab == 1) + { + /* this could be an unclustered entity or a cluster + containing unclustered entities */ + if (entity->parentA == NULL && entity->parentB == NULL) + { + fprintf (fout, "%s\n", idents[entity->members[0]]); + return; + } + /* we don't need both if's since any entitity with one NULL parent + must also have the other parent NULL + */ + if (entity->parentA != NULL) + { + write_unclustered (entity->parentA, idents, fout); + } + if (entity->parentB != NULL) + { + write_unclustered (entity->parentB, idents, fout); + } + } +} + +/********************************************************************* + * OC FILE READING FUNCTIONS +--------------------------------------------------------------------*/ + + +/********************************************************************* +sys_fatal_error: + +Pass a string to the function, prints it and the system error, then exits with +failure. +--------------------------------------------------------------------*/ + +void +sys_fatal_error (const char *error) +{ + perror (error); + exit (EXIT_FAILURE); +} + +/********************************************************************* +getline: + +Returns a null terminated string representing a line, from a file pointer. Or, +if the file is empty, NULL. +--------------------------------------------------------------------*/ + +#define BUFFER 80 /* Size of buffer for reading in a line */ + +char * +getline_loc (FILE * file) +{ + int c, i, j; + char *line; + i = j = 0; + + if ((line = malloc (sizeof (char) * BUFFER)) == NULL) + sys_fatal_error ("malloc() failed\n"); + + while ((c = getc (file)) != EOF) + { + + if (i + 1 % BUFFER == 0) + { + if ((line = realloc (line, sizeof (char) * BUFFER * ++j)) == NULL) + sys_fatal_error ("realloc() failed\n"); + } + + if (c == (int) '\n') + break; + + line[i++] = (char) c; + } + + if (i == 0) + return NULL; + + if ((line = realloc (line, sizeof (char) * i)) == NULL) + sys_fatal_error ("realloc() failed\n"); + + line[i] = '\0'; + + return line; +} + +/********************************************************************** +read_num: +Reads in the number of sequences that are found in the OC file. + +Returns an int + +-----------------------------------------------------------------------*/ +int +read_num (FILE * infile) +{ + int n; + char *line; + + + /* regex_t cmpregex; + const char pattern[] = "^ *[0-9]\\+ *$"; + */ + + if ((line = getline_loc (infile)) == NULL) + { + fprintf (stderr, "File empty.\n"); + exit (EXIT_FAILURE); + } + + /* comment this stuff out as it fails on SunOS + + if (regcomp(&cmpregex, pattern, REG_NOSUB) != 0) { + fprintf(stderr, "Regex compilation failed! Check source code for error.\n"); + exit(EXIT_FAILURE); + } + + if (regexec(&cmpregex, line, 0, NULL, 0) == REG_NOMATCH) { + fprintf(stderr, "File doesn't contain integer number of entries on first line. Line is:\n%s\n", line); + exit(EXIT_FAILURE); + } + */ + + n = atoi (line); + + /* regfree(&cmpregex); */ + free (line); + return n; +} + +/********************************************************************** +read_idents: +reads n identifiers from infile + +returns a pointer to an array of identifiers +-----------------------------------------------------------------------*/ + +char ** +read_idents (FILE * infile, int n) +{ + char **ret_val; + int i; + int error; + char *buff; + + buff = GJstrcreate (MAX_ID_LEN, NULL); + ret_val = (char **) GJmalloc (sizeof (char *) * n); + + for (i = 0; i < n; ++i) + { + error = fscanf (infile, "%s", buff); + if (error == EOF) + { + fprintf (stderr, "Premature end of file reached in identifiers\n"); + exit (EXIT_FAILURE); + } + ret_val[i] = GJstrdup (buff); + } + return ret_val; +} + +/********************************************************************** +read_up_diag: +reads a file containing n, the number of entities, followed by +n*(n-1)/2 numbers that are the pair distances or similarities between +the n entities. + +Returns an upper diagonal type pointer to pointer array + +-----------------------------------------------------------------------*/ +double ** +read_up_diag (FILE * infile, int n) +{ + double **ret_val; + int error; + int i, j; + + ret_val = (double **) GJDudarr (n); + for (i = 0; i < (n - 1); ++i) + { + for (j = 0; j < (n - i - 1); ++j) + { + error = fscanf (infile, "%lf", &ret_val[i][j]); + if (error == EOF) + { + fprintf (stderr, + "Premature end of file reached in upper diagonal\n"); + exit (EXIT_FAILURE); + } + } + } + return ret_val; +}