11 #include <sys/types.h>
17 /*#define VERY_SMALL DBL_MIN*/
18 #define VERY_SMALL -DBL_MAX
19 #define VERY_BIG DBL_MAX
22 /* Postscript plotting constants */
29 float MAXside = 11.0 * POINT; /* Max dimensions of paper */
30 float MINside = 8.0 * POINT;
31 float WTreeFraction = 0.9; /* Fraction of max taken up by Tree */
32 float HTreeFraction = 0.9; /* (Calculated after subtracting offsets) */
33 float TreeExtendFac = 0.05; /* Extension of tree beyond min/max value factor */
34 double base_val = 1000.0;
35 float IdOffsetFactor = 0.01;
36 float NumberOffsetFactor = 0.05;
37 float NumberFontSize = 12;
38 float IdFontSize = 12;
39 float TitleFontSize = 12;
41 /****************************************************************************
44 General purpose cluster analysis program. Written with flexibility
45 in mind rather than all out efficiency.
47 Copyright(c) G. J. Barton (1993,1997,2002)
50 School of Life Sciences
56 email:geoff@compbio.dundee.ac.uk
90 11/8/93: Add single order option. This forces the clustering to only add
91 a single sequence at a time to the growing cluster. All other options are
92 equivalent to the full tree method. GJB.
94 15/12/93: Correct bug in definition of VERY_SMALL ie now = -DBL_MAX. This
95 allows the complete and means linkage to work with negative numbers.
97 04/08/2002:Update affiliation and also add gjnoc2 routine to the distribution.
98 ----------------------------------------------------------------------------*/
101 main (int argc, char *argv[])
103 extern FILE *std_in, *std_out, *std_err;
104 extern double base_val;
112 double U1val, U2val, U3val, rval;
114 double **arr; /* upper diagonal array structure */
115 int *unclust; /* list of unclustered entities */
116 int nunclust; /* number of unclustered entities */
117 char **idents; /* array of idents for the entities */
118 int *notparent; /* list of clusters that have no children */
119 int nnotparent; /* number of clusters that have no children */
120 int *inx; /* array of indexes - inx[i] is the re-ordered position */
121 char *postfile; /* name of PostScript output file */
123 /* TIME INFORMATION */
124 clock_t start_time, end_time, initial_time, final_time;
127 int sim; /* similarity scores =1, distances = 2 */
128 int type; /* single linkage = 1, complete linkage = 2, means = 3 */
129 int option; /* use to feed the Grp_Grp function */
131 int dolog; /* flag to take logs on output to .ps file */
135 int amps_out; /* output tree file and order file for amps */
136 int timeclus; /* set to 1 if timing output to std_err required */
140 char *amps_order_file;
141 char *amps_tree_file;
150 ps = 0; /* default is no PostScript output */
158 start_time = clock ();
159 initial_time = start_time;
163 for (i = 1; i < argc; ++i)
165 if (strcmp (argv[i], "sim") == 0)
170 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
172 else if (strcmp (argv[i], "dis") == 0)
175 base_val = VERY_SMALL;
177 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
179 else if (strcmp (argv[i], "single") == 0)
182 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
184 else if (strcmp (argv[i], "complete") == 0)
187 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
189 else if (strcmp (argv[i], "means") == 0)
192 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
194 else if (strcmp (argv[i], "order") == 0)
196 /* do a single order clustering rather than a full tree */
198 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
200 else if (strcmp (argv[i], "ps") == 0
201 || strcmp (argv[i], "eps") == 0)
205 postfile = GJcat (2, argv[i], ".eps");
206 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
208 else if (strcmp (argv[i], "log") == 0)
211 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
213 else if (strcmp (argv[i], "cut") == 0)
215 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
217 cut = atof (argv[i]);
219 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
221 else if (strcmp (argv[i], "amps") == 0)
223 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
228 amps_order_file = GJcat (2, argv[i], ".ord");
232 amps_tree_file = GJcat (2, argv[i], ".tree");
233 amps_order_file = GJcat (2, argv[i], ".tord");
235 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
237 else if (strcmp (argv[i], "id") == 0)
239 showid = 1; /* output idents rather than index */
240 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
242 else if (strcmp (argv[i], "timeclus") == 0)
244 timeclus = 1; /* output idents rather than index */
245 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
247 else if (strcmp (argv[i], "maxside") == 0)
249 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
251 MAXside = atof (argv[i]) * POINT;
252 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
254 else if (strcmp (argv[i], "minside") == 0)
256 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
258 MINside = atof (argv[i]) * POINT;
259 /* fprintf(std_err,"Option: argv[%d]=%s\n",i,argv[i]); */
263 fprintf (std_err, "Unrecognised Option: argv[%d]=%s\n", i, argv[i]);
269 fprintf (std_err, "Cluster analysis program\n\n");
270 fprintf (std_err, "Author: G. J. Barton, University of Dundee, UK\n");
271 fprintf (std_err, " http://www.compbio.dundee.ac.uk\n");
272 fprintf (std_err, "Copyright: G. J. Barton (1993,1997,2002,2004)\n\n");
273 fprintf (std_err, "Please cite: OC a cluster analysis program, Barton, G. J. 1993\n\n");
274 fprintf (std_err, "Usage: oc <sim/dis> <single/complete/means> <eps file> <cut N>\n\n");
275 fprintf (std_err, "Version 2.1a (Apr. 2005) - Requires a file to be piped to standard input\n");
276 fprintf (std_err, "Format: Line 1: Number (N) of entities to cluster (e.g. 10)\n");
277 fprintf (std_err, "Format: Lines 2 to 2+N-1: Identifier codes for the entities (e.g. Entity1)\n");
278 fprintf (std_err, "Format: N*(N-1)/2: Distances, or similarities - ie the upper diagonal\n\n");
279 fprintf (std_err, "Options:\n");
280 fprintf (std_err, "sim = similarity / dis = distances\n");
281 fprintf (std_err, "method = single/complete/means\n");
282 fprintf (std_err, "ps <file> = plot out portrait dendrogram to <file.eps> \n");
283 fprintf (std_err, "eps <file> = plot out portrait dendrogram to <file.eps> \n");
284 fprintf (std_err, "minside <dim> = define length of short (x) side of plot in inches \n");
285 fprintf (std_err, "maxside <dim> = define length of long (y) side of plot in inches \n");
286 fprintf (std_err, "log = take logs before calculation \n");
287 fprintf (std_err, "cut = only show clusters above/below the cutoff\n");
288 fprintf (std_err, "id = output identifier codes rather than indexes for entities\n");
289 fprintf (std_err, "timeclus = output times to generate each cluster\n");
290 fprintf (std_err, "amps <file> = produce amps <file>.tree and <file>.tord files\n\n");
291 fprintf (std_err, "See oc_manual.txt file for brief instructions\n");
296 /* Read in the OC file */
297 /* allocate and read in the upper diagonal array */
298 fprintf (std_err, "Reading Upper Diagonal\n");
299 n = read_num (std_in);
300 idents = read_idents (std_in, n);
301 arr = read_up_diag (std_in, n);
304 up_diag_log (arr, n);
307 /* write_up_diag(std_out,arr,n);*/
308 fprintf (std_err, "Read: %d Entries\n", n);
309 /* File has been read in */
312 fprintf (std_err, "CPU time: %f seconds\n",
313 ((float) (end_time - start_time)) / CLOCKS_PER_SEC);
314 start_time = end_time;
316 /* holds index of unclustered entities */
318 fprintf (std_err, "Setting up unclust\n");
319 unclust = (int *) GJmalloc (sizeof (int) * n);
320 for (i = 0; i < n; ++i)
326 fprintf (std_err, "Setting up notparent\n");
327 notparent = (int *) GJmalloc (sizeof (int) * n);
328 for (i = 0; i < n; ++i)
336 /* set up the clust array to hold results */
337 /* there are always n-1 clusters formed - we will also store the nth (all entities) */
338 fprintf (std_err, "Setting up clust\n");
339 clust = (struct sc *) GJmalloc (sizeof (struct sc) * n);
341 fprintf (std_err, "CPU time: %f seconds\n",
342 ((float) (end_time - start_time)) / CLOCKS_PER_SEC);
343 start_time = end_time;
353 fprintf (std_err, "Single linkage on similarity\n");
358 fprintf (std_err, "Complete linkage on similarity\n");
363 fprintf (std_err, "Means linkage on similarity\n");
371 fprintf (std_err, "Single linkage on distance\n");
376 fprintf (std_err, "Complete linkage on distance\n");
381 fprintf (std_err, "Means linkage on distance\n");
386 /* implement max similarity single linkage first */
388 fprintf (std_err, "Doing Cluster Analysis...\n");
389 while (nclust < (n - 1))
405 /* fprintf(std_err,"Option 1\n");*/
406 if (!single_order || (single_order && nclust == 0))
408 /* we've not clustered all entities into groups */
409 /* find max of what is left */
410 for (i = 0; i < nunclust - 1; ++i)
412 for (j = (i + 1); j < nunclust; ++j)
414 rval = val_A_B (arr, n, unclust[i], unclust[j]);
434 } /* ... for(i=0... */
435 } /* ...if(!single_order... */
437 if (nnotparent > 0 && nunclust > 0)
439 /* fprintf(std_err,"Option 2\n");*/
440 /* we have some clusters */
441 /* so compare them to the unclustered entitites */
442 for (i = 0; i < nunclust; ++i)
444 for (j = 0; j < nnotparent; ++j)
448 val_Grp_B (arr, n, clust[k].members, clust[k].n,
454 /* need >= because want biggest previous cluster */
464 /* need <= because want biggest previous cluster */
475 /* fprintf(std_err,"Option 3\n");*/
476 /* compare clusters to clusters if there are more than 1 */
477 for (i = 0; i < nnotparent - 1; ++i)
480 for (j = (i + 1); j < nnotparent; ++j)
483 rval = val_Grp_Grp (arr, n, clust[k].members, clust[k].n,
484 clust[l].members, clust[l].n, option);
492 } /* if(rval >= U3val... */
501 } /* if(rval >= U3val... */
503 } /* if j=(i+1) ... */
506 /* find which search gave the biggest (or smallest) value */
507 if ((sim == 1 && U3val >= U1val && U3val >= U2val) ||
508 (sim == 2 && U3val <= U1val && U3val <= U2val))
510 /* joining of two clusters */
511 clust[nclust].score = U3val;
512 clust[nclust].n = clust[i3loc].n + clust[j3loc].n;
513 clust[nclust].members =
514 (int *) GJmalloc (sizeof (int) * clust[nclust].n);
515 for (i = 0; i < clust[i3loc].n; ++i)
517 clust[nclust].members[i] = clust[i3loc].members[i];
519 for (i = 0; i < clust[j3loc].n; ++i)
521 clust[nclust].members[i + clust[i3loc].n] =
522 clust[j3loc].members[i];
524 clust[nclust].parentA = &clust[i3loc];
525 clust[nclust].parentB = &clust[j3loc];
526 clust[nclust].lab = 1;
527 remove_notparent (notparent, &nnotparent, i3loc);
528 sub_notparent (notparent, &nnotparent, j3loc, nclust);
530 else if ((sim == 1 && U2val >= U1val && U2val >= U3val) ||
531 (sim == 2 && U2val <= U1val && U2val <= U3val))
533 /* joining of a single entity to an existing cluster */
534 clust[nclust].score = U2val;
535 clust[nclust].n = 1 + clust[j2loc].n;
536 clust[nclust].members =
537 (int *) GJmalloc (sizeof (int) * clust[nclust].n);
538 /* copy previous cluster list into this one - not strictly neccessary */
539 /* but done to simplify cluster comparisons above */
540 for (i = 1; i < clust[nclust].n; ++i)
542 clust[nclust].members[i] = clust[j2loc].members[i - 1];
544 /* stick solitary cluster member on the beginning */
545 clust[nclust].members[0] = i2loc;
546 /* put in parent pointers */
547 clust[nclust].parentA = make_entity (i2loc, base_val);
548 clust[nclust].parentB = &clust[j2loc];
549 clust[nclust].lab = 1;
550 sub_notparent (notparent, &nnotparent, j2loc, nclust);
551 remove_unclust (unclust, &nunclust, i2loc);
553 else if ((sim == 1 && U1val >= U2val && U1val >= U3val) ||
554 (sim == 2 && U1val <= U2val && U1val <= U3val))
556 /* unclustered pair were biggest */
557 clust[nclust].score = U1val;
559 clust[nclust].members =
560 (int *) GJmalloc (sizeof (int) * clust[nclust].n);
561 clust[nclust].members[0] = i1loc;
562 clust[nclust].members[1] = j1loc;
563 clust[nclust].parentA = make_entity (i1loc, base_val);
564 clust[nclust].parentB = make_entity (j1loc, base_val);
565 clust[nclust].lab = 1;
566 add_notparent (notparent, &nnotparent, nclust);
567 remove_unclust (unclust, &nunclust, i1loc);
568 remove_unclust (unclust, &nunclust, j1loc);
571 /* fprintf(std_err,"Current cluster\n");*/
572 /* show_entity(&clust[nclust],std_err);*/
576 fprintf (std_err, "Cluster: %d DONE! ", nclust);
578 fprintf (std_err, "CPU time: %f seconds\n",
579 ((float) (end_time - start_time)) / CLOCKS_PER_SEC);
580 start_time = end_time;
585 /* now we should have all clusters stored in the clust array */
586 /* set up the inx array */
587 inx = (int *) GJmalloc (sizeof (int) * n);
588 for (i = 0; i < clust[nclust - 1].n; ++i)
590 inx[clust[nclust - 1].members[i]] = i;
595 /* only output the biggest clusters that are above the cutoff */
598 for (i = (nclust - 1); i >= 0; --i)
600 if (clust[i].lab == 1)
602 /* only output clusters that aren't already output as part of something bigger */
603 if (clust[i].score >= cut)
605 fprintf (std_out, "## %d %g %d\n", i, clust[i].score,
607 show_id_entity (&clust[i], idents, std_out);
608 mark_parents (&clust[i]);
615 for (i = (nclust - 1); i >= 0; --i)
617 if (clust[i].lab == 1)
619 /* only output clusters that aren't already output as part of something bigger */
620 if (clust[i].score <= cut)
622 fprintf (std_out, "## %d %g %d\n", i, clust[i].score,
624 show_id_entity (&clust[i], idents, std_out);
625 mark_parents (&clust[i]);
630 /* now output the unclustered entities */
631 fprintf (std_out, "\nUNCLUSTERED ENTITIES\n");
632 write_unclustered (&clust[nclust - 1], idents, std_out);
637 /* output all clusters from smallest to largest */
640 /* fprintf(std_err,"amps_out not implemented yet\n");*/
644 "Opening AMPS order and tree files for writing\n");
645 ford = GJfopen (amps_order_file, "w", 1);
646 ftree = GJfopen (amps_tree_file, "w", 1);
650 fprintf (std_err, "Opening AMPS order file for writing\n");
651 ford = GJfopen (amps_order_file, "w", 1);
653 /* output order file */
656 /* order file must be reversed */
657 for (i = nclust; i > -1; --i)
659 fprintf (ford, "%11d%20.2f%10d\n",
660 clust[nclust - 1].members[i] + 1, 0.0, 0);
665 /* order file for use with the tree */
666 for (i = 0; i < (nclust + 1); ++i)
668 fprintf (ford, "%11d%20.2f%10d\n",
669 clust[nclust - 1].members[i] + 1, 0.0, 0);
675 /* output the tree_file */
676 for (i = 0; i < nclust; ++i)
678 print_amps_cluster (clust[i].parentA, inx, ftree);
679 print_amps_cluster (clust[i].parentB, inx, ftree);
686 for (i = 0; i < nclust; ++i)
688 fprintf (std_out, "## %d %g %d\n", i, clust[i].score,
692 show_id_entity (&clust[i], idents, std_out);
696 show_entity (&clust[i], std_out);
699 /* show_inx_entity(&clust[i],inx,std_out); */
701 /* fprintf(std_out,"Parents:\n");
702 show_entity(clust[i].parentA,std_out);
703 show_entity(clust[i].parentB,std_out);
711 fp = GJfopen (postfile, "w", 1);
712 draw_dendrogram (fp, clust, idents, inx, nclust, sim);
715 final_time = clock ();
716 fprintf (std_err, "Total CPU time: %f seconds\n",
717 (float) (final_time - initial_time) / CLOCKS_PER_SEC);
722 /*********************************************************************
723 up_diag_log: take logs for the upper diagonal structure
724 -------------------------------------------------------------------*/
727 up_diag_log (double **arr, int n)
731 for (i = 0; i < (n - 1); ++i)
733 for (j = 0; j < (n - i - 1); ++j)
735 arr[i][j] = log (arr[i][j]);
740 /**********************************************************************
742 writes out n, the number of entities, followed by
743 n*(n-1)/2 numbers that are the pair distances or similarities between
746 -----------------------------------------------------------------------*/
748 write_up_diag (FILE * infile, double **ret_val, int n)
752 fprintf (infile, "Number of Entities: %d\n", n);
753 for (i = 0; i < (n - 1); ++i)
755 for (j = 0; j < (n - i - 1); ++j)
757 fprintf (infile, " %f", ret_val[i][j]);
759 fprintf (infile, "\n");
763 /**********************************************************************
766 print out the score, number of members and the members for
768 ----------------------------------------------------------------------*/
770 show_entity (struct sc *entity, FILE * out)
776 fprintf (out, "No entity\n");
780 fprintf (out, "Entity Score: %g Number of members: %d\n",
781 entity->score, entity->n);
783 for (i = 0; i < entity->n; ++i)
785 fprintf (out, " %d", entity->members[i]);
790 /**********************************************************************
793 print out the score, number of members and the members for
794 the entity using the id codes rather than numerical positions
795 ----------------------------------------------------------------------*/
797 show_id_entity (struct sc *entity, char **id, FILE * out)
803 fprintf (out, "No entity\n");
807 /* fprintf(out,"Entity Score: %f Number of members: %d\n",
808 entity->score,entity->n);
811 for (i = 0; i < entity->n; ++i)
813 fprintf (out, " %s", id[entity->members[i]]);
818 /**********************************************************************
821 print out the score, number of members and the members for
822 the entity - followed by the index for the entity
823 ----------------------------------------------------------------------*/
825 show_inx_entity (struct sc *entity, int *inx, FILE * out)
831 fprintf (out, "No entity\n");
835 fprintf (out, "Entity Score: %g Number of members: %d\n",
836 entity->score, entity->n);
837 for (i = 0; i < entity->n; ++i)
839 fprintf (out, " %d", entity->members[i]);
842 for (i = 0; i < entity->n; ++i)
844 fprintf (out, " %d", inx[entity->members[i]]);
849 /**********************************************************************
852 print out the cluster in AMPS tree file format.
853 ie 1x,20i5 Fortran format.
855 inx is necessary to so that the identifying codes are re-ordered
856 appropriately for AMPS
858 Need +1 cos AMPS uses sequence numbering from 1...N not 0...N-1
860 ----------------------------------------------------------------------*/
862 print_amps_cluster (struct sc *entity, int *inx, FILE * fp)
867 fprintf (fp, " %5d\n", entity->n);
870 for (i = 0; i < entity->n; ++i)
878 fprintf (fp, " %5d", inx[entity->members[i]] + 1);
882 fprintf (fp, "%5d", inx[entity->members[i]] + 1);
890 /* change from < to <= to avoid blunders at end of line */
895 /******************************************************************
896 make_entity: return pointer to malloc'd space for a struct sc
897 structure for a single entity. i.
898 base_val: The value form minimum distance, or maximum similarity.
899 -------------------------------------------------------------------*/
901 make_entity (int i, double base_val)
904 ret_val = (struct sc *) GJmalloc (sizeof (struct sc));
905 ret_val->score = base_val;
907 ret_val->members = (int *) GJmalloc (sizeof (int));
908 ret_val->members[0] = i;
909 ret_val->parentA = NULL;
910 ret_val->parentB = NULL;
915 /******************************************************************
916 val_A_B: Given the upper diagonal array, return the value stored
917 in the notional arr[i][j].
921 Note: This could/should be turned into a macro - without the bounds
923 ------------------------------------------------------------------
932 extern FILE *std_err;
933 if(!(i >= 0 && i < n-1 && i < j && j < n)){
934 GJerror("Invalid request in val_A_B");
935 fprintf(std_err,"i,j,n %d %d %d \n",i,j,n);
939 return arr[i][j-i-1];
942 /*******************************************************************
943 val_Grp_B: Given upper diagonal array,
944 find min, max, or mean of comparison between the
945 entities in grp and j
946 choice = 0 = find min value of grp-grp comparison
950 Note: there is some duplication in the code to avoid testing choice
951 on every step in the loop
952 --------------------------------------------------------------------*/
954 val_Grp_B (double **arr, /* upper diagonal array */
955 int n, /* side of array */
956 int *grpA, /* array containing entities of group */
957 int ng, /* number of members in grpA */
958 int j, /* entity to compare to grpA */
959 int choice /* see above */
970 for (k = 0; k < ng; ++k)
974 rval = val_A_B (arr, n, grpA[k], j);
978 rval = val_A_B (arr, n, j, grpA[k]);
984 else if (choice == 1)
987 for (k = 0; k < ng; ++k)
991 rval = val_A_B (arr, n, grpA[k], j);
995 rval = val_A_B (arr, n, j, grpA[k]);
1001 else if (choice == 2)
1004 for (k = 0; k < ng; ++k)
1008 rval = val_A_B (arr, n, grpA[k], j);
1012 rval = val_A_B (arr, n, j, grpA[k]);
1021 /*******************************************************************
1022 val_Grp_Grp: Given upper diagonal array,
1023 find min, max, or mean of comparison between the
1024 entities in grpA and grpB
1025 choice = 0 = find min value of grp-grp comparison
1029 Note: there is some duplication in the code to avoid testing choice
1030 on every step in the loop
1031 --------------------------------------------------------------------*/
1033 val_Grp_Grp (double **arr, /* upper diagonal array */
1034 int n, /* side of array */
1035 int *grpA, /* array containing entities of group */
1036 int ngA, /* number of members in grpA */
1037 int *grpB, /* array containing entities of group B */
1038 int ngB, /* number of members in grpB */
1039 int choice /* see above */
1050 for (k = 0; k < ngA; ++k)
1052 for (l = 0; l < ngB; ++l)
1054 if (grpA[k] < grpB[l])
1056 rval = val_A_B (arr, n, grpA[k], grpB[l]);
1060 rval = val_A_B (arr, n, grpB[l], grpA[k]);
1067 else if (choice == 1)
1070 for (k = 0; k < ngA; ++k)
1072 for (l = 0; l < ngB; ++l)
1074 if (grpA[k] < grpB[l])
1076 rval = val_A_B (arr, n, grpA[k], grpB[l]);
1080 rval = val_A_B (arr, n, grpB[l], grpA[k]);
1087 else if (choice == 2)
1090 for (k = 0; k < ngA; ++k)
1092 for (l = 0; l < ngB; ++l)
1094 if (grpA[k] < grpB[l])
1096 rval = val_A_B (arr, n, grpA[k], grpB[l]);
1100 rval = val_A_B (arr, n, grpB[l], grpA[k]);
1110 /*****************************************************************
1113 Take the array containing integers showing members that have
1114 yet to be clustered and remove an element. nunclust is returned
1116 -----------------------------------------------------------------*/
1118 remove_unclust (int *unclust, int *nunclust, int val)
1124 while (i < *nunclust)
1126 if (unclust[i] == val)
1139 unclust[j] = unclust[i];
1146 iprintarr (int *arr, int n, FILE * out)
1149 for (i = 0; i < n; ++i)
1151 fprintf (out, " %d", arr[i]);
1153 fprintf (out, "\n");
1156 /******************************************************************
1157 no_share: return 1 if grpA and grpB have no common members
1158 0 if there are common members
1159 ----------------------------------------------------------------*/
1162 no_share (int *grpA, int ngA, int *grpB, int ngB)
1166 for (i = 0; i < ngA; ++i)
1168 for (j = 0; j < ngB; ++j)
1170 if (grpA[i] == grpB[j])
1177 /*****************************************************************
1178 sub_notparent: given the notparent list substitutes A by B
1179 -----------------------------------------------------------------*/
1181 sub_notparent (int *np, int *n, int A, int B)
1184 for (i = 0; i < *n; ++i)
1194 /*****************************************************************
1195 remove_notparent: given the notparent list removes A from the list
1197 -----------------------------------------------------------------*/
1199 remove_notparent (int *np, int *n, int A)
1201 remove_unclust (np, n, A);
1204 /*****************************************************************
1205 add_notparent: given the notparent list adds A to the list
1207 -----------------------------------------------------------------*/
1209 add_notparent (int *np, int *n, int A)
1215 /**************************************************************************
1216 Return a pointer to space for an upper diagonal square array stored rowwise
1217 G. J. B. 16 May 1993
1219 double precision array.
1221 --------------------------------------------------------------------------*/
1229 ret_val = (double **) GJmalloc (sizeof (double *) * nm1);
1232 for (i = 0; i < nm1; ++i)
1234 ret_val[i] = (double *) GJmalloc (sizeof (double) * (nm1 - i));
1240 /***************************************************************************
1243 Output the dendrogram to fout using PostScript commands
1245 ---------------------------------------------------------------------------*/
1247 draw_dendrogram (FILE * fout,
1248 struct sc *clust, char **idents, int *inx, int n, int sim)
1250 extern double base_val;
1251 extern float MINside, MAXside;
1252 extern float WTreeFraction, HTreeFraction, TreeExtendFac;
1254 float Twidth, Theight; /* x and y limits for the tree part of the plot */
1255 float Xoffset, Yoffset; /* x and y offset (points) */
1258 double x1, x2, x3, y1, y2;
1259 double Xrange, Yrange, Xmin, Xmax, Ymin, Ymax; /*,TXmax; */
1260 /* double Xscale,Yscale;*/
1265 buff = GJstrcreate (MAX_ID_LEN, NULL);
1270 Twidth = (MINside * WTreeFraction) - Xoffset;
1271 Theight = (MAXside * HTreeFraction) - Yoffset;
1278 /* find the min and max in X excluding the base_val */
1279 for (i = 0; i < n; ++i)
1283 if (clust[i].score < Xmin)
1284 Xmin = clust[i].score;
1285 if (clust[i].score != base_val && clust[i].score > Xmax)
1286 Xmax = clust[i].score;
1290 if (clust[i].score != base_val && clust[i].score < Xmin)
1291 Xmin = clust[i].score;
1292 if (clust[i].score > Xmax)
1293 Xmax = clust[i].score;
1296 /* set the max/min value slightly bigger/smaller than the actual max/min */
1299 Xmax *= (1.0 + TreeExtendFac);
1301 Xmin *= (1.0 - TreeExtendFac);
1302 Xrange = Xmax - Xmin;
1305 Yrange = (double) Ymax - Ymin;
1307 /* write out the preamble */
1309 fprintf (fout, "%%!PS-Adobe-2.0 EPSF-2.0\n");
1310 fprintf (fout, "%%%%Title: oc output file\n");
1312 "%%%%Creator: oc by G. J. Barton: http://www.compbio.dundee.ac.uk\n");
1313 /* fprintf(fout,"%%%%Pages: 1\n"); */
1314 fprintf (fout, "%%%%DocumentFonts: Times-Roman\n");
1315 fprintf (fout, "%%%%BoundingBox: %d %d %d %d\n", (int) Xoffset,
1316 (int) Yoffset, (int) MINside, (int) MAXside);
1317 fprintf (fout, "%%%%EndComments\n");
1319 fprintf (fout, "/Times-Roman findfont 12 scalefont setfont\n");
1321 /* get the X position for id codes then write them out */
1324 Xid = Xmax + (Xrange * IdOffsetFactor);
1328 Xid = Xmin - (Xrange * IdOffsetFactor);
1330 Xid = trans_x (Xid, sim, Xmin, Xmax, Xrange, Twidth, Xoffset);
1332 for (i = 0; i < (n + 1); ++i)
1334 Yid = trans_y ((double) i, Ymin, Ymax, Yrange, Theight, Yoffset);
1335 PSPutText ((float) Xid, (float) Yid, idents[clust[n - 1].members[i]],
1339 /* write min and max scores below the plot */
1342 y1 = Ymin - (Yrange * NumberOffsetFactor);
1343 x1 = trans_x (x1, sim, Xmin, Xmax, Xrange, Twidth, Xoffset);
1344 x2 = trans_x (x2, sim, Xmin, Xmax, Xrange, Twidth, Xoffset);
1345 y1 = trans_y (y1, Ymin, Ymax, Yrange, Theight, Yoffset);
1346 sprintf (buff, "%g", Xmin);
1347 PSPutText ((float) x1, (float) y1, buff, fout);
1348 sprintf (buff, "%g", Xmax);
1349 PSPutText ((float) x2, (float) y1, buff, fout);
1351 for (i = 0; i < n; ++i)
1353 float bob = 1. / 0.;
1355 x1 = clust[i].parentA->score;
1356 x2 = clust[i].score;
1357 x3 = clust[i].parentB->score;
1358 if (x1 == base_val && sim == 1)
1360 if (x1 == base_val && sim == 2)
1362 if (x3 == base_val && sim == 1)
1364 if (x3 == base_val && sim == 2)
1366 y1 = get_mean (clust[i].parentA[0].members, inx, clust[i].parentA[0].n);
1367 y2 = get_mean (clust[i].parentB[0].members, inx, clust[i].parentB[0].n);
1368 /* transform x and y to fit on the page, then */
1369 x1 = trans_x (x1, sim, Xmin, Xmax, Xrange, Twidth, Xoffset);
1370 x2 = trans_x (x2, sim, Xmin, Xmax, Xrange, Twidth, Xoffset);
1375 fprintf (stderr, "%e, %e, %e, %e, %e, %e, %e\n", x1, (double)sim, Xmin,
1376 Xmax, Xrange, Twidth, Xoffset);
1377 exit (EXIT_FAILURE);
1380 x3 = trans_x (x3, sim, Xmin, Xmax, Xrange, Twidth, Xoffset);
1381 y1 = trans_y (y1, Ymin, Ymax, Yrange, Theight, Yoffset);
1382 y2 = trans_y (y2, Ymin, Ymax, Yrange, Theight, Yoffset);
1384 draw_arch (fout, (float) x1, (float) x2, (float) x3, (float) y1,
1387 fprintf (fout, "showpage\n");
1391 get_mean (int *arr, int *inx, int n)
1396 for (i = 0; i < n; ++i)
1398 sum += (double) inx[arr[i]];
1404 draw_arch (FILE * fout, float x1, float x2, float x3, float y1, float y2)
1406 fprintf (fout, "%f %f moveto\n", x1, y1);
1407 fprintf (fout, "%f %f lineto\n", x2, y1);
1408 fprintf (fout, "%f %f lineto\n", x2, y2);
1409 fprintf (fout, "%f %f lineto\n", x3, y2);
1410 fprintf (fout, "stroke newpath\n");
1416 double Xmin, double Xmax, double Xrange, float Twidth, float Xoffset)
1420 return (((X - Xmin) / Xrange) * (double) Twidth) + Xoffset;
1424 return (((Xmax - X) / Xrange) * (double) Twidth) + Xoffset;
1430 double Xmin, double Xmax, double Xrange, float Twidth, float Xoffset)
1432 return (((X - Xmin) / Xrange) * (double) Twidth) + Xoffset;
1436 PSPutText (float x, float y, char *text, FILE * outf)
1438 fprintf (outf, " %f %f moveto (%s) show \n", x, y, text);
1441 /**********************************************************************
1442 mark_parents: This works back up the tree setting the entity->lab
1443 values to 0 for all parents.
1444 Hopefully the recursion won't run out of space for the size of problem
1445 we typically deal with...
1446 ----------------------------------------------------------------------*/
1449 mark_parents (struct sc *entity)
1452 if (entity->parentA != NULL)
1454 mark_parents (entity->parentA);
1456 if (entity->parentB != NULL)
1458 mark_parents (entity->parentB);
1462 /*********************************************************************
1465 Work back down the tree and print out the identifiers of entitites that
1466 are not clustered into any group. If mark_parents has not been called,
1467 then all entities will be output. Otherwise, only those that have
1468 not been marked will be output.
1469 --------------------------------------------------------------------*/
1472 write_unclustered (struct sc *entity, char **idents, FILE * fout)
1474 if (entity->lab == 1)
1476 /* this could be an unclustered entity or a cluster
1477 containing unclustered entities */
1478 if (entity->parentA == NULL && entity->parentB == NULL)
1480 fprintf (fout, "%s\n", idents[entity->members[0]]);
1483 /* we don't need both if's since any entitity with one NULL parent
1484 must also have the other parent NULL
1486 if (entity->parentA != NULL)
1488 write_unclustered (entity->parentA, idents, fout);
1490 if (entity->parentB != NULL)
1492 write_unclustered (entity->parentB, idents, fout);
1497 /*********************************************************************
1498 * OC FILE READING FUNCTIONS
1499 --------------------------------------------------------------------*/
1502 /*********************************************************************
1505 Pass a string to the function, prints it and the system error, then exits with
1507 --------------------------------------------------------------------*/
1510 sys_fatal_error (const char *error)
1513 exit (EXIT_FAILURE);
1516 /*********************************************************************
1519 Returns a null terminated string representing a line, from a file pointer. Or,
1520 if the file is empty, NULL.
1521 --------------------------------------------------------------------*/
1523 #define BUFFER 80 /* Size of buffer for reading in a line */
1526 getline_loc (FILE * file)
1532 if ((line = malloc (sizeof (char) * BUFFER)) == NULL)
1533 sys_fatal_error ("malloc() failed\n");
1535 while ((c = getc (file)) != EOF)
1538 if (i + 1 % BUFFER == 0)
1540 if ((line = realloc (line, sizeof (char) * BUFFER * ++j)) == NULL)
1541 sys_fatal_error ("realloc() failed\n");
1544 if (c == (int) '\n')
1547 line[i++] = (char) c;
1553 if ((line = realloc (line, sizeof (char) * i)) == NULL)
1554 sys_fatal_error ("realloc() failed\n");
1561 /**********************************************************************
1563 Reads in the number of sequences that are found in the OC file.
1567 -----------------------------------------------------------------------*/
1569 read_num (FILE * infile)
1575 /* regex_t cmpregex;
1576 const char pattern[] = "^ *[0-9]\\+ *$";
1579 if ((line = getline_loc (infile)) == NULL)
1581 fprintf (stderr, "File empty.\n");
1582 exit (EXIT_FAILURE);
1585 /* comment this stuff out as it fails on SunOS
1587 if (regcomp(&cmpregex, pattern, REG_NOSUB) != 0) {
1588 fprintf(stderr, "Regex compilation failed! Check source code for error.\n");
1592 if (regexec(&cmpregex, line, 0, NULL, 0) == REG_NOMATCH) {
1593 fprintf(stderr, "File doesn't contain integer number of entries on first line. Line is:\n%s\n", line);
1600 /* regfree(&cmpregex); */
1605 /**********************************************************************
1607 reads n identifiers from infile
1609 returns a pointer to an array of identifiers
1610 -----------------------------------------------------------------------*/
1613 read_idents (FILE * infile, int n)
1620 buff = GJstrcreate (MAX_ID_LEN, NULL);
1621 ret_val = (char **) GJmalloc (sizeof (char *) * n);
1623 for (i = 0; i < n; ++i)
1625 error = fscanf (infile, "%s", buff);
1628 fprintf (stderr, "Premature end of file reached in identifiers\n");
1629 exit (EXIT_FAILURE);
1631 ret_val[i] = GJstrdup (buff);
1636 /**********************************************************************
1638 reads a file containing n, the number of entities, followed by
1639 n*(n-1)/2 numbers that are the pair distances or similarities between
1642 Returns an upper diagonal type pointer to pointer array
1644 -----------------------------------------------------------------------*/
1646 read_up_diag (FILE * infile, int n)
1652 ret_val = (double **) GJDudarr (n);
1653 for (i = 0; i < (n - 1); ++i)
1655 for (j = 0; j < (n - i - 1); ++j)
1657 error = fscanf (infile, "%lf", &ret_val[i][j]);
1661 "Premature end of file reached in upper diagonal\n");
1662 exit (EXIT_FAILURE);