2 * Jalview - A Sequence Alignment Editor and Viewer
\r
3 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
\r
5 * This program is free software; you can redistribute it and/or
\r
6 * modify it under the terms of the GNU General Public License
\r
7 * as published by the Free Software Foundation; either version 2
\r
8 * of the License, or (at your option) any later version.
\r
10 * This program is distributed in the hope that it will be useful,
\r
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
13 * GNU General Public License for more details.
\r
15 * You should have received a copy of the GNU General Public License
\r
16 * along with this program; if not, write to the Free Software
\r
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
22 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
\r
23 // TODO: Implement Basic NHX tag parsing and preservation
\r
24 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
\r
25 // TODO: Extended SequenceNodeI to hold parsed NHX strings
\r
26 package uk.ac.vamsas.objects.utils;
\r
29 import java.util.regex.Matcher;
\r
30 import java.util.regex.Pattern;
\r
32 import uk.ac.vamsas.client.Vobject;
\r
33 import uk.ac.vamsas.client.VorbaId;
\r
34 import uk.ac.vamsas.objects.core.SequenceType;
\r
40 * @version $Revision: 1.12 $
\r
42 public class NewickFile {
\r
43 public class BinaryNode {
\r
54 /** DOCUMENT ME!! */
\r
55 public int bootstrap;
\r
58 * Creates a new BinaryNode object.
\r
60 public BinaryNode() {
\r
61 left = right = parent = null;
\r
66 * Creates a new BinaryNode object.
\r
75 public BinaryNode(VorbaId element, BinaryNode parent, String name) {
\r
76 this.element = element;
\r
77 this.parent = parent;
\r
80 left = right = null;
\r
86 * @return DOCUMENT ME!
\r
88 public VorbaId element() {
\r
98 * @return DOCUMENT ME!
\r
100 public VorbaId setElement(VorbaId v) {
\r
101 return element = v;
\r
107 * @return DOCUMENT ME!
\r
109 public BinaryNode left() {
\r
119 * @return DOCUMENT ME!
\r
121 public BinaryNode setLeft(BinaryNode n) {
\r
128 * @return DOCUMENT ME!
\r
130 public BinaryNode right() {
\r
140 * @return DOCUMENT ME!
\r
142 public BinaryNode setRight(BinaryNode n) {
\r
149 * @return DOCUMENT ME!
\r
151 public BinaryNode parent() {
\r
161 * @return DOCUMENT ME!
\r
163 public BinaryNode setParent(BinaryNode n) {
\r
170 * @return DOCUMENT ME!
\r
172 public boolean isLeaf() {
\r
173 return (left == null) && (right == null);
\r
177 * attaches FIRST and SECOND node arguments as the LEFT and RIGHT children
\r
178 * of this node (removing any old references) a null parameter DOES NOT mean
\r
179 * that the pointer to the corresponding child node is set to NULL - you
\r
180 * should use setChild(null), or detach() for this.
\r
183 public void SetChildren(BinaryNode leftchild, BinaryNode rightchild) {
\r
184 if (leftchild != null) {
\r
185 this.setLeft(leftchild);
\r
186 leftchild.detach();
\r
187 leftchild.setParent(this);
\r
190 if (rightchild != null) {
\r
191 this.setRight(rightchild);
\r
192 rightchild.detach();
\r
193 rightchild.setParent(this);
\r
198 * Detaches the node from the binary tree, along with all its child nodes.
\r
200 * @return BinaryNode The detached node.
\r
202 public BinaryNode detach() {
\r
203 if (this.parent != null) {
\r
204 if (this.parent.left == this) {
\r
205 this.parent.left = null;
\r
207 if (this.parent.right == this) {
\r
208 this.parent.right = null;
\r
213 this.parent = null;
\r
219 * Traverses up through the tree until a node with a free leftchild is
\r
222 * @return BinaryNode
\r
224 public BinaryNode ascendLeft() {
\r
225 BinaryNode c = this;
\r
229 } while ((c != null) && (c.left() != null) && !c.left().isLeaf());
\r
235 * Traverses up through the tree until a node with a free rightchild is
\r
236 * discovered. Jalview builds trees by descent on the left, so this may be
\r
239 * @return BinaryNode
\r
241 public BinaryNode ascendRight() {
\r
242 BinaryNode c = this;
\r
246 } while ((c != null) && (c.right() != null) && !c.right().isLeaf());
\r
257 public void setName(String name) {
\r
264 * @return DOCUMENT ME!
\r
266 public String getName() {
\r
276 public void setBootstrap(int boot) {
\r
277 this.bootstrap = boot;
\r
283 * @return DOCUMENT ME!
\r
285 public int getBootstrap() {
\r
290 * @return unquoted string of form VorbaId|v|node name string
\r
292 public String getNewickNodeName() {
\r
293 if (element!=null || name!=null)
\r
295 return ((element!=null) ? element.getId()+"|v|" : "")
\r
296 + ((name == null) ? "" : name);
\r
300 public boolean parseNodeNameString(uk.ac.vamsas.client.ClientDocument binder)
\r
303 if (element==null && name!=null && name.indexOf("|v|")>-1)
\r
305 element = binder.getObject(null).getVorbaId(); // TODO: fix THIS ! name.substring(0, name.indexOf("|v")));
\r
312 public class SequenceNode extends BinaryNode {
\r
313 /** DOCUMENT ME!! */
\r
316 /** DOCUMENT ME!! */
\r
317 public boolean dummy = false;
\r
319 private boolean placeholder = false;
\r
322 * Creates a new SequenceNode object.
\r
324 public SequenceNode() {
\r
329 * Creates a new SequenceNode object.
\r
340 public SequenceNode(VorbaId val, SequenceNode parent, float dist, String name) {
\r
341 super(val, parent, name);
\r
344 public SequenceNode(Vobject val, SequenceNode parent, float dist, String name) {
\r
345 super(val.getVorbaId(), parent, name);
\r
350 * Creates a new SequenceNode object.
\r
365 public SequenceNode(Vobject val, SequenceNode parent, String name,
\r
366 float dist, int bootstrap, boolean dummy) {
\r
367 super(val.getVorbaId(), parent, name);
\r
369 this.bootstrap = bootstrap;
\r
370 this.dummy = dummy;
\r
375 * true if node is created for the representation of polytomous
\r
378 public boolean isDummy() {
\r
383 * @param placeholder is true if the sequence refered to in the element node
\r
384 * is not actually present in the associated alignment
\r
386 public boolean isPlaceholder() {
\r
387 return placeholder;
\r
396 * @return DOCUMENT ME!
\r
398 public boolean setDummy(boolean newstate) {
\r
399 boolean oldstate = dummy;
\r
408 * @param Placeholder
\r
411 public void setPlaceholder(boolean Placeholder) {
\r
412 this.placeholder = Placeholder;
\r
416 * ascends the tree but doesn't stop until a non-dummy node is discovered.
\r
417 * This will probably break if the tree is a mixture of BinaryNodes and
\r
420 public SequenceNode AscendTree() {
\r
421 SequenceNode c = this;
\r
424 c = (SequenceNode) c.parent();
\r
425 } while ((c != null) && c.dummy);
\r
434 private boolean HasBootstrap = false;
\r
436 private boolean HasDistances = false;
\r
438 private boolean RootHasDistance = false;
\r
441 boolean ReplaceUnderscores = false;
\r
443 boolean printRootInfo = false;
\r
445 private Pattern[] NodeSafeName = new Pattern[] {
\r
446 Pattern.compile("[\\[,:'()]"), // test for requiring quotes
\r
447 Pattern.compile("'"), // escaping quote characters
\r
448 Pattern.compile("/w") // unqoted whitespace transformation
\r
451 char QuoteChar = '\'';
\r
453 String newickFile = null;
\r
456 * Creates a new NewickFile object.
\r
461 * @throws IOException
\r
464 public NewickFile(String inStr) throws IOException {
\r
465 newickFile = inStr;
\r
468 public NewickFile(File inFile) throws IOException {
\r
469 errormessage = "Problem's reading file "+inFile;
\r
470 dataIn = new java.io.BufferedReader(new InputStreamReader(new java.io.FileInputStream(inFile)));
\r
474 * Creates a new NewickFile object.
\r
479 public NewickFile(SequenceNode newtree) {
\r
484 * Creates a new NewickFile object.
\r
491 public NewickFile(SequenceNode newtree, boolean bootstrap) {
\r
492 HasBootstrap = bootstrap;
\r
497 * Creates a new NewickFile object.
\r
506 public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances) {
\r
508 HasBootstrap = bootstrap;
\r
509 HasDistances = distances;
\r
513 * Creates a new NewickFile object.
\r
521 * @param rootdistance
\r
524 public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances,
\r
525 boolean rootdistance) {
\r
527 HasBootstrap = bootstrap;
\r
528 HasDistances = distances;
\r
529 RootHasDistance = rootdistance;
\r
538 * Message qualifier (ie the incorrect data)
\r
540 * range of characters either side to dump
\r
542 * character position
\r
544 * newick string being parsed
\r
546 * @return composed error message
\r
548 private String ErrorStringrange(String Error, String Er, int r, int p,
\r
550 return ((Error == null) ? "" : Error)
\r
555 + s.substring(((p - r) < 0) ? 0 : (p - r), ((p + r) > s.length()) ? s
\r
556 .length() : (p + r)) + " )\n";
\r
561 * @return true if tree has bootstrap values
\r
563 public boolean HasBootstrap() {
\r
564 return HasBootstrap;
\r
569 * @return true if tree has distances on branches
\r
571 public boolean HasDistances() {
\r
572 return HasDistances;
\r
577 * @return true if root has a stem distance
\r
579 public boolean HasRootDistance() {
\r
580 return RootHasDistance;
\r
583 * hacked out of jalview code
\r
586 String errormessage;
\r
587 java.io.BufferedReader dataIn=null;
\r
588 public String nextLine() throws IOException {
\r
591 dataIn = new BufferedReader(new StringReader(newickFile));
\r
595 throw new IOException("IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string.");
\r
597 return dataIn.readLine();
\r
598 throw new IOException("Invalid Source Stream:" + errormessage);
\r
601 * call this to convert the newick string into a binary node linked tree
\r
603 * @throws IOException
\r
604 * if the newick string cannot be parsed.
\r
606 public void parse() throws IOException {
\r
608 if (newickFile==null)
\r
610 // fill nf with complete tree file
\r
612 StringBuffer file = new StringBuffer();
\r
614 while ((nf = nextLine()) != null) {
\r
618 nf = file.toString();
\r
625 root = new SequenceNode();
\r
627 SequenceNode realroot = null;
\r
628 SequenceNode c = root;
\r
632 // int flen = nf.length();
\r
634 String Error = null;
\r
635 String nodename = null;
\r
637 float DefDistance = (float) 0.001; // @param Default distance for a node -
\r
639 int DefBootstrap = 0; // @param Default bootstrap for a node
\r
641 float distance = DefDistance;
\r
642 int bootstrap = DefBootstrap;
\r
644 boolean ascending = false; // flag indicating that we are leaving the
\r
647 Pattern majorsyms = Pattern.compile(
\r
650 Matcher mjsyms = majorsyms.matcher(nf);
\r
651 while (mjsyms.find(cp) && (Error == null)) {
\r
652 int fcp = mjsyms.start();
\r
654 switch (nf.charAt(fcp)) {
\r
655 case '[': // Comment or structured/extended NH format info
\r
658 if (nf.indexOf(']',fcp)>-1) {
\r
659 // Skip the comment field
\r
660 cp = nf.indexOf(']',fcp);
\r
662 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf);
\r
671 // ascending should not be set
\r
672 // New Internal node
\r
674 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
\r
682 if (c.right() == null) {
\r
683 c.setRight(new SequenceNode(null, c, null, DefDistance, DefBootstrap,
\r
685 c = (SequenceNode) c.right();
\r
687 if (c.left() != null) {
\r
688 // Dummy node for polytomy - keeps c.left free for new node
\r
689 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
\r
690 tmpn.SetChildren(c.left(), c.right());
\r
694 c.setLeft(new SequenceNode(null, c, null, DefDistance, DefBootstrap,
\r
696 c = (SequenceNode) c.left();
\r
699 if (realroot == null) {
\r
704 distance = DefDistance;
\r
705 bootstrap = DefBootstrap;
\r
710 // Deal with quoted fields
\r
713 Matcher qnodename = Pattern.compile("([^']|'')+'").matcher(nf);
\r
714 if (qnodename.find(fcp)) {
\r
715 nodename = new String(qnodename.group(1));
\r
716 cp = qnodename.end(0);
\r
718 Error = ErrorStringrange(Error, "Unterminated quotes for nodename",
\r
727 Error = ErrorStringrange(Error,
\r
728 "Wayward semicolon (depth=" + d + ")", 7, fcp, nf);
\r
731 // cp advanced at the end of default
\r
734 // Parse simpler field strings
\r
735 String fstring = nf.substring(cp, fcp);
\r
736 Matcher uqnodename = Pattern.compile("\\b([^' :;\\](),]+)").matcher(fstring);
\r
737 if (uqnodename.matches()
\r
738 && ((uqnodename.start(1) == 0) || (fstring.charAt(uqnodename
\r
739 .start(1) - 1) != ':'))) // JBPNote HACK!
\r
741 if (nodename == null) {
\r
742 if (ReplaceUnderscores) {
\r
743 nodename = uqnodename.group(1).replace('_', ' ');
\r
745 nodename = uqnodename.group(1);
\r
748 Error = ErrorStringrange(Error,
\r
749 "File has broken algorithm - overwritten nodename", 10, fcp, nf);
\r
753 Matcher nbootstrap = Pattern.compile("\\S+([0-9+]+)\\S*:").matcher(fstring);
\r
756 if (nbootstrap.matches()
\r
757 && (nbootstrap.start(1) > uqnodename.end(1))) {
\r
759 bootstrap = (new Integer(nbootstrap.group(1))).intValue();
\r
760 HasBootstrap = true;
\r
761 } catch (Exception e) {
\r
762 Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4,
\r
763 cp + nbootstrap.start(0), nf);
\r
767 Matcher ndist = Pattern.compile(
\r
768 ":([-0-9Ee.+]+)").matcher(fstring);
\r
769 boolean nodehasdistance = false;
\r
771 if (ndist.matches()) {
\r
773 distance = (new Float(ndist.group(1))).floatValue();
\r
774 HasDistances = true;
\r
775 nodehasdistance = true;
\r
776 } catch (Exception e) {
\r
777 Error = ErrorStringrange(Error, "Can't parse node distance value",
\r
778 7, cp + ndist.start(0), nf);
\r
783 // Write node info here
\r
784 c.setName(nodename);
\r
785 // Trees without distances still need a render distance
\r
786 c.dist = (HasDistances) ? distance : DefDistance;
\r
787 // be consistent for internal bootstrap defaults too
\r
788 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
\r
789 if (c == realroot) {
\r
790 RootHasDistance = nodehasdistance; // JBPNote This is really
\r
791 // UGLY!!! Ensure root node gets
\r
792 // its given distance
\r
795 // Find a place to put the leaf
\r
796 SequenceNode newnode = new SequenceNode(null, c, nodename,
\r
797 (HasDistances) ? distance : DefDistance,
\r
798 (HasBootstrap) ? bootstrap : DefBootstrap, false);
\r
800 if (c.right() == null) {
\r
801 c.setRight(newnode);
\r
803 if (c.left() == null) {
\r
804 c.setLeft(newnode);
\r
806 // Insert a dummy node for polytomy
\r
807 // dummy nodes have distances
\r
808 SequenceNode newdummy = new SequenceNode(null, c, null,
\r
809 (HasDistances ? 0 : DefDistance), 0, true);
\r
810 newdummy.SetChildren(c.left(), newnode);
\r
811 c.setLeft(newdummy);
\r
817 // move back up the tree from preceding closure
\r
818 c = c.AscendTree();
\r
820 if ((d > -1) && (c == null)) {
\r
821 Error = ErrorStringrange(
\r
823 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
\r
828 if (nf.charAt(fcp) == ')') {
\r
832 if (nf.charAt(fcp) == ',') {
\r
836 // Just advance focus, if we need to
\r
837 if ((c.left() != null) && (!c.left().isLeaf())) {
\r
838 c = (SequenceNode) c.left();
\r
843 // else : We do nothing if ';' is encountered.
\r
846 // Reset new node properties to obvious fakes
\r
848 distance = DefDistance;
\r
849 bootstrap = DefBootstrap;
\r
855 if (Error != null) {
\r
856 throw (new IOException("NewickFile: " + Error + "\n"));
\r
859 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
\r
861 if (!RootHasDistance) {
\r
862 root.dist = (HasDistances) ? 0 : DefDistance;
\r
869 * @return DOCUMENT ME!
\r
871 public SequenceNode getTree() {
\r
874 public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames(String[] names, Vobject[] boundObjects)
\r
877 // also - need to reconstruct a names object id mapping (or BInaryNode) mapping for the parsed tree file
\r
881 * Generate a newick format tree according to internal flags for bootstraps,
\r
882 * distances and root distances.
\r
884 * @return new hampshire tree in a single line
\r
886 public String print() {
\r
887 synchronized (this) {
\r
888 StringBuffer tf = new StringBuffer();
\r
891 return (tf.append(";").toString());
\r
898 * Generate a newick format tree according to internal flags for distances and
\r
899 * root distances and user specificied writing of bootstraps.
\r
901 * @param withbootstraps
\r
902 * controls if bootstrap values are explicitly written.
\r
904 * @return new hampshire tree in a single line
\r
906 public String print(boolean withbootstraps) {
\r
907 synchronized (this) {
\r
908 boolean boots = this.HasBootstrap;
\r
909 this.HasBootstrap = withbootstraps;
\r
911 String rv = print();
\r
912 this.HasBootstrap = boots;
\r
920 * Generate newick format tree according to internal flags for writing root
\r
923 * @param withbootstraps
\r
924 * explicitly write bootstrap values
\r
926 * explicitly write distances
\r
928 * @return new hampshire tree in a single line
\r
930 public String print(boolean withbootstraps, boolean withdists) {
\r
931 synchronized (this) {
\r
932 boolean dists = this.HasDistances;
\r
933 this.HasDistances = withdists;
\r
935 String rv = print(withbootstraps);
\r
936 this.HasDistances = dists;
\r
943 * Generate newick format tree according to user specified flags
\r
945 * @param withbootstraps
\r
946 * explicitly write bootstrap values
\r
948 * explicitly write distances
\r
949 * @param printRootInfo
\r
950 * explicitly write root distance
\r
952 * @return new hampshire tree in a single line
\r
954 public String print(boolean withbootstraps, boolean withdists,
\r
955 boolean printRootInfo) {
\r
956 synchronized (this) {
\r
957 boolean rootinfo = printRootInfo;
\r
958 this.printRootInfo = printRootInfo;
\r
960 String rv = print(withbootstraps, withdists);
\r
961 this.printRootInfo = rootinfo;
\r
970 * @return DOCUMENT ME!
\r
972 char getQuoteChar() {
\r
982 * @return DOCUMENT ME!
\r
984 char setQuoteChar(char c) {
\r
985 char old = QuoteChar;
\r
997 * @return DOCUMENT ME!
\r
999 private String nodeName(String name) {
\r
1000 if (NodeSafeName[0].matcher(name).find()) {
\r
1001 return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + QuoteChar; // quite
\r
1003 return NodeSafeName[2].matcher(name).replaceAll("_"); // whitespace
\r
1013 * @return DOCUMENT ME!
\r
1015 private String printNodeField(SequenceNode c) {
\r
1016 return c.getNewickNodeName()
\r
1017 + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (" " + c.getBootstrap())
\r
1018 : "") : "") + ((HasDistances) ? (":" + c.dist) : "");
\r
1027 * @return DOCUMENT ME!
\r
1029 private String printRootField(SequenceNode root) {
\r
1030 return (printRootInfo) ? (((root.getName() == null) ? "" : nodeName(root
\r
1032 + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? (" " + root
\r
1033 .getBootstrap()) : "") : "") + ((RootHasDistance) ? (":" + root.dist)
\r
1038 // Non recursive call deals with root node properties
\r
1039 public void print(StringBuffer tf, SequenceNode root) {
\r
1040 if (root != null) {
\r
1041 if (root.isLeaf() && printRootInfo) {
\r
1042 tf.append(printRootField(root));
\r
1044 if (root.isDummy()) {
\r
1045 _print(tf, (SequenceNode) root.right());
\r
1046 _print(tf, (SequenceNode) root.left());
\r
1049 _print(tf, (SequenceNode) root.right());
\r
1051 if (root.left() != null) {
\r
1055 _print(tf, (SequenceNode) root.left());
\r
1056 tf.append(")" + printRootField(root));
\r
1062 // Recursive call for non-root nodes
\r
1063 public void _print(StringBuffer tf, SequenceNode c) {
\r
1066 tf.append(printNodeField(c));
\r
1068 if (c.isDummy()) {
\r
1069 _print(tf, (SequenceNode) c.left());
\r
1070 if (c.left() != null) {
\r
1073 _print(tf, (SequenceNode) c.right());
\r
1076 _print(tf, (SequenceNode) c.right());
\r
1078 if (c.left() != null) {
\r
1082 _print(tf, (SequenceNode) c.left());
\r
1083 tf.append(")" + printNodeField(c));
\r
1090 public static void main(String[] args) {
\r
1092 if (args == null || args.length != 1) {
\r
1094 .println("Takes one argument - file name of a newick tree file.");
\r
1098 File fn = new File(args[0]);
\r
1100 StringBuffer newickfile = new StringBuffer();
\r
1101 BufferedReader treefile = new BufferedReader(new FileReader(fn));
\r
1104 while ((l = treefile.readLine()) != null) {
\r
1105 newickfile.append(l);
\r
1109 System.out.println("Read file :\n");
\r
1110 NewickFile trf = new NewickFile(fn);
\r
1112 System.out.println("Original file :\n");
\r
1114 System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()).replaceAll("") + "\n");
\r
1116 System.out.println("Parsed file.\n");
\r
1117 System.out.println("Default output type for original input.\n");
\r
1118 System.out.println(trf.print());
\r
1119 System.out.println("Without bootstraps.\n");
\r
1120 System.out.println(trf.print(false));
\r
1121 System.out.println("Without distances.\n");
\r
1122 System.out.println(trf.print(true, false));
\r
1123 System.out.println("Without bootstraps but with distanecs.\n");
\r
1124 System.out.println(trf.print(false, true));
\r
1125 System.out.println("Without bootstraps or distanecs.\n");
\r
1126 System.out.println(trf.print(false, false));
\r
1127 System.out.println("With bootstraps and with distances.\n");
\r
1128 System.out.println(trf.print(true, true));
\r
1129 } catch (java.io.IOException e) {
\r
1130 System.err.println("Exception\n" + e);
\r
1131 e.printStackTrace();
\r