4 * Jalview - A Sequence Alignment Editor and Viewer
\r
5 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
\r
7 * This program is free software; you can redistribute it and/or
\r
8 * modify it under the terms of the GNU General Public License
\r
9 * as published by the Free Software Foundation; either version 2
\r
10 * of the License, or (at your option) any later version.
\r
12 * This program is distributed in the hope that it will be useful,
\r
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
15 * GNU General Public License for more details.
\r
17 * You should have received a copy of the GNU General Public License
\r
18 * along with this program; if not, write to the Free Software
\r
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
\r
24 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
\r
25 // TODO: Implement Basic NHX tag parsing and preservation
\r
26 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
\r
27 // TODO: Extended SequenceNodeI to hold parsed NHX strings
\r
28 package uk.ac.vamsas.objects.utils.trees;
\r
32 import java.util.Enumeration;
\r
33 import java.util.Hashtable;
\r
34 import java.util.Vector;
\r
35 import java.util.regex.Matcher;
\r
36 import java.util.regex.Pattern;
\r
38 import uk.ac.vamsas.client.Vobject;
\r
39 import uk.ac.vamsas.objects.core.Treenode;
\r
40 import uk.ac.vamsas.objects.core.Vref;
\r
46 * @version $Revision: 1.12 $
\r
48 public class NewickFile {
\r
51 private boolean HasBootstrap = false;
\r
53 private boolean HasDistances = false;
\r
55 private boolean RootHasDistance = false;
\r
58 boolean ReplaceUnderscores = false;
\r
60 boolean printRootInfo = false;
\r
62 private Pattern[] NodeSafeName = new Pattern[] {
\r
63 Pattern.compile("[\\[,:'()]"), // test for requiring quotes
\r
64 Pattern.compile("'"), // escaping quote characters
\r
65 Pattern.compile("/w") // unqoted whitespace transformation
\r
68 char QuoteChar = '\'';
\r
70 String newickFile = null;
\r
73 * Creates a new NewickFile object.
\r
78 * @throws IOException
\r
81 public NewickFile(String inStr) throws IOException {
\r
85 public NewickFile(File inFile) throws IOException {
\r
86 errormessage = "Problem's reading file "+inFile;
\r
87 dataIn = new java.io.BufferedReader(new InputStreamReader(new java.io.FileInputStream(inFile)));
\r
91 * Creates a new NewickFile object.
\r
96 public NewickFile(SequenceNode newtree) {
\r
101 * Creates a new NewickFile object.
\r
108 public NewickFile(SequenceNode newtree, boolean bootstrap) {
\r
109 HasBootstrap = bootstrap;
\r
114 * Creates a new NewickFile object.
\r
123 public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances) {
\r
125 HasBootstrap = bootstrap;
\r
126 HasDistances = distances;
\r
130 * Creates a new NewickFile object.
\r
138 * @param rootdistance
\r
141 public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances,
\r
142 boolean rootdistance) {
\r
144 HasBootstrap = bootstrap;
\r
145 HasDistances = distances;
\r
146 RootHasDistance = rootdistance;
\r
155 * Message qualifier (ie the incorrect data)
\r
157 * range of characters either side to dump
\r
159 * character position
\r
161 * newick string being parsed
\r
163 * @return composed error message
\r
165 private String ErrorStringrange(String Error, String Er, int r, int p,
\r
167 return ((Error == null) ? "" : Error)
\r
172 + s.substring(((p - r) < 0) ? 0 : (p - r), ((p + r) > s.length()) ? s
\r
173 .length() : (p + r)) + " )\n";
\r
178 * @return true if tree has bootstrap values
\r
180 public boolean HasBootstrap() {
\r
181 return HasBootstrap;
\r
186 * @return true if tree has distances on branches
\r
188 public boolean HasDistances() {
\r
189 return HasDistances;
\r
194 * @return true if root has a stem distance
\r
196 public boolean HasRootDistance() {
\r
197 return RootHasDistance;
\r
200 * hacked out of jalview code
\r
203 String errormessage;
\r
204 java.io.BufferedReader dataIn=null;
\r
205 public String nextLine() throws IOException {
\r
208 dataIn = new BufferedReader(new StringReader(newickFile));
\r
212 throw new IOException("IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string.");
\r
214 return dataIn.readLine();
\r
215 throw new IOException("Invalid Source Stream:" + errormessage);
\r
218 * call this to convert the newick string into a binary node linked tree
\r
220 * @throws IOException
\r
221 * if the newick string cannot be parsed.
\r
223 public void parse() throws IOException {
\r
225 if (newickFile==null)
\r
227 // fill nf with complete tree file
\r
229 StringBuffer file = new StringBuffer();
\r
231 while ((nf = nextLine()) != null) {
\r
235 nf = file.toString();
\r
242 root = new SequenceNode();
\r
244 SequenceNode realroot = null;
\r
245 SequenceNode c = root;
\r
249 // int flen = nf.length();
\r
251 String Error = null;
\r
252 String nodename = null;
\r
254 float DefDistance = (float) 0.001; // @param Default distance for a node -
\r
256 int DefBootstrap = 0; // @param Default bootstrap for a node
\r
258 float distance = DefDistance;
\r
259 int bootstrap = DefBootstrap;
\r
261 boolean ascending = false; // flag indicating that we are leaving the
\r
264 Pattern majorsyms = Pattern.compile(
\r
267 Matcher mjsyms = majorsyms.matcher(nf);
\r
268 while (mjsyms.find(cp) && (Error == null)) {
\r
269 int fcp = mjsyms.start();
\r
271 switch (nf.charAt(fcp)) {
\r
272 case '[': // Comment or structured/extended NH format info
\r
275 if (nf.indexOf(']',fcp)>-1) {
\r
276 // Skip the comment field
\r
277 cp = nf.indexOf(']',fcp);
\r
279 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf);
\r
288 // ascending should not be set
\r
289 // New Internal node
\r
291 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
\r
299 if (c.right() == null) {
\r
300 c.setRight(new SequenceNode(null, c, null, DefDistance, DefBootstrap,
\r
302 c = (SequenceNode) c.right();
\r
304 if (c.left() != null) {
\r
305 // Dummy node for polytomy - keeps c.left free for new node
\r
306 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
\r
307 tmpn.SetChildren(c.left(), c.right());
\r
311 c.setLeft(new SequenceNode(null, c, null, DefDistance, DefBootstrap,
\r
313 c = (SequenceNode) c.left();
\r
316 if (realroot == null) {
\r
321 distance = DefDistance;
\r
322 bootstrap = DefBootstrap;
\r
327 // Deal with quoted fields
\r
330 Matcher qnodename = Pattern.compile("([^']|'')+'").matcher(nf);
\r
331 if (qnodename.find(fcp)) {
\r
332 nodename = new String(qnodename.group(1));
\r
333 cp = qnodename.end(0);
\r
335 Error = ErrorStringrange(Error, "Unterminated quotes for nodename",
\r
344 Error = ErrorStringrange(Error,
\r
345 "Wayward semicolon (depth=" + d + ")", 7, fcp, nf);
\r
348 // cp advanced at the end of default
\r
351 // Parse simpler field strings
\r
352 String fstring = nf.substring(cp, fcp);
\r
353 Matcher uqnodename = Pattern.compile("\\b([^' :;\\](),]+)").matcher(fstring);
\r
354 if (uqnodename.matches()
\r
355 && ((uqnodename.start(1) == 0) || (fstring.charAt(uqnodename
\r
356 .start(1) - 1) != ':'))) // JBPNote HACK!
\r
358 if (nodename == null) {
\r
359 if (ReplaceUnderscores) {
\r
360 nodename = uqnodename.group(1).replace('_', ' ');
\r
362 nodename = uqnodename.group(1);
\r
365 Error = ErrorStringrange(Error,
\r
366 "File has broken algorithm - overwritten nodename", 10, fcp, nf);
\r
370 Matcher nbootstrap = Pattern.compile("\\S+([0-9+]+)\\S*:").matcher(fstring);
\r
373 if (nbootstrap.matches()
\r
374 && (nbootstrap.start(1) > uqnodename.end(1))) {
\r
376 bootstrap = (new Integer(nbootstrap.group(1))).intValue();
\r
377 HasBootstrap = true;
\r
378 } catch (Exception e) {
\r
379 Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4,
\r
380 cp + nbootstrap.start(0), nf);
\r
384 Matcher ndist = Pattern.compile(
\r
385 ":([-0-9Ee.+]+)").matcher(fstring);
\r
386 boolean nodehasdistance = false;
\r
388 if (ndist.matches()) {
\r
390 distance = (new Float(ndist.group(1))).floatValue();
\r
391 HasDistances = true;
\r
392 nodehasdistance = true;
\r
393 } catch (Exception e) {
\r
394 Error = ErrorStringrange(Error, "Can't parse node distance value",
\r
395 7, cp + ndist.start(0), nf);
\r
400 // Write node info here
\r
401 c.setName(nodename);
\r
402 // Trees without distances still need a render distance
\r
403 c.dist = (HasDistances) ? distance : DefDistance;
\r
404 // be consistent for internal bootstrap defaults too
\r
405 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
\r
406 if (c == realroot) {
\r
407 RootHasDistance = nodehasdistance; // JBPNote This is really
\r
408 // UGLY!!! Ensure root node gets
\r
409 // its given distance
\r
412 // Find a place to put the leaf
\r
413 SequenceNode newnode = new SequenceNode(null, c, nodename,
\r
414 (HasDistances) ? distance : DefDistance,
\r
415 (HasBootstrap) ? bootstrap : DefBootstrap, false);
\r
417 if (c.right() == null) {
\r
418 c.setRight(newnode);
\r
420 if (c.left() == null) {
\r
421 c.setLeft(newnode);
\r
423 // Insert a dummy node for polytomy
\r
424 // dummy nodes have distances
\r
425 SequenceNode newdummy = new SequenceNode(null, c, null,
\r
426 (HasDistances ? 0 : DefDistance), 0, true);
\r
427 newdummy.SetChildren(c.left(), newnode);
\r
428 c.setLeft(newdummy);
\r
434 // move back up the tree from preceding closure
\r
435 c = c.AscendTree();
\r
437 if ((d > -1) && (c == null)) {
\r
438 Error = ErrorStringrange(
\r
440 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
\r
445 if (nf.charAt(fcp) == ')') {
\r
449 if (nf.charAt(fcp) == ',') {
\r
453 // Just advance focus, if we need to
\r
454 if ((c.left() != null) && (!c.left().isLeaf())) {
\r
455 c = (SequenceNode) c.left();
\r
460 // else : We do nothing if ';' is encountered.
\r
463 // Reset new node properties to obvious fakes
\r
465 distance = DefDistance;
\r
466 bootstrap = DefBootstrap;
\r
472 if (Error != null) {
\r
473 throw (new IOException("NewickFile: " + Error + "\n"));
\r
476 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
\r
478 if (!RootHasDistance) {
\r
479 root.dist = (HasDistances) ? 0 : DefDistance;
\r
486 * @return DOCUMENT ME!
\r
488 public SequenceNode getTree() {
\r
491 public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames(String[] names, Vobject[] boundObjects)
\r
494 // also - need to reconstruct a names object id mapping (or BInaryNode) mapping for the parsed tree file
\r
498 * Generate a newick format tree according to internal flags for bootstraps,
\r
499 * distances and root distances.
\r
501 * @return new hampshire tree in a single line
\r
503 public String print() {
\r
504 synchronized (this) {
\r
505 StringBuffer tf = new StringBuffer();
\r
508 return (tf.append(";").toString());
\r
515 * Generate a newick format tree according to internal flags for distances and
\r
516 * root distances and user specificied writing of bootstraps.
\r
518 * @param withbootstraps
\r
519 * controls if bootstrap values are explicitly written.
\r
521 * @return new hampshire tree in a single line
\r
523 public String print(boolean withbootstraps) {
\r
524 synchronized (this) {
\r
525 boolean boots = this.HasBootstrap;
\r
526 this.HasBootstrap = withbootstraps;
\r
528 String rv = print();
\r
529 this.HasBootstrap = boots;
\r
537 * Generate newick format tree according to internal flags for writing root
\r
540 * @param withbootstraps
\r
541 * explicitly write bootstrap values
\r
543 * explicitly write distances
\r
545 * @return new hampshire tree in a single line
\r
547 public String print(boolean withbootstraps, boolean withdists) {
\r
548 synchronized (this) {
\r
549 boolean dists = this.HasDistances;
\r
550 this.HasDistances = withdists;
\r
552 String rv = print(withbootstraps);
\r
553 this.HasDistances = dists;
\r
560 * Generate newick format tree according to user specified flags
\r
562 * @param withbootstraps
\r
563 * explicitly write bootstrap values
\r
565 * explicitly write distances
\r
566 * @param printRootInfo
\r
567 * explicitly write root distance
\r
569 * @return new hampshire tree in a single line
\r
571 public String print(boolean withbootstraps, boolean withdists,
\r
572 boolean printRootInfo) {
\r
573 synchronized (this) {
\r
574 boolean rootinfo = printRootInfo;
\r
575 this.printRootInfo = printRootInfo;
\r
577 String rv = print(withbootstraps, withdists);
\r
578 this.printRootInfo = rootinfo;
\r
587 * @return DOCUMENT ME!
\r
589 char getQuoteChar() {
\r
599 * @return DOCUMENT ME!
\r
601 char setQuoteChar(char c) {
\r
602 char old = QuoteChar;
\r
614 * @return DOCUMENT ME!
\r
616 private String nodeName(String name) {
\r
617 if (NodeSafeName[0].matcher(name).find()) {
\r
618 return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + QuoteChar; // quite
\r
620 return NodeSafeName[2].matcher(name).replaceAll("_"); // whitespace
\r
630 * @return DOCUMENT ME!
\r
632 private String printNodeField(SequenceNode c) {
\r
633 return //c.getNewickNodeName()
\r
634 ((c.getName() == null) ? "" : nodeName(c.getName()))
\r
635 + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (" " + c.getBootstrap())
\r
636 : "") : "") + ((HasDistances) ? (":" + c.dist) : "");
\r
645 * @return DOCUMENT ME!
\r
647 private String printRootField(SequenceNode root) {
\r
648 return (printRootInfo) ? (((root.getName() == null) ? "" : nodeName(root
\r
650 + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? (" " + root
\r
651 .getBootstrap()) : "") : "") + ((RootHasDistance) ? (":" + root.dist)
\r
656 // Non recursive call deals with root node properties
\r
657 public void print(StringBuffer tf, SequenceNode root) {
\r
658 if (root != null) {
\r
659 if (root.isLeaf() && printRootInfo) {
\r
660 tf.append(printRootField(root));
\r
662 if (root.isDummy()) {
\r
663 _print(tf, (SequenceNode) root.right());
\r
664 _print(tf, (SequenceNode) root.left());
\r
667 _print(tf, (SequenceNode) root.right());
\r
669 if (root.left() != null) {
\r
673 _print(tf, (SequenceNode) root.left());
\r
674 tf.append(")" + printRootField(root));
\r
680 // Recursive call for non-root nodes
\r
681 public void _print(StringBuffer tf, SequenceNode c) {
\r
684 tf.append(printNodeField(c));
\r
687 _print(tf, (SequenceNode) c.left());
\r
688 if (c.left() != null) {
\r
691 _print(tf, (SequenceNode) c.right());
\r
694 _print(tf, (SequenceNode) c.right());
\r
696 if (c.left() != null) {
\r
700 _print(tf, (SequenceNode) c.left());
\r
701 tf.append(")" + printNodeField(c));
\r
708 public static void main(String[] args) {
\r
710 if (args == null || args.length != 1) {
\r
712 .println("Takes one argument - file name of a newick tree file.");
\r
716 File fn = new File(args[0]);
\r
718 StringBuffer newickfile = new StringBuffer();
\r
719 BufferedReader treefile = new BufferedReader(new FileReader(fn));
\r
722 while ((l = treefile.readLine()) != null) {
\r
723 newickfile.append(l);
\r
727 System.out.println("Read file :\n");
\r
728 NewickFile trf = new NewickFile(fn);
\r
730 System.out.println("Original file :\n");
\r
732 System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()).replaceAll("") + "\n");
\r
734 System.out.println("Parsed file.\n");
\r
735 System.out.println("Default output type for original input.\n");
\r
736 System.out.println(trf.print());
\r
737 System.out.println("Without bootstraps.\n");
\r
738 System.out.println(trf.print(false));
\r
739 System.out.println("Without distances.\n");
\r
740 System.out.println(trf.print(true, false));
\r
741 System.out.println("Without bootstraps but with distanecs.\n");
\r
742 System.out.println(trf.print(false, true));
\r
743 System.out.println("Without bootstraps or distanecs.\n");
\r
744 System.out.println(trf.print(false, false));
\r
745 System.out.println("With bootstraps and with distances.\n");
\r
746 System.out.println(trf.print(true, true));
\r
747 } catch (java.io.IOException e) {
\r
748 System.err.println("Exception\n" + e);
\r
749 e.printStackTrace();
\r
753 * Search for leaf nodes.
\r
755 * @param node root node to search from
\r
756 * @param leaves Vector of leaves to add leaf node objects too.
\r
758 * @return Vector of leaf nodes on binary tree
\r
760 public Vector findLeaves(SequenceNode node, Vector leaves)
\r
767 if ( (node.left() == null) && (node.right() == null)) // Interior node detection
\r
769 leaves.addElement(node);
\r
775 /* TODO: Identify internal nodes... if (node.isSequenceLabel())
\r
777 leaves.addElement(node);
\r
779 findLeaves( (SequenceNode) node.left(), leaves);
\r
780 findLeaves( (SequenceNode) node.right(), leaves);
\r
787 * make tree node vector from a newick tree structure with associated vamsas objects
\r
791 public Treenode[] makeTreeNodes() {
\r
792 Vector leaves = new Vector();
\r
793 findLeaves(root, leaves);
\r
794 Vector tnv = new Vector();
\r
795 Enumeration l = leaves.elements();
\r
796 Hashtable nodespecs = new Hashtable();
\r
797 while (l.hasMoreElements())
\r
799 BinaryNode tnode = (BinaryNode) l.nextElement();
\r
800 if (tnode instanceof SequenceNode)
\r
802 if (!((SequenceNode) tnode).isPlaceholder())
\r
804 Object assocseq = ((SequenceNode) tnode).element();
\r
805 if (assocseq instanceof Vobject)
\r
807 Vobject vobj = (Vobject) assocseq;
\r
810 Treenode node = new Treenode();
\r
811 node.setNodespec(makeNodeSpec(nodespecs, tnode));
\r
812 node.setName(tnode.getName());
\r
813 Vref vr = new Vref();
\r
816 tnv.addElement(node);
\r
820 System.err.println("WARNING: Unassociated treeNode "+tnode.element().toString()+" "
\r
821 +((tnode.getName()!=null) ? " label "+tnode.getName() : ""));
\r
829 Treenode[] tn = new Treenode[tnv.size()];
\r
833 return new Treenode[] {};
\r
835 private String makeNodeSpec(Hashtable nodespecs, BinaryNode tnode)
\r
837 String nname = new String(tnode.getName());
\r
838 Integer nindx = (Integer) nodespecs.get(nname);
\r
841 nindx = new Integer(1);
\r
843 nname = nindx.toString()+" "+nname;
\r
847 * call to match up Treenode specs to NJTree parsed from document object.
\r
851 * as returned from NJTree.findLeaves( .., ..) ..
\r
854 private BinaryNode findNodeSpec(String nodespec, Vector leaves)
\r
857 String nspec = nodespec.substring(nodespec.indexOf(' ')+1);
\r
858 String oval = nodespec.substring(0, nodespec.indexOf(' '));
\r
860 occurence = new Integer(oval).intValue();
\r
862 catch (Exception e)
\r
864 System.err.println("Invalid nodespec '"+nodespec+"'");
\r
867 BinaryNode bn = null;
\r
870 Enumeration en = leaves.elements();
\r
871 while (en.hasMoreElements() && nocc<occurence)
\r
873 bn = (BinaryNode) en.nextElement();
\r
874 if (bn instanceof SequenceNode && bn.getName().equals(nspec))
\r
884 * re-decorate the newick node representation with the VorbaId of an object mapped by its corresponding TreeNode.
\r
885 * Note: this only takes the first object in the first set of references.
\r
887 * @return vector of mappings { treenode, SequenceNode, Vobject for VorbaId on sequence node }
\r
889 public Vector attachTreeMap(Treenode[] tn)
\r
891 if (root!=null || tn==null)
\r
893 Vector leaves = new Vector();
\r
894 Vector nodemap=new Vector();
\r
895 findLeaves(root, leaves);
\r
896 int sz = tn.length;
\r
901 Treenode node = tn[i++];
\r
902 BinaryNode mappednode = findNodeSpec(node.getNodespec(),leaves);
\r
903 if (mappednode!=null && mappednode instanceof SequenceNode) {
\r
904 SequenceNode leaf = (SequenceNode) leaves.elementAt(i++);
\r
905 // check if we can make the specified association
\r
906 Vobject noderef = null;
\r
908 while (noderef==null && vrf<node.getVrefCount())
\r
910 if (refv<node.getVref(vrf).getRefsCount())
\r
912 Object ref = node.getVref(vrf).getRefs(refv++);
\r
913 if (ref instanceof Vobject)
\r
915 noderef = (Vobject) ref;
\r
924 nodemap.addElement(new Object[] { node, leaf, noderef });
\r
925 leaf.setElement(noderef);
\r
926 leaf.setPlaceholder(false);
\r
928 leaf.setPlaceholder(true);
\r