/* * This file is part of the Vamsas Client version 0.1. * Copyright 2009 by Jim Procter, Iain Milne, Pierre Marguerite, * Andrew Waterhouse and Dominik Lindner. * * Earlier versions have also been incorporated into Jalview version 2.4 * since 2008, and TOPALi version 2 since 2007. * * The Vamsas Client is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The Vamsas Client is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the Vamsas Client. If not, see . */ // NewickFile.java // Tree I/O // http://evolution.genetics.washington.edu/phylip/newick_doc.html // TODO: Implement Basic NHX tag parsing and preservation // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html // TODO: Extended SequenceNodeI to hold parsed NHX strings package uk.ac.vamsas.objects.utils.trees; import java.io.*; import java.util.Enumeration; import java.util.Hashtable; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import uk.ac.vamsas.client.Vobject; import uk.ac.vamsas.objects.core.Treenode; import uk.ac.vamsas.objects.core.Vref; /** * DOCUMENT ME! * * @author $author$ * @version $Revision: 1.12 $ */ public class NewickFile { SequenceNode root; private boolean HasBootstrap = false; private boolean HasDistances = false; private boolean RootHasDistance = false; // File IO Flags boolean ReplaceUnderscores = true; boolean printRootInfo = true; // in case root has anything to preserve private Pattern[] NodeSafeName = new Pattern[] { Pattern.compile("[\\[,:'()]"), // test for requiring quotes Pattern.compile("'"), // escaping quote characters Pattern.compile("\\s") // unqoted whitespace transformation }; char QuoteChar = '\''; String newickFile = null; /** * Creates a new NewickFile object * * @param inStr * Newick style tree string * * @throws IOException * if string is not a valid newick file */ public NewickFile(String inStr) throws IOException { newickFile = inStr; parse(); } public NewickFile(File inFile) throws IOException { errormessage = "Problem's reading file " + inFile; dataIn = new java.io.BufferedReader(new InputStreamReader( new java.io.FileInputStream(inFile))); parse(); } /** * Creates a new NewickFile object. * * @param newtree * DOCUMENT ME! */ public NewickFile(SequenceNode newtree) { root = newtree; } /** * Creates a new NewickFile object. * * @param newtree * DOCUMENT ME! * @param bootstrap * DOCUMENT ME! */ public NewickFile(SequenceNode newtree, boolean bootstrap) { HasBootstrap = bootstrap; root = newtree; } /** * Creates a new NewickFile object. * * @param newtree * DOCUMENT ME! * @param bootstrap * DOCUMENT ME! * @param distances * DOCUMENT ME! */ public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances) { root = newtree; HasBootstrap = bootstrap; HasDistances = distances; } /** * Creates a new NewickFile object. * * @param newtree * DOCUMENT ME! * @param bootstrap * DOCUMENT ME! * @param distances * DOCUMENT ME! * @param rootdistance * DOCUMENT ME! */ public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances, boolean rootdistance) { root = newtree; HasBootstrap = bootstrap; HasDistances = distances; RootHasDistance = rootdistance; } /** * DOCUMENT ME! * * @param Error * Message prefix * @param Er * Message qualifier (ie the incorrect data) * @param r * range of characters either side to dump * @param p * character position * @param s * newick string being parsed * * @return composed error message */ private String ErrorStringrange(String Error, String Er, int r, int p, String s) { return ((Error == null) ? "" : Error) + Er + " at position " + p + " ( " + s.substring(((p - r) < 0) ? 0 : (p - r), ((p + r) > s.length()) ? s .length() : (p + r)) + " )\n"; } /** * * @return true if tree has bootstrap values */ public boolean HasBootstrap() { return HasBootstrap; } /** * * @return true if tree has distances on branches */ public boolean HasDistances() { return HasDistances; } /** * * @return true if root has a stem distance */ public boolean HasRootDistance() { return RootHasDistance; } /* * hacked out of jalview code */ boolean error; String errormessage; java.io.BufferedReader dataIn = null; public String nextLine() throws IOException { if (dataIn == null && newickFile == null) throw new IOException( "IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string."); if (dataIn == null) { dataIn = new BufferedReader(new StringReader(newickFile)); error = false; } if (!error) return dataIn.readLine(); throw new IOException("Invalid Source Stream:" + errormessage); } /** * call this to convert the newick string into a binary node linked tree Note: * this is automatically called by the constructors, so you normally wouldn't * need to use this. * * @throws IOException * if the newick string cannot be parsed. */ public void parse() throws IOException { String nf; if (newickFile == null) { // fill nf with complete tree file StringBuffer file = new StringBuffer(); while ((nf = nextLine()) != null) { file.append(nf); } nf = file.toString(); } else { nf = newickFile; } root = new SequenceNode(); SequenceNode realroot = null; SequenceNode c = root; int d = -1; int cp = 0; // int flen = nf.length(); String Error = null; String nodename = null; float DefDistance = (float) 0.001; // @param Default distance for a node - // very very small int DefBootstrap = -1; // @param Default bootstrap for a node float distance = DefDistance; int bootstrap = DefBootstrap; boolean ascending = false; // flag indicating that we are leaving the // current node Pattern majorsyms = Pattern.compile("[(\\['),;]"); Matcher mjsyms = majorsyms.matcher(nf); char schar; int nextcp = 0; int ncp = cp; while (mjsyms.find(cp) && (Error == null)) { int fcp = mjsyms.start(); switch (schar = nf.charAt(fcp)) { case '(': // ascending should not be set // New Internal node if (ascending) { Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf); continue; } ; d++; if (c.right() == null) { c.setRight(new SequenceNode(null, c, null, DefDistance, DefBootstrap, false)); c = (SequenceNode) c.right(); } else { if (c.left() != null) { // Dummy node for polytomy - keeps c.left free for new node SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true); tmpn.SetChildren(c.left(), c.right()); c.setRight(tmpn); } c.setLeft(new SequenceNode(null, c, null, DefDistance, DefBootstrap, false)); c = (SequenceNode) c.left(); } if (realroot == null) { realroot = c; } nodename = null; distance = DefDistance; bootstrap = DefBootstrap; cp = fcp + 1; break; // Deal with quoted fields case '\'': Matcher qnodename = Pattern.compile("([^']|'')+'").matcher(nf); if (qnodename.find(fcp)) { nodename = new String(qnodename.group(1)); cp = qnodename.end(0); } else { Error = ErrorStringrange(Error, "Unterminated quotes for nodename", 7, fcp, nf); } break; default: // Reached termininating root node label. if (schar == ';' && d != -1) { Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d + ")", 7, fcp, nf); } // Skip Comment or structured/extended NH format info if (schar == '[') { if ((nextcp = nf.indexOf(']', fcp)) > -1) { // verified that comment is properly terminated. // now skip the comment field nextcp++; break; // go and search for the next node separator, leaving ncp at // beginning of node info } else { Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf); nextcp = 0; break; } } // Parse simpler field strings from substring between ncp and node // separator String fstring = nf.substring(ncp, fcp); // extract any comments from the nodeinfo. while (fstring.indexOf(']') > -1) { int cstart = fstring.indexOf('['); int cend = fstring.indexOf(']'); String comment = fstring.substring(cstart + 1, cend); // TODO: put // this // somewhere ? fstring = fstring.substring(0, cstart) + fstring.substring(cend + 1); } Matcher uqnodename = Pattern.compile("^([^' :;\\](),]+).*").matcher( fstring); if (uqnodename.matches() && ((uqnodename.start(1) == 0) || (fstring.charAt(uqnodename .start(1) - 1) != ':'))) // JBPNote HACK! { if (nodename == null) { if (ReplaceUnderscores) { nodename = uqnodename.group(1).replace('_', ' '); } else { nodename = uqnodename.group(1); } } else { Error = ErrorStringrange(Error, "File has broken algorithm - overwritten nodename", 10, fcp, nf); } } Matcher nbootstrap = Pattern.compile("\\s*([+0-9]+)\\s*:.*").matcher( fstring); if (nbootstrap.matches()) { if (nodename != null && nbootstrap.group(1).equals(nodename)) { nodename = null; // empty nodename - only bootstrap value } if ((nodename == null || nodename.length() == 0) || nbootstrap.start(1) >= uqnodename.end(1)) { try { bootstrap = (new Integer(nbootstrap.group(1))).intValue(); HasBootstrap = true; } catch (Exception e) { Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4, ncp + nbootstrap.start(0), nf); } } } Matcher ndist = Pattern.compile(".*:([-0-9Ee.+]+)").matcher(fstring); boolean nodehasdistance = false; if (ndist.matches()) { try { distance = (new Float(ndist.group(1))).floatValue(); HasDistances = true; nodehasdistance = true; } catch (Exception e) { Error = ErrorStringrange(Error, "Can't parse node distance value", 7, ncp + ndist.start(0), nf); } } if (ascending) { // Write node info here c.setName(nodename); // Trees without distances still need a render distance c.dist = (HasDistances) ? distance : DefDistance; // be consistent for internal bootstrap defaults too c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap); if (c == realroot) { RootHasDistance = nodehasdistance; // JBPNote This is really // UGLY!!! Ensure root node gets // its given distance } } else { // Find a place to put the leaf SequenceNode newnode = new SequenceNode(null, c, nodename, (HasDistances) ? distance : DefDistance, (HasBootstrap) ? bootstrap : DefBootstrap, false); if (c.right() == null) { c.setRight(newnode); } else { if (c.left() == null) { c.setLeft(newnode); } else { // Insert a dummy node for polytomy // dummy nodes have distances SequenceNode newdummy = new SequenceNode(null, c, null, (HasDistances ? 0 : DefDistance), 0, true); newdummy.SetChildren(c.left(), newnode); c.setLeft(newdummy); } } } if (ascending) { // move back up the tree from preceding closure c = c.AscendTree(); if ((d > -1) && (c == null)) { Error = ErrorStringrange( Error, "File broke algorithm: Lost place in tree (is there an extra ')' ?)", 7, fcp, nf); } } if (nf.charAt(fcp) == ')') { d--; ascending = true; } else { if (nf.charAt(fcp) == ',') { if (ascending) { ascending = false; } else { // Just advance focus, if we need to if ((c.left() != null) && (!c.left().isLeaf())) { c = (SequenceNode) c.left(); } } } } // Reset new node properties to obvious fakes nodename = null; distance = DefDistance; bootstrap = DefBootstrap; } // Advance character pointers if necessary if (nextcp == 0) { ncp = cp = fcp + 1; } else { cp = nextcp; nextcp = 0; } } if (Error != null) { throw (new IOException("NewickFile: " + Error + "\n")); } root = (SequenceNode) root.right().detach(); // remove the imaginary root. if (!RootHasDistance) { root.dist = (HasDistances) ? 0 : DefDistance; } } /** * DOCUMENT ME! * * @return DOCUMENT ME! */ public SequenceNode getTree() { return root; } public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames( String[] names, Vobject[] boundObjects) { // todo! // also - need to reconstruct a names object id mapping (or BInaryNode) // mapping for the parsed tree file return null; } /** * Generate a newick format tree according to internal flags for bootstraps, * distances and root distances. * * @return new hampshire tree in a single line */ public String print() { synchronized (this) { StringBuffer tf = new StringBuffer(); print(tf, root); return (tf.append(";").toString()); } } /** * * * Generate a newick format tree according to internal flags for distances and * root distances and user specificied writing of bootstraps. * * @param withbootstraps * controls if bootstrap values are explicitly written. * * @return new hampshire tree in a single line */ public String print(boolean withbootstraps) { synchronized (this) { boolean boots = this.HasBootstrap; this.HasBootstrap = withbootstraps; String rv = print(); this.HasBootstrap = boots; return rv; } } /** * * Generate newick format tree according to internal flags for writing root * node distances. * * @param withbootstraps * explicitly write bootstrap values * @param withdists * explicitly write distances * * @return new hampshire tree in a single line */ public String print(boolean withbootstraps, boolean withdists) { synchronized (this) { boolean dists = this.HasDistances; this.HasDistances = withdists; String rv = print(withbootstraps); this.HasDistances = dists; return rv; } } /** * Generate newick format tree according to user specified flags * * @param withbootstraps * explicitly write bootstrap values * @param withdists * explicitly write distances * @param printRootInfo * explicitly write root distance * * @return new hampshire tree in a single line */ public String print(boolean withbootstraps, boolean withdists, boolean printRootInfo) { synchronized (this) { boolean rootinfo = printRootInfo; this.printRootInfo = printRootInfo; String rv = print(withbootstraps, withdists); this.printRootInfo = rootinfo; return rv; } } /** * DOCUMENT ME! * * @return DOCUMENT ME! */ char getQuoteChar() { return QuoteChar; } /** * DOCUMENT ME! * * @param c * DOCUMENT ME! * * @return DOCUMENT ME! */ char setQuoteChar(char c) { char old = QuoteChar; QuoteChar = c; return old; } /** * DOCUMENT ME! * * @param name * DOCUMENT ME! * * @return DOCUMENT ME! */ private String nodeName(String name) { if (NodeSafeName[0].matcher(name).find()) { return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + QuoteChar; // quite } else { return NodeSafeName[2].matcher(name).replaceAll("_"); // whitespace } } /** * DOCUMENT ME! * * @param c * DOCUMENT ME! * * @return DOCUMENT ME! */ private String printNodeField(SequenceNode c) { return // c.getNewickNodeName() ((c.getName() == null) ? "" : nodeName(c.getName())) + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (((c.getName() == null) ? " " : "") + c.getBootstrap()) : "") : "") + ((HasDistances) ? (":" + c.dist) : ""); } /** * DOCUMENT ME! * * @param root * DOCUMENT ME! * * @return DOCUMENT ME! */ private String printRootField(SequenceNode root) { return (printRootInfo) ? (((root.getName() == null) ? "" : nodeName(root .getName())) + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? ((root.getName() != null ? " " : "") + root.getBootstrap()) : "") : "") + ((RootHasDistance) ? (":" + root.dist) : "")) : ""; } // Non recursive call deals with root node properties public void print(StringBuffer tf, SequenceNode root) { if (root != null) { if (root.isLeaf() && printRootInfo) { tf.append(printRootField(root)); } else { if (root.isDummy()) { _print(tf, (SequenceNode) root.right()); _print(tf, (SequenceNode) root.left()); } else { tf.append("("); _print(tf, (SequenceNode) root.right()); if (root.left() != null) { tf.append(","); } _print(tf, (SequenceNode) root.left()); tf.append(")" + printRootField(root)); } } } } // Recursive call for non-root nodes public void _print(StringBuffer tf, SequenceNode c) { if (c != null) { if (c.isLeaf()) { tf.append(printNodeField(c)); } else { if (c.isDummy()) { _print(tf, (SequenceNode) c.left()); if (c.left() != null) { tf.append(","); } _print(tf, (SequenceNode) c.right()); } else { tf.append("("); _print(tf, (SequenceNode) c.right()); if (c.left() != null) { tf.append(","); } _print(tf, (SequenceNode) c.left()); tf.append(")" + printNodeField(c)); } } } } // Test public static void main(String[] args) { try { if (args == null || args.length != 1) { System.err .println("Takes one argument - file name of a newick tree file."); System.exit(0); } File fn = new File(args[0]); StringBuffer newickfile = new StringBuffer(); BufferedReader treefile = new BufferedReader(new FileReader(fn)); String l; while ((l = treefile.readLine()) != null) { newickfile.append(l); } treefile.close(); System.out.println("Read file :\n"); NewickFile trf = new NewickFile(fn); // trf.parse(); System.out.println("Original file :\n"); System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()) .replaceAll("") + "\n"); System.out.println("Parsed file.\n"); System.out.println("Default output type for original input.\n"); System.out.println(trf.print()); System.out.println("Without bootstraps.\n"); System.out.println(trf.print(false)); System.out.println("Without distances.\n"); System.out.println(trf.print(true, false)); System.out.println("Without bootstraps but with distanecs.\n"); System.out.println(trf.print(false, true)); System.out.println("Without bootstraps or distanecs.\n"); System.out.println(trf.print(false, false)); System.out.println("With bootstraps and with distances.\n"); System.out.println(trf.print(true, true)); System.out.println("leaves.\n"); Vector lvs = new Vector(); trf.findLeaves(trf.root, lvs); Enumeration lv = lvs.elements(); while (lv.hasMoreElements()) { BinaryNode leave = (BinaryNode) lv.nextElement(); if (leave.getName() != null) { System.out.println("Node:'" + leave.getName() + "'"); } } } catch (java.io.IOException e) { System.err.println("Exception\n" + e); e.printStackTrace(); } } /** * Search for leaf nodes. * * @param node * root node to search from * @param leaves * Vector of leaves to add leaf node objects too. * * @return Vector of leaf nodes on binary tree */ public Vector findLeaves(SequenceNode node, Vector leaves) { if (node == null) { return leaves; } if ((node.left() == null) && (node.right() == null)) // Interior node // detection { leaves.addElement(node); return leaves; } else { /* * TODO: Identify internal nodes... if (node.isSequenceLabel()) { * leaves.addElement(node); } */ findLeaves((SequenceNode) node.left(), leaves); findLeaves((SequenceNode) node.right(), leaves); } return leaves; } /** * make tree node vector from a newick tree structure with associated vamsas * objects * * @param ntree * @return Treenode definitions for nodes with associated objects */ public Treenode[] makeTreeNodes() { return makeTreeNodes(true); } /** * make treenode vector for a parsed tree with/out leaf node associations * * @param ignoreplaceholders * if true means only associated nodes are returned * @return treenode vector for associated or all leaves */ public Treenode[] makeTreeNodes(boolean ignoreplaceholders) { Vector leaves = new Vector(); findLeaves(root, leaves); Vector tnv = new Vector(); Enumeration l = leaves.elements(); Hashtable nodespecs = new Hashtable(); while (l.hasMoreElements()) { BinaryNode tnode = (BinaryNode) l.nextElement(); if (tnode instanceof SequenceNode) { if (!(ignoreplaceholders && ((SequenceNode) tnode).isPlaceholder())) { Object assocseq = ((SequenceNode) tnode).element(); if (assocseq instanceof Vobject) { Vobject vobj = (Vobject) assocseq; if (vobj != null) { Treenode node = new Treenode(); node.setNodespec(makeNodeSpec(nodespecs, tnode)); node.setName(tnode.getName()); Vref vr = new Vref(); vr.addRefs(vobj); node.addVref(vr); tnv.addElement(node); } else { System.err.println("WARNING: Unassociated treeNode " + tnode.element().toString() + " " + ((tnode.getName() != null) ? " label " + tnode.getName() : "")); } } } } } if (tnv.size() > 0) { Treenode[] tn = new Treenode[tnv.size()]; tnv.copyInto(tn); return tn; } return new Treenode[] {}; } private String makeNodeSpec(Hashtable nodespecs, BinaryNode tnode) { String nname = new String(tnode.getName()); Integer nindx = (Integer) nodespecs.get(nname); if (nindx == null) { nindx = new Integer(1); } nname = nindx.toString() + " " + nname; return nname; } /** * call to match up Treenode specs to NJTree parsed from document object. * * @param nodespec * @param leaves * as returned from NJTree.findLeaves( .., ..) .. * @return */ private BinaryNode findNodeSpec(String nodespec, Vector leaves) { int occurence = -1; String nspec = nodespec.substring(nodespec.indexOf(' ') + 1); String oval = nodespec.substring(0, nodespec.indexOf(' ')); try { occurence = new Integer(oval).intValue(); } catch (Exception e) { System.err.println("Invalid nodespec '" + nodespec + "'"); return null; } BinaryNode bn = null; int nocc = 0; Enumeration en = leaves.elements(); while (en.hasMoreElements() && nocc < occurence) { bn = (BinaryNode) en.nextElement(); if (bn instanceof SequenceNode && bn.getName().equals(nspec)) { --occurence; } else bn = null; } return bn; } /** * * re-decorate the newick node representation with the VorbaId of an object * mapped by its corresponding TreeNode. Note: this only takes the first * object in the first set of references. * * @param tn * @return vector of mappings { treenode, SequenceNode, Vobject for VorbaId on * sequence node } */ public Vector attachTreeMap(Treenode[] tn) { if (root != null || tn == null) return null; Vector leaves = new Vector(); Vector nodemap = new Vector(); findLeaves(root, leaves); int sz = tn.length; int i = 0; while (i < sz) { Treenode node = tn[i++]; BinaryNode mappednode = findNodeSpec(node.getNodespec(), leaves); if (mappednode != null && mappednode instanceof SequenceNode) { SequenceNode leaf = (SequenceNode) leaves.elementAt(i++); // check if we can make the specified association Vobject noderef = null; int vrf = 0, refv = 0; while (noderef == null && vrf < node.getVrefCount()) { if (refv < node.getVref(vrf).getRefsCount()) { Object ref = node.getVref(vrf).getRefs(refv++); if (ref instanceof Vobject) { noderef = (Vobject) ref; } } else { refv = 0; vrf++; } } if (noderef != null) { nodemap.addElement(new Object[] { node, leaf, noderef }); leaf.setElement(noderef); leaf.setPlaceholder(false); } else { leaf.setPlaceholder(true); } } } return nodemap; } }