From d99c9d2451c3d45e43f6fd87ca2f18f576537098 Mon Sep 17 00:00:00 2001 From: jprocter Date: Thu, 17 Mar 2005 16:13:18 +0000 Subject: [PATCH] Newick/New Hampshire tree format parser and generator. --- src/jalview/io/NewickFile.java | 490 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 490 insertions(+) create mode 100755 src/jalview/io/NewickFile.java diff --git a/src/jalview/io/NewickFile.java b/src/jalview/io/NewickFile.java new file mode 100755 index 0000000..997903c --- /dev/null +++ b/src/jalview/io/NewickFile.java @@ -0,0 +1,490 @@ +// NewickFile.java +// Tree I/O +// http://evolution.genetics.washington.edu/phylip/newick_doc.html +package jalview.io; + +import java.io.*; +import java.util.*; +import jalview.datamodel.*; + +public class NewickFile extends FileParse +{ + SequenceNode root; + + private boolean HasBootstrap = false; + private boolean HasDistances = false; + private boolean RootHasDistance = false; + private String ErrorStringrange(String Error, String Er, int r, int p, String s) { + return ((Error==null) ? "" : Error) + + Er + + " at position "+p+" ( " + + s.substring((p-r)<0 ? 0 : (p-r), + (p+r)>s.length() ? s.length() : (p+r)) + + " )\n"; + } + + // @tree annotations + // These are set automatically by the reader + public boolean HasBootstrap() { + return HasBootstrap; + } + + public boolean HasDistances() { + return HasDistances; + } + public NewickFile(String inStr) throws IOException + { + super(inStr, "Paste"); + } + + public NewickFile(String inFile, String type) + throws IOException + { + + super(inFile, type); + } + + public void parse() throws IOException + { + String nf; + + { // fill nf with complete tree file + StringBuffer file = new StringBuffer(); + while ( (nf = nextLine()) != null) + { + file.append(nf); + } + nf = file.toString(); + } + + root = new SequenceNode(); + SequenceNode c = root; + int d = -1; + int cp = 0; + int flen = nf.length(); + String Error = null; + String nodename = null; + float distance=-99999; + int bootstrap=-99999; + boolean ascending = false; // flag indicating that we are leaving the current node + com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex("[(\\['),;]"); + while (majorsyms.searchFrom(nf, cp) && Error==null) { + int fcp = majorsyms.matchedFrom(); + switch (nf.charAt(fcp)) { + + case '[': // Comment or structured/extended NH format info + com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex("]"); + if (comment.searchFrom(nf, fcp)) + { + // Skip the comment field + cp = 1 + comment.matchedFrom(); + } + else + { + Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf); + } + ; + break; + + case '(': + + // ascending should not be set + // New Internal node + if (ascending) + { + Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf); + continue; + } + ; + d++; + if (c.right() == null) + { + c.setRight(new SequenceNode(null, c, 0, null)); + c = (SequenceNode) c.right(); + } + else + { + if (c.left() != null) + { + // Dummy node for polytomy - keeps c.left free for new node + SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true); + tmpn.SetChildren(c.left(), c.right()); + c.setRight(tmpn); + } + c.setLeft(new SequenceNode(null, c, 0, null)); + c = (SequenceNode) c.left(); + } + nodename = null; + distance = -99999; + bootstrap = -99999; + cp = fcp + 1; + break; + + // Deal with quoted fields + case '\'': + com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex( + "([^']|'')+'"); + if (qnodename.searchFrom(nf, fcp)) + { + int nl = qnodename.stringMatched().length(); + nodename = new String(qnodename.stringMatched().substring(0, nl - 1)); + cp = fcp + nl + 1; + } + else + { + Error = ErrorStringrange(Error, "Unterminated quotes for nodename", + 7, fcp, nf); + } + break; + + case ';': + if (d != -1) + { + Error = ErrorStringrange(Error, + "Wayward semicolon (depth=" + d + ")", 7, + fcp, nf); + } + // cp advanced at the end of default + + default: + + // Parse simpler field strings + String fstring = nf.substring(cp, fcp); + com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex( + "\\b([^' :;\\](),]+)"); + com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex( + "\\S+([0-9+]+)\\S*:"); + com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex( + ":([-0-9.+]+)"); + if (uqnodename.search(fstring) + && (uqnodename.matchedFrom(1) == 0 + || fstring.charAt(uqnodename.matchedFrom(1) - 1) != ':')) // JBPNote HACK! + + { + if (nodename == null) + { + nodename = uqnodename.stringMatched(1).replace('_', ' '); + } + else + { + Error = ErrorStringrange(Error, + "File has broken algorithm - overwritten nodename", + 10, fcp, + nf); + } + } + if (nbootstrap.search(fstring) + && + nbootstrap.matchedFrom(1) + > uqnodename.matchedFrom(1) + uqnodename.stringMatched().length()) + { + try + { + bootstrap = (new Integer(nbootstrap.stringMatched(1))).intValue(); + HasBootstrap = true; + } + catch (Exception e) + { + Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4, + cp + nbootstrap.matchedFrom(), nf); + } + } + + if (ndist.search(fstring)) + { + try + { + distance = (new Float(ndist.stringMatched(1))).floatValue(); + HasDistances = true; + if (c.parent()==root) { + RootHasDistance = true; + } + } + catch (Exception e) + { + Error = ErrorStringrange(Error, "Can't parse node distance value", + 7, cp + ndist.matchedFrom(), nf); + } + } + + if (ascending) + { + // Write node info here + c.setName(nodename); + c.dist = (HasDistances) ? distance : 0; + c.setBootstrap((HasBootstrap) ? bootstrap : 0); + } + else + { + // Find a place to put the leaf + SequenceNode newnode = new SequenceNode(null, c, + (HasDistances) ? distance : 0, + nodename); + + newnode.setBootstrap(HasBootstrap ? bootstrap : 0); + + if (c.right() == null) + { + c.setRight(newnode); + } + else + { + if (c.left() == null) + { + c.setLeft(newnode); + } + else + { + // Insert a dummy node for polytomy + SequenceNode newdummy = new SequenceNode(null, c, null, 0, 0, true); + newdummy.SetChildren(c.left(), newnode); + c.setLeft(newdummy); + } + } + } + if (ascending) + { + // move back up the tree from preceding closure + c = c.AscendTree(); + if (d > -1 && c == null) + { + Error = ErrorStringrange(Error, + "File broke algorithm: Lost place in tree (is there an extra ')' ?)", + 7, fcp, nf); + } + } + + if (nf.charAt(fcp) == ')') { + d--; + ascending = true; + } else { + if (nf.charAt(fcp) == ',') { + if (ascending) { + ascending = false; + } else { + // Just advance focus, if we need to + if (c.left()!=null && (!c.left().isLeaf())) { + c = (SequenceNode) c.left(); + } + } + } // else : We do nothing if ';' is encountered. + } + // Reset new node properties to obvious fakes + nodename = null; + distance = -99999; + bootstrap = -99999; + + cp=fcp+1; + } + } + + if (Error!=null) { + throw(new IOException("NewickFile: "+Error+"\n")); + } + + root = (SequenceNode) root.right().detach(); // remove the imaginary root. + } + + public NewickFile(SequenceNode newtree) { + root = newtree; + } + + public NewickFile(SequenceNode newtree, boolean bootstrap) { + HasBootstrap = bootstrap; + root=newtree; + } + public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances) { + root = newtree; + HasBootstrap = bootstrap; + HasDistances = distances; + } + + public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances, boolean rootdistance) { + root = newtree; + HasBootstrap = bootstrap; + HasDistances = distances; + RootHasDistance = rootdistance; + } + + public SequenceNode getTree() { + return root; + } + + public String print() { + synchronized (this) { + StringBuffer tf = new StringBuffer(); + print(tf, root); + return (tf.append(";").toString()); + } + } + + public String print(boolean withbootstraps) { + synchronized(this) { + boolean boots = this.HasBootstrap; + this.HasBootstrap = withbootstraps; + String rv = print(); + this.HasBootstrap = boots; + return rv; + } + } + + public String print(boolean withbootstraps, boolean withdists) { + synchronized(this) { + boolean dists = this.HasDistances; + this.HasDistances = withdists; + String rv = print(withbootstraps); + this.HasDistances = dists; + return rv; + } + } + boolean printRootInfo = false; + + + public String print(boolean withbootstraps, boolean withdists, boolean printRootInfo) { + synchronized(this) { + boolean rootinfo = printRootInfo; + this.printRootInfo = printRootInfo; + String rv = print(withbootstraps, withdists); + this.printRootInfo = rootinfo; + return rv; + } + } + private com.stevesoft.pat.Regex[] NodeSafeName = + new com.stevesoft.pat.Regex[] + { new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for requiring quotes + new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote characters + new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unqoted whitespace transformation + }; + char QuoteChar='\''; + char getQuoteChar() { + return QuoteChar; + } + + char setQuoteChar(char c) { + char old = QuoteChar; + QuoteChar = c; + return old; + } + + private String nodeName(String name) { + if (NodeSafeName[0].search(name)) { + return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar; + } else { + return NodeSafeName[2].replaceAll(name); + } + } + private String printNodeField(SequenceNode c) { + + return ( (c.getName() == null) ? "" + : nodeName(c.getName())) + + ( (HasBootstrap) + ? ( (c.getBootstrap() > -1) + ? " " + c.getBootstrap() + : "") + : "") + + ( (HasDistances) + ? ":" + c.dist : ""); + } + private String printRootField(SequenceNode root) { + + return (printRootInfo) + ? (( (root.getName() == null) ? "" + : nodeName(root.getName())) + + ( (HasBootstrap) + ? ( (root.getBootstrap() > -1) + ? " " + root.getBootstrap() + : "") + : "") + + ( (RootHasDistance) + ? ":" + root.dist : "")) + : ""; + } + +// Non recursive call deals with root node properties + public void print(StringBuffer tf, SequenceNode root) { + if (root!=null) { + if (root.isLeaf() && printRootInfo) { + tf.append(printRootField(root)); + } else { + if (root.isDummy()) { + _print(tf, (SequenceNode) root.right()); + _print(tf, (SequenceNode) root.left()); + } else { + tf.append("("); + _print(tf, (SequenceNode) root.right()); + if (root.left() != null) { + tf.append(","); + } + _print(tf, (SequenceNode) root.left()); + tf.append(")" + printRootField(root)); + } + } + } + } + + // Recursive call for non-root nodes + public void _print(StringBuffer tf, SequenceNode c) { + if (c!=null) { + if (c.isLeaf()) { + tf.append(printNodeField(c)); + } else { + if (c.isDummy()) { + _print(tf, (SequenceNode) c.right()); + _print(tf, (SequenceNode) c.left()); + } else { + tf.append("("); + _print(tf, (SequenceNode) c.right()); + if (c.left() != null) { + tf.append(","); + } + _print(tf, (SequenceNode) c.left()); + tf.append(")" + printNodeField(c)); + } + } + } + } + +// Test + public static void main(String[] args) +{ + try + { + File fn = new File(args[0]); + + StringBuffer newickfile = new StringBuffer(); + BufferedReader treefile = + new BufferedReader(new FileReader(fn)); + String l; + while ((l = treefile.readLine())!=null) { + newickfile.append(l); + } + treefile.close(); + System.out.println("Read file :\n"); + NewickFile trf = new NewickFile(args[0], "File"); + trf.parse(); + System.out.println("Original file :\n"); + com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", ""); + System.out.println(nonl.replaceAll(newickfile.toString())+"\n"); + + System.out.println("Parsed file.\n"); + System.out.println("Default output type for original input.\n"); + System.out.println(trf.print()); + System.out.println("Without bootstraps.\n"); + System.out.println(trf.print(false)); + System.out.println("Without distances.\n"); + System.out.println(trf.print(true,false)); + System.out.println("Without bootstraps but with distanecs.\n"); + System.out.println(trf.print(false, true)); + System.out.println("Without bootstraps or distanecs.\n"); + System.out.println(trf.print(false, false)); + System.out.println("With bootstraps and with distances.\n"); + System.out.println(trf.print(true, true)); + } + catch (java.io.IOException e) + { + System.out.println("Exception\n" + e); + e.printStackTrace(); + } +} + +} -- 1.7.10.2