2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
24 // TODO: Implement Basic NHX tag parsing and preservation
25 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
26 // TODO: Extended SequenceNodeI to hold parsed NHX strings
29 import java.io.BufferedReader;
31 import java.io.FileReader;
32 import java.io.IOException;
33 import java.util.Locale;
34 import java.util.StringTokenizer;
36 import com.stevesoft.pat.Regex;
38 import jalview.bin.Jalview;
39 import jalview.bin.Jalview.ExitCode;
40 import jalview.datamodel.BinaryNode;
41 import jalview.datamodel.SequenceNode;
42 import jalview.util.MessageManager;
45 * Parse a new hampshire style tree. Caveats: NHX files are NOT supported and the
46 * tree distances and topology are unreliable when they are parsed. TODO: on
47 * this: NHX codes are appended in comments beginning with &&NHX. The codes are
48 * given below (from http://www.phylosoft.org/forester/NHX.html): Element Type
49 * Description Corresponding phyloXML element (parent element in parentheses) no
50 * tag string name of this node/clade (MUST BE FIRST, IF ASSIGNED)
51 * <name>(<clade>) : decimal branch length to parent node (MUST BE SECOND, IF
52 * ASSIGNED) <branch_length>(<clade>) :GN= string gene name <name>(<sequence>)
53 * :AC= string sequence accession <accession>(<sequence>) :ND= string node
54 * identifier - if this is being used, it has to be unique within each phylogeny
55 * <node_id>(<clade>) :B= decimal confidence value for parent branch
56 * <confidence>(<clade>) :D= 'T', 'F', or '?' 'T' if this node represents a
57 * duplication event - 'F' if this node represents a speciation event, '?' if
58 * this node represents an unknown event (D= tag should be replaced by Ev= tag)
59 * n/a :Ev=duplications>speciations>gene losses>event type>duplication type int
60 * int int string string event (replaces the =D tag), number of duplication,
61 * speciation, and gene loss events, type of event (transfer, fusion, root,
62 * unknown, other, speciation_duplication_loss, unassigned) <events>(<clade>)
63 * :E= string EC number at this node <annotation>(<sequence>) :Fu= string
64 * function at this node <annotation>(<sequence>)
65 * :DS=protein-length>from>to>support>name>from>... int int int double string
66 * int ... domain structure at this node <domain_architecture>(<sequence>) :S=
67 * string species name of the species/phylum at this node <taxonomy>(<clade>)
68 * :T= integer taxonomy ID of the species/phylum at this node <id>(<taxonomy>)
69 * :W= integer width of parent branch <width>(<clade>) :C=rrr.ggg.bbb
70 * integer.integer.integer color of parent branch <color>(<clade>) :Co= 'Y' or
71 * 'N' collapse this node when drawing the tree (default is not to collapse) n/a
72 * :XB= string custom data associated with a branch <property>(<clade>) :XN=
73 * string custom data associated with a node <property>(<clade>) :O= integer
74 * orthologous to this external node n/a :SN= integer subtree neighbors n/a :SO=
75 * integer super orthologous (no duplications on paths) to this external node
81 public class NewickFile extends FileParse
// Output/parse flags: printNodeField consults HasBootstrap and HasDistances to
// decide whether a bootstrap value and a ":" + distance suffix are written.
85 private boolean HasBootstrap = false;
87 private boolean HasDistances = false;
// Consulted by printRootField and at the end of parse() for the root's own distance.
89 private boolean RootHasDistance = false;
// When true, parse() replaces '_' with ' ' in unquoted node names.
92 boolean ReplaceUnderscores = false;
// Gates the output produced by printRootField.
94 boolean printRootInfo = true;
// Used by nodeName(): [0] is searched to decide whether a name is wrapped in
// QuoteChar, [1] is applied to names that are quoted, [2] to names that are not.
96 private Regex[] NodeSafeName = new Regex[] {
97 new Regex().perlCode("m/[\\[,:'()]/"), // test for
100 new Regex().perlCode("s/'/''/"), // escaping quote
// NOTE(review): as written, "s/\\/w/_/" appears to target the literal text
// "/w" rather than whitespace - confirm whether \\s was intended.
102 new Regex().perlCode("s/\\/w/_/") // unquoted whitespace
// Character placed around names that nodeName() decides to quote.
106 char QuoteChar = '\'';
109 * Creates a new NewickFile object.
114 * @throws IOException
// Builds a parser over Newick text passed directly as a string: the text is
// handed to the superclass with DataSourceType.PASTE.
117 public NewickFile(String inStr) throws IOException
119 super(inStr, DataSourceType.PASTE);
123 * Creates a new NewickFile object.
130 * @throws IOException
// Builds a parser over the named source; both arguments are forwarded
// unchanged to the superclass constructor.
133 public NewickFile(String inFile, DataSourceType protocol)
136 super(inFile, protocol);
// Builds a parser from an existing FileParse source.
// NOTE(review): presumably delegates to the superclass with 'source' - confirm.
139 public NewickFile(FileParse source) throws IOException
145 * Creates a new NewickFile object.
// Wraps an already-built tree (no text is parsed by this constructor's signature).
// NOTE(review): presumably stores 'newtree' as the root to be printed - confirm.
150 public NewickFile(BinaryNode newtree)
156 * Creates a new NewickFile object.
// Wraps an existing tree and records the caller's bootstrap flag in HasBootstrap.
163 public NewickFile(SequenceNode newtree, boolean bootstrap)
165 HasBootstrap = bootstrap;
170 * Creates a new NewickFile object.
// Wraps an existing tree and records the caller's bootstrap and distance
// flags in HasBootstrap and HasDistances.
179 public NewickFile(BinaryNode newtree, boolean bootstrap,
183 HasBootstrap = bootstrap;
184 HasDistances = distances;
188 * Creates a new NewickFile object.
196 * @param rootdistance
// Wraps an existing tree and records all three output flags: bootstraps,
// distances, and whether the root carries its own distance.
199 public NewickFile(BinaryNode newtree, boolean bootstrap,
200 boolean distances, boolean rootdistance)
203 HasBootstrap = bootstrap;
204 HasDistances = distances;
205 RootHasDistance = rootdistance;
222 * @return DOCUMENT ME!
// Appends a new message to an accumulated error string. The result starts with
// the existing Error text (or "" when Error is null), followed by Er, the
// position p, and an excerpt of s.
224 private String ErrorStringrange(String Error, String Er, int r, int p,
227 return ((Error == null) ? "" : Error) + Er + " at position " + p + " ( "
// The excerpt spans p - r to p + r, clamped to the bounds of s.
228 + s.substring(((p - r) < 0) ? 0 : (p - r),
229 ((p + r) > s.length()) ? s.length() : (p + r))
234 // These are set automatically by the reader
// NOTE(review): presumably reports the HasBootstrap flag - confirm.
235 public boolean HasBootstrap()
243 * @return DOCUMENT ME!
// NOTE(review): presumably reports the HasDistances flag - confirm.
245 public boolean HasDistances()
// Reports the RootHasDistance flag, which parse() assigns from whether a
// distance was read for a node and the four-argument constructor sets directly.
250 public boolean HasRootDistance()
252 return RootHasDistance;
256 * parse the filesource as a newick file (new hampshire and/or extended)
258 * @throws IOException
259 * with a line number and character position for badly formatted NH
// Reads every line from nextLine() into a single string, then scans it for the
// structural characters matched by majorsyms, building SequenceNode objects
// beneath a temporary root. Problems are accumulated in Error via
// ErrorStringrange and reported as an IOException built from the
// "exception.newfile" message; a second IOException path reports
// "label.no_tree_read_in".
262 public void parse() throws IOException
266 { // fill nf with complete tree file
268 StringBuffer file = new StringBuffer();
270 while ((nf = nextLine()) != null)
275 nf = file.toString();
// Temporary root; replaced by root.right().detach() at the end of this method.
278 root = new SequenceNode();
280 BinaryNode realroot = null;
285 // int flen = nf.length();
288 String nodename = null;
289 String commentString2 = null; // comments after simple node props
// NOTE(review): the (float) cast means the stored double is the float
// approximation of 0.001 widened to double, not the double literal 0.001 -
// confirm this is intended.
291 double DefDistance = (float) 0.001; // @param Default distance for a node -
293 int DefBootstrap = -1; // @param Default bootstrap for a node
295 double distance = DefDistance;
296 int bootstrap = DefBootstrap;
298 boolean ascending = false; // flag indicating that we are leaving the
// Structural characters that drive the scan: ( [ ' ) , ;
301 Regex majorsyms = new Regex("[(\\['),;]");
305 boolean parsednodename = false;
// Each pass finds the next structural character at or after cp; the loop ends
// when none remains or as soon as an error has been recorded.
306 while (majorsyms.searchFrom(nf, cp) && (Error == null))
308 int fcp = majorsyms.matchedFrom();
310 switch (schar = nf.charAt(fcp))
314 // ascending should not be set
318 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
324 if (c.right() == null)
326 c.setRight(new SequenceNode(null, c, null, DefDistance,
327 DefBootstrap, false));
332 if (c.left() != null)
334 // Dummy node for polytomy - keeps c.left free for new node
335 BinaryNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
336 tmpn.SetChildren(c.left(), c.right());
340 c.setLeft(new SequenceNode(null, c, null, DefDistance,
341 DefBootstrap, false));
345 if (realroot == null)
351 distance = DefDistance;
352 bootstrap = DefBootstrap;
357 // Deal with quoted fields
// Quoted name: a quote, one or more of (non-quote | doubled quote), a quote.
360 Regex qnodename = new Regex("'([^']|'')+'");
362 if (qnodename.searchFrom(nf, fcp))
// nl counts the whole match including both enclosing quotes, which the
// substring(1, nl - 1) below strips.
364 int nl = qnodename.stringMatched().length();
365 nodename = new String(
366 qnodename.stringMatched().substring(1, nl - 1));
367 // unpack any escaped quotes ('' becomes ')
368 Regex xpandquotes = Regex.perlCode("s/''/'/");
369 String widernodename = xpandquotes.replaceAll(nodename);
370 nodename = widernodename;
371 // jump to after end of quoted nodename
372 nextcp = fcp + nl + 1;
373 parsednodename = true;
377 Error = ErrorStringrange(Error,
378 "Unterminated quotes for nodename", 7, fcp, nf);
388 Error = ErrorStringrange(Error,
389 "Wayward semicolon (depth=" + d + ")", 7, fcp, nf);
391 // cp advanced at the end of default
395 // node string contains Comment or structured/extended NH format info
397 * if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1)) { // will
398 * process in remains jalview.bin.Console.errPrintln("skipped text:
399 * '"+nf.substring(cp,fcp)+"'"); }
401 // verify termination.
402 Regex comment = new Regex("]");
403 if (comment.searchFrom(nf, fcp))
405 // Skip the comment field
406 nextcp = comment.matchedFrom() + 1;
407 warningMessage = "Tree file contained comments which may confuse input algorithm.";
410 // cp advanced at the end of default to nextcp, ncp is unchanged so
411 // any node info can be read.
415 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp,
419 // Parse simpler field strings
420 String fstring = nf.substring(ncp, fcp);
421 // remove any comments before we parse the node info
422 // TODO: test newick file with quoted square brackets in node name (is
// NOTE(review): if fstring contains ']' but no '[', indexOf('[') is -1 and
// substring(0, cstart) throws StringIndexOutOfBoundsException instead of
// recording a parse error - confirm whether malformed input can reach here.
// Only the last comment removed is kept in commentString2.
424 while (fstring.indexOf(']') > -1)
426 int cstart = fstring.indexOf('[');
427 int cend = fstring.indexOf(']');
428 commentString2 = fstring.substring(cstart + 1, cend);
429 fstring = fstring.substring(0, cstart)
430 + fstring.substring(cend + 1);
// uqnodename: word boundary, then a run of characters excluding quote, space,
// ':', ';', ']', '(', ')' and ','.
// nbootstrap: a run of digits or '+', optionally surrounded by whitespace,
// followed by ':'.
// ndist: ':' followed by a run drawn from - 0-9 E e . +
433 Regex uqnodename = new Regex("\\b([^' :;\\](),]+)");
434 Regex nbootstrap = new Regex("\\s*([0-9+]+)\\s*:");
435 Regex ndist = new Regex(":([-0-9Ee.+]+)");
// An unquoted name is accepted only when no quoted name was parsed and the
// match does not directly follow a ':'.
437 if (!parsednodename && uqnodename.search(fstring)
438 && ((uqnodename.matchedFrom(1) == 0) || (fstring
439 .charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote
442 if (nodename == null)
444 if (ReplaceUnderscores)
446 nodename = uqnodename.stringMatched(1).replace('_', ' ');
450 nodename = uqnodename.stringMatched(1);
455 Error = ErrorStringrange(Error,
456 "File has broken algorithm - overwritten nodename", 10,
460 // get comment bootstraps
462 if (nbootstrap.search(fstring))
// When the bootstrap text equals the text matched as the name, the field is
// treated as a bootstrap and the name is cleared.
464 if (nbootstrap.stringMatched(1)
465 .equals(uqnodename.stringMatched(1)))
467 nodename = null; // no nodename here.
469 if (nodename == null || nodename.length() == 0
470 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1)
471 + uqnodename.stringMatched().length()))
475 bootstrap = (Integer.valueOf(nbootstrap.stringMatched(1)))
478 } catch (Exception e)
480 Error = ErrorStringrange(Error, "Can't parse bootstrap value",
481 4, ncp + nbootstrap.matchedFrom(), nf);
486 boolean nodehasdistance = false;
488 if (ndist.search(fstring))
492 distance = (Double.valueOf(ndist.stringMatched(1)))
495 nodehasdistance = true;
496 } catch (Exception e)
498 Error = ErrorStringrange(Error,
499 "Can't parse node distance value", 7,
500 ncp + ndist.matchedFrom(), nf);
506 // Write node info here
508 // Trees without distances still need a render distance
509 c.dist = (HasDistances) ? distance : DefDistance;
510 // be consistent for internal bootstrap defaults too
511 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
514 RootHasDistance = nodehasdistance; // JBPNote This is really
515 // UGLY!!! Ensure root node gets
516 // its given distance
518 parseNHXNodeProps(c, commentString2);
519 commentString2 = null;
523 // Find a place to put the leaf
524 BinaryNode newnode = new SequenceNode(null, c, nodename,
525 (HasDistances) ? distance : DefDistance,
526 (HasBootstrap) ? bootstrap : DefBootstrap, false);
// NOTE(review): the leaf's NHX comment is applied to c (the node passed as
// newnode's parent), not to newnode itself - confirm this is intended.
527 parseNHXNodeProps(c, commentString2);
528 commentString2 = null;
530 if (c.right() == null)
536 if (c.left() == null)
542 // Insert a dummy node for polytomy
543 // dummy nodes have distances
544 BinaryNode newdummy = new SequenceNode(null, c, null,
545 (HasDistances ? 0 : DefDistance), 0, true);
546 newdummy.SetChildren(c.left(), newnode);
554 // move back up the tree from preceding closure
557 if ((d > -1) && (c == null))
559 Error = ErrorStringrange(Error,
560 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
565 if (nf.charAt(fcp) == ')')
572 if (nf.charAt(fcp) == ',')
580 // Just advance focus, if we need to
581 if ((c.left() != null) && (!c.left().isLeaf()))
589 // Reset new node properties to obvious fakes
591 distance = DefDistance;
592 bootstrap = DefBootstrap;
593 commentString2 = null;
594 parsednodename = false;
609 throw (new IOException(
610 MessageManager.formatMessage("exception.newfile", new String[]
611 { Error.toString() })));
615 throw (new IOException(
616 MessageManager.formatMessage("exception.newfile", new String[]
617 { MessageManager.getString("label.no_tree_read_in") })));
619 // The next line is failing for topali trees - not sure why yet. if
620 // (root.right()!=null && root.isDummy())
621 root = root.right().detach(); // remove the imaginary root.
// A root for which no distance was read gets 0 when distances are in use,
// otherwise the default render distance.
623 if (!RootHasDistance)
625 root.dist = (HasDistances) ? 0 : DefDistance;
630 * parse NHX codes in comment strings and update NewickFile state flags for
631 * distances and bootstraps, and add any additional properties onto the node.
634 * @param commentString
635 * @param commentString2
// Interprets a comment string as NHX data when it is non-null and starts with
// "&&NHX"; any other comment is ignored. The text after that five-character
// prefix is tokenised and each token is split at its first '=' into a code
// and a value.
637 private void parseNHXNodeProps(BinaryNode c, String commentString)
639 // TODO: store raw comment on the sequenceNode so it can be recovered when
641 if (commentString != null && commentString.startsWith("&&NHX"))
643 StringTokenizer st = new StringTokenizer(commentString.substring(5),
645 while (st.hasMoreTokens())
647 String tok = st.nextToken();
648 int colpos = tok.indexOf("=");
652 String code = tok.substring(0, colpos);
653 String value = tok.substring(colpos + 1);
656 // parse out code/value pairs
// The code comparison is case-insensitive, so "B" and "b" are both accepted.
657 if (code.toLowerCase(Locale.ROOT).equals("b"))
// The value is read as a Float and truncated to an int.
660 Float iv = Float.valueOf(value);
661 v = iv.intValue(); // jalview only does integer bootstraps
// A value that cannot be parsed is reported on the error console with its
// stack trace; it does not abort the parse.
667 } catch (Exception e)
669 jalview.bin.Console.errPrintln(
670 "Couldn't parse code '" + code + "' = '" + value + "'");
671 e.printStackTrace(System.err);
682 * @return DOCUMENT ME!
// NOTE(review): presumably returns the root node held by this object - confirm.
684 public BinaryNode getTree()
690 * Generate a newick format tree according to internal flags for bootstraps,
691 * distances and root distances.
693 * @return new hampshire tree in a single line
// Produces the tree text using the current flag values: output is collected in
// a StringBuffer and terminated with ";".
695 public String print()
699 StringBuffer tf = new StringBuffer();
702 return (tf.append(";").toString());
709 * Generate a newick format tree according to internal flags for distances and
710 * root distances and user specified writing of bootstraps.
712 * @param withbootstraps
713 * controls if bootstrap values are explicitly written.
715 * @return new hampshire tree in a single line
// Temporarily overrides HasBootstrap with the caller's choice: the previous
// value is saved in 'boots' and written back afterwards.
717 public String print(boolean withbootstraps)
721 boolean boots = this.HasBootstrap;
722 this.HasBootstrap = withbootstraps;
725 this.HasBootstrap = boots;
733 * Generate newick format tree according to internal flags for writing root
736 * @param withbootstraps
737 * explicitly write bootstrap values
739 * explicitly write distances
741 * @return new hampshire tree in a single line
// Temporarily overrides HasDistances with the caller's choice, delegates to
// print(withbootstraps) for the bootstrap override, then writes the saved
// HasDistances value back.
743 public String print(boolean withbootstraps, boolean withdists)
747 boolean dists = this.HasDistances;
748 this.HasDistances = withdists;
750 String rv = print(withbootstraps);
751 this.HasDistances = dists;
758 * Generate newick format tree according to user specified flags
760 * @param withbootstraps
761 * explicitly write bootstrap values
763 * explicitly write distances
764 * @param printRootInfo
765 * explicitly write root distance
767 * @return new hampshire tree in a single line
769 public String print(boolean withbootstraps, boolean withdists,
770 boolean printRootInfo)
774 boolean rootinfo = printRootInfo;
775 this.printRootInfo = printRootInfo;
777 String rv = print(withbootstraps, withdists);
778 this.printRootInfo = rootinfo;
787 * @return DOCUMENT ME!
800 * @return DOCUMENT ME!
// Captures the current QuoteChar in 'old' before anything else is done.
// NOTE(review): presumably then stores c as the new quote character and
// returns the previous one - confirm.
802 char setQuoteChar(char c)
804 char old = QuoteChar;
816 * @return DOCUMENT ME!
818 private String nodeName(String name)
820 if (NodeSafeName[0].search(name))
822 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
826 return NodeSafeName[2].replaceAll(name);
836 * @return DOCUMENT ME!
838 private String printNodeField(BinaryNode c)
840 return ((c.getName() == null) ? "" : nodeName(c.getName()))
841 + ((HasBootstrap) ? ((c.getBootstrap() > -1)
842 ? ((c.getName() != null ? " " : "") + c.getBootstrap())
844 + ((HasDistances) ? (":" + c.dist) : "");
853 * @return DOCUMENT ME!
// Root-node counterpart of printNodeField: the whole result is gated on
// printRootInfo, and the distance part on RootHasDistance rather than
// HasDistances.
855 private String printRootField(BinaryNode root)
857 return (printRootInfo)
858 ? (((root.getName() == null) ? "" : nodeName(root.getName()))
860 ? ((root.getBootstrap() > -1)
861 ? ((root.getName() != null ? " " : "")
// NOTE(review): the doubled '+' is string concatenation followed by a unary
// plus on the bootstrap value - harmless if getBootstrap() is numeric, but
// looks accidental.
862 + +root.getBootstrap())
865 + ((RootHasDistance) ? (":" + root.dist) : ""))
869 // Non recursive call deals with root node properties
// Writes the tree rooted at 'root' into tf. Children are written through
// _print, and the root's own field comes from printRootField rather than
// printNodeField. The parameter named 'root' hides the field of the same name
// inside this method.
870 public void print(StringBuffer tf, BinaryNode root)
// A root that is itself a leaf is written as just its root field.
874 if (root.isLeaf() && printRootInfo)
876 tf.append(printRootField(root));
882 _print(tf, root.right());
883 _print(tf, root.left());
888 _print(tf, root.right());
890 if (root.left() != null)
895 _print(tf, root.left());
// The closing bracket is followed immediately by the root's own field.
896 tf.append(")" + printRootField(root));
902 // Recursive call for non-root nodes
// Writes node c and, by calling itself on c.left() and c.right(), its
// descendants into tf. A node's own text comes from printNodeField.
903 public void _print(StringBuffer tf, BinaryNode c)
909 tf.append(printNodeField(c));
915 _print(tf, c.left());
916 if (c.left() != null)
920 _print(tf, c.right());
925 _print(tf, c.right());
927 if (c.left() != null)
932 _print(tf, c.left());
// The closing bracket is followed immediately by the node's own field.
933 tf.append(")" + printNodeField(c));
// Command-line check: expects exactly one argument naming a Newick file,
// echoes the file, builds a NewickFile from it and prints the tree under each
// combination of the bootstrap and distance flags.
944 public static void main(String[] args)
948 if (args == null || args.length != 1)
951 "Takes one argument - file name of a newick tree file.",
952 ExitCode.INVALID_ARGUMENT);
955 File fn = new File(args[0]);
957 StringBuffer newickfile = new StringBuffer();
// NOTE(review): FileReader(File) decodes with the platform default charset on
// older Java versions, and no close() of treefile is visible here - confirm
// the reader is closed.
958 BufferedReader treefile = new BufferedReader(new FileReader(fn));
// readLine() drops line terminators, so the lines are joined with nothing
// between them.
961 while ((l = treefile.readLine()) != null)
963 newickfile.append(l);
967 jalview.bin.Console.outPrintln("Read file :\n");
// The file is opened a second time, by name, for the parser itself.
969 NewickFile trf = new NewickFile(args[0], DataSourceType.FILE);
971 jalview.bin.Console.outPrintln("Original file :\n");
// Replaces runs of newline characters with the empty string.
973 Regex nonl = new Regex("\n+", "");
975 .outPrintln(nonl.replaceAll(newickfile.toString()) + "\n");
977 jalview.bin.Console.outPrintln("Parsed file.\n");
979 .outPrintln("Default output type for original input.\n");
980 jalview.bin.Console.outPrintln(trf.print());
981 jalview.bin.Console.outPrintln("Without bootstraps.\n");
982 jalview.bin.Console.outPrintln(trf.print(false));
983 jalview.bin.Console.outPrintln("Without distances.\n");
984 jalview.bin.Console.outPrintln(trf.print(true, false));
986 .outPrintln("Without bootstraps but with distanecs.\n");
987 jalview.bin.Console.outPrintln(trf.print(false, true));
988 jalview.bin.Console.outPrintln("Without bootstraps or distanecs.\n");
989 jalview.bin.Console.outPrintln(trf.print(false, false));
991 .outPrintln("With bootstraps and with distances.\n");
992 jalview.bin.Console.outPrintln(trf.print(true, true));
// I/O failures are reported on the error console.
993 } catch (java.io.IOException e)
995 jalview.bin.Console.errPrintln("Exception\n" + e);