2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
24 // TODO: Implement Basic NHX tag parsing and preservation
25 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
26 // TODO: Extended SequenceNodeI to hold parsed NHX strings
29 import java.util.Locale;
31 import jalview.datamodel.SequenceNode;
32 import jalview.util.MessageManager;
33 import jalview.util.Platform;
35 import java.io.BufferedReader;
37 import java.io.FileReader;
38 import java.io.IOException;
39 import java.util.StringTokenizer;
41 import com.stevesoft.pat.Regex;
43 // TODO This class does not conform to Java standards for field name capitalization.
46 * Parse a New Hampshire style tree. Caveats: NHX files are NOT supported and the
47 * tree distances and topology are unreliable when they are parsed. TODO: on
48 * this: NHX codes are appended in comments beginning with &&NHX. The codes are
49 * given below (from http://www.phylosoft.org/forester/NHX.html): Element Type
50 * Description Corresponding phyloXML element (parent element in parentheses) no
51 * tag string name of this node/clade (MUST BE FIRST, IF ASSIGNED)
52 * <name>(<clade>) : decimal branch length to parent node (MUST BE SECOND, IF
53 * ASSIGNED) <branch_length>(<clade>) :GN= string gene name <name>(<sequence>)
54 * :AC= string sequence accession <accession>(<sequence>) :ND= string node
55 * identifier - if this is being used, it has to be unique within each phylogeny
56 * <node_id>(<clade>) :B= decimal confidence value for parent branch
57 * <confidence>(<clade>) :D= 'T', 'F', or '?' 'T' if this node represents a
58 * duplication event - 'F' if this node represents a speciation event, '?' if
59 * this node represents an unknown event (D= tag should be replaced by Ev= tag)
60 * n/a :Ev=duplications>speciations>gene losses>event type>duplication type int
61 * int int string string event (replaces the =D tag), number of duplication,
62 * speciation, and gene loss events, type of event (transfer, fusion, root,
63 * unknown, other, speciation_duplication_loss, unassigned) <events>(<clade>)
64 * :E= string EC number at this node <annotation>(<sequence>) :Fu= string
65 * function at this node <annotation>(<sequence>)
66 * :DS=protein-length>from>to>support>name>from>... int int int double string
67 * int ... domain structure at this node <domain_architecture>(<sequence>) :S=
68 * string species name of the species/phylum at this node <taxonomy>(<clade>)
69 * :T= integer taxonomy ID of the species/phylum at this node <id>(<taxonomy>)
70 * :W= integer width of parent branch <width>(<clade>) :C=rrr.ggg.bbb
71 * integer.integer.integer color of parent branch <color>(<clade>) :Co= 'Y' or
72 * 'N' collapse this node when drawing the tree (default is not to collapse) n/a
73 * :XB= string custom data associated with a branch <property>(<clade>) :XN=
74 * string custom data associated with a node <property>(<clade>) :O= integer
75 * orthologous to this external node n/a :SN= integer subtree neighbors n/a :SO=
76 * integer super orthologous (no duplications on paths) to this external node
82 public class NewickFile extends FileParse
84 private SequenceNode root;
86 private boolean HasBootstrap = false;
88 private boolean HasDistances = false;
90 private boolean RootHasDistance = false;
93 private boolean ReplaceUnderscores = false;
95 private boolean printRootInfo = true;
97 private static final int REGEX_PERL_NODE_REQUIRE_QUOTE = 0;
99 private static final int REGEX_PERL_NODE_ESCAPE_QUOTE = 1;
101 private static final int REGEX_PERL_NODE_UNQUOTED_WHITESPACE = 2;
103 private static final int REGEX_MAJOR_SYMS = 3;
105 private static final int REGEX_QNODE_NAME = 4;
107 private static final int REGEX_COMMENT = 5;
109 private static final int REGEX_UQNODE_NAME = 6;
111 private static final int REGEX_NBOOTSTRAP = 7;
113 private static final int REGEX_NDIST = 8;
115 private static final int REGEX_NO_LINES = 9;
117 private static final int REGEX_PERL_EXPAND_QUOTES = 10;
119 private static final int REGEX_MAX = 11;
121 private static final Regex[] REGEX = new Regex[REGEX_MAX];
123 private static Regex getRegex(int id)
125 if (REGEX[id] == null)
129 String codePerl = null;
132 case REGEX_PERL_NODE_REQUIRE_QUOTE:
133 codePerl = "m/[\\[,:'()]/";
135 case REGEX_PERL_NODE_ESCAPE_QUOTE:
136 codePerl = "s/'/''/";
138 case REGEX_PERL_NODE_UNQUOTED_WHITESPACE:
139 codePerl = "s/\\/w/_/";
141 case REGEX_PERL_EXPAND_QUOTES:
142 codePerl = "s/''/'/";
144 case REGEX_MAJOR_SYMS:
147 case REGEX_QNODE_NAME:
148 code = "'([^']|'')+'";
153 case REGEX_UQNODE_NAME:
154 code = "\\b([^' :;\\](),]+)";
156 case REGEX_NBOOTSTRAP:
157 code = "\\s*([0-9+]+)\\s*:";
160 code = ":([-0-9Ee.+]+)";
169 return codePerl == null ? Platform.newRegex(code, code2)
170 : Platform.newRegexPerl(codePerl);
176 private char quoteChar = '\'';
179 * Creates a new NewickFile object.
184 * @throws IOException
187 public NewickFile(String inStr) throws IOException
189 super(inStr, DataSourceType.PASTE);
193 * Creates a new NewickFile object.
200 * @throws IOException
203 public NewickFile(String inFile, DataSourceType protocol)
206 super(inFile, protocol);
209 public NewickFile(FileParse source) throws IOException
215 * Creates a new NewickFile object.
220 public NewickFile(SequenceNode newtree)
226 * Creates a new NewickFile object.
233 public NewickFile(SequenceNode newtree, boolean bootstrap)
235 HasBootstrap = bootstrap;
240 * Creates a new NewickFile object.
249 public NewickFile(SequenceNode newtree, boolean bootstrap,
253 HasBootstrap = bootstrap;
254 HasDistances = distances;
258 * Creates a new NewickFile object.
266 * @param rootdistance
269 public NewickFile(SequenceNode newtree, boolean bootstrap,
270 boolean distances, boolean rootdistance)
273 HasBootstrap = bootstrap;
274 HasDistances = distances;
275 RootHasDistance = rootdistance;
292 * @return DOCUMENT ME!
294 private String ErrorStringrange(String Error, String Er, int r, int p,
297 return ((Error == null) ? "" : Error) + Er + " at position " + p + " ( "
298 + s.substring(((p - r) < 0) ? 0 : (p - r),
299 ((p + r) > s.length()) ? s.length() : (p + r))
304 // These are set automatically by the reader
305 public boolean HasBootstrap()
313 * @return DOCUMENT ME!
315 public boolean HasDistances()
320 public boolean HasRootDistance()
322 return RootHasDistance;
326 * parse the filesource as a newick file (new hampshire and/or extended)
328 * @throws IOException
329 * with a line number and character position for badly formatted NH
332 public void parse() throws IOException
334 Platform.ensureRegex();
337 { // fill nf with complete tree file
339 StringBuffer file = new StringBuffer();
341 while ((nf = nextLine()) != null)
346 nf = file.toString();
349 root = new SequenceNode();
351 SequenceNode realroot = null;
352 SequenceNode c = root;
356 // int flen = nf.length();
359 String nodename = null;
360 String commentString2 = null; // comments after simple node props
362 float DefDistance = (float) 0.001; // @param Default distance for a node -
364 int DefBootstrap = -1; // @param Default bootstrap for a node
366 float distance = DefDistance;
367 int bootstrap = DefBootstrap;
369 boolean ascending = false; // flag indicating that we are leaving the
372 Regex majorsyms = getRegex(REGEX_MAJOR_SYMS); // "[(\\['),;]"
376 boolean parsednodename = false;
377 while (majorsyms.searchFrom(nf, cp) && (Error == null))
379 int fcp = majorsyms.matchedFrom();
381 switch (schar = nf.charAt(fcp))
385 // ascending should not be set
389 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
395 if (c.right() == null)
397 c.setRight(new SequenceNode(null, c, null, DefDistance,
398 DefBootstrap, false));
399 c = (SequenceNode) c.right();
403 if (c.left() != null)
405 // Dummy node for polytomy - keeps c.left free for new node
406 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
407 tmpn.SetChildren(c.left(), c.right());
411 c.setLeft(new SequenceNode(null, c, null, DefDistance,
412 DefBootstrap, false));
413 c = (SequenceNode) c.left();
416 if (realroot == null)
422 distance = DefDistance;
423 bootstrap = DefBootstrap;
428 // Deal with quoted fields
431 Regex qnodename = getRegex(REGEX_QNODE_NAME);// "'([^']|'')+'");
433 if (qnodename.searchFrom(nf, fcp))
435 int nl = qnodename.stringMatched().length();
436 nodename = new String(
437 qnodename.stringMatched().substring(1, nl - 1));
438 // unpack any escaped quotes (a doubled '' inside a quoted name stands for ')
439 Regex xpandquotes = getRegex(REGEX_PERL_EXPAND_QUOTES);
440 String widernodename = xpandquotes.replaceAll(nodename);
441 nodename = widernodename;
442 // jump to after end of quoted nodename
443 nextcp = fcp + nl + 1;
444 parsednodename = true;
448 Error = ErrorStringrange(Error,
449 "Unterminated quotes for nodename", 7, fcp, nf);
459 Error = ErrorStringrange(Error,
460 "Wayward semicolon (depth=" + d + ")", 7, fcp, nf);
462 // cp advanced at the end of default
466 // node string contains Comment or structured/extended NH format info
468 * if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1)) { // will
469 * process in remains System.err.println("skipped text:
470 * '"+nf.substring(cp,fcp)+"'"); }
472 // verify termination.
473 Regex comment = getRegex(REGEX_COMMENT); // "]"
474 if (comment.searchFrom(nf, fcp))
476 // Skip the comment field
477 nextcp = comment.matchedFrom() + 1;
478 warningMessage = "Tree file contained comments which may confuse input algorithm.";
481 // cp advanced at the end of default to nextcp, ncp is unchanged so
482 // any node info can be read.
486 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp,
490 // Parse simpler field strings
491 String fstring = nf.substring(ncp, fcp);
492 // remove any comments before we parse the node info
493 // TODO: test newick file with quoted square brackets in node name (is
495 while (fstring.indexOf(']') > -1)
497 int cstart = fstring.indexOf('[');
498 int cend = fstring.indexOf(']');
499 commentString2 = fstring.substring(cstart + 1, cend);
500 fstring = fstring.substring(0, cstart)
501 + fstring.substring(cend + 1);
504 Regex uqnodename = getRegex(REGEX_UQNODE_NAME);// "\\b([^' :;\\](),]+)"
505 Regex nbootstrap = getRegex(REGEX_NBOOTSTRAP);// "\\s*([0-9+]+)\\s*:");
506 Regex ndist = getRegex(REGEX_NDIST);// ":([-0-9Ee.+]+)");
508 if (!parsednodename && uqnodename.search(fstring)
509 && ((uqnodename.matchedFrom(1) == 0) || (fstring
510 .charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote
513 if (nodename == null)
515 if (ReplaceUnderscores)
517 nodename = uqnodename.stringMatched(1).replace('_', ' ');
521 nodename = uqnodename.stringMatched(1);
526 Error = ErrorStringrange(Error,
527 "File has broken algorithm - overwritten nodename", 10,
531 // get comment bootstraps
533 if (nbootstrap.search(fstring))
535 if (nbootstrap.stringMatched(1)
536 .equals(uqnodename.stringMatched(1)))
538 nodename = null; // no nodename here.
540 if (nodename == null || nodename.length() == 0
541 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1)
542 + uqnodename.stringMatched().length()))
546 bootstrap = (Integer.valueOf(nbootstrap.stringMatched(1)))
549 } catch (Exception e)
551 Error = ErrorStringrange(Error, "Can't parse bootstrap value",
552 4, ncp + nbootstrap.matchedFrom(), nf);
557 boolean nodehasdistance = false;
559 if (ndist.search(fstring))
563 distance = (Float.valueOf(ndist.stringMatched(1))).floatValue();
565 nodehasdistance = true;
566 } catch (Exception e)
568 Error = ErrorStringrange(Error,
569 "Can't parse node distance value", 7,
570 ncp + ndist.matchedFrom(), nf);
576 // Write node info here
578 // Trees without distances still need a render distance
579 c.dist = (HasDistances) ? distance : DefDistance;
580 // be consistent for internal bootstrap defaults too
581 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
584 RootHasDistance = nodehasdistance; // JBPNote This is really
585 // UGLY!!! Ensure root node gets
586 // its given distance
588 parseNHXNodeProps(c, commentString2);
589 commentString2 = null;
593 // Find a place to put the leaf
594 SequenceNode newnode = new SequenceNode(null, c, nodename,
595 (HasDistances) ? distance : DefDistance,
596 (HasBootstrap) ? bootstrap : DefBootstrap, false);
597 parseNHXNodeProps(c, commentString2);
598 commentString2 = null;
600 if (c.right() == null)
606 if (c.left() == null)
612 // Insert a dummy node for polytomy
613 // dummy nodes have distances
614 SequenceNode newdummy = new SequenceNode(null, c, null,
615 (HasDistances ? 0 : DefDistance), 0, true);
616 newdummy.SetChildren(c.left(), newnode);
624 // move back up the tree from preceding closure
627 if ((d > -1) && (c == null))
629 Error = ErrorStringrange(Error,
630 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
635 if (nf.charAt(fcp) == ')')
642 if (nf.charAt(fcp) == ',')
650 // Just advance focus, if we need to
651 if ((c.left() != null) && (!c.left().isLeaf()))
653 c = (SequenceNode) c.left();
659 // Reset new node properties to obvious fakes
661 distance = DefDistance;
662 bootstrap = DefBootstrap;
663 commentString2 = null;
664 parsednodename = false;
679 throw (new IOException(
680 MessageManager.formatMessage("exception.newfile", new String[]
681 { Error.toString() })));
685 throw (new IOException(
686 MessageManager.formatMessage("exception.newfile", new String[]
687 { MessageManager.getString("label.no_tree_read_in") })));
689 // The next line is failing for topali trees - not sure why yet. if
690 // (root.right()!=null && root.isDummy())
691 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
693 if (!RootHasDistance)
695 root.dist = (HasDistances) ? 0 : DefDistance;
700 * parse NHX codes in comment strings and update NewickFile state flags for
701 * distances and bootstraps, and add any additional properties onto the node.
704 * @param commentString
705 * @param commentString2
707 private void parseNHXNodeProps(SequenceNode c, String commentString)
709 // TODO: store raw comment on the sequenceNode so it can be recovered when
711 if (commentString != null && commentString.startsWith("&&NHX"))
713 StringTokenizer st = new StringTokenizer(commentString.substring(5),
715 while (st.hasMoreTokens())
717 String tok = st.nextToken();
718 int colpos = tok.indexOf("=");
722 String code = tok.substring(0, colpos);
723 String value = tok.substring(colpos + 1);
726 // parse out code/value pairs
727 if (code.toLowerCase(Locale.ROOT).equals("b"))
730 Float iv = Float.valueOf(value);
731 v = iv.intValue(); // jalview only does integer bootstraps
737 } catch (Exception e)
740 "Couldn't parse code '" + code + "' = '" + value + "'");
741 e.printStackTrace(System.err);
752 * @return DOCUMENT ME!
754 public SequenceNode getTree()
760 * Generate a newick format tree according to internal flags for bootstraps,
761 * distances and root distances.
763 * @return new hampshire tree in a single line
765 public String print()
769 StringBuffer tf = new StringBuffer();
772 return (tf.append(";").toString());
779 * Generate a newick format tree according to internal flags for distances and
780 * root distances and user specificied writing of bootstraps.
782 * @param withbootstraps
783 * controls if bootstrap values are explicitly written.
785 * @return new hampshire tree in a single line
787 public String print(boolean withbootstraps)
791 boolean boots = this.HasBootstrap;
792 this.HasBootstrap = withbootstraps;
795 this.HasBootstrap = boots;
803 * Generate newick format tree according to internal flags for writing root
806 * @param withbootstraps
807 * explicitly write bootstrap values
809 * explicitly write distances
811 * @return new hampshire tree in a single line
813 public String print(boolean withbootstraps, boolean withdists)
817 boolean dists = this.HasDistances;
818 this.HasDistances = withdists;
820 String rv = print(withbootstraps);
821 this.HasDistances = dists;
828 * Generate newick format tree according to user specified flags
830 * @param withbootstraps
831 * explicitly write bootstrap values
833 * explicitly write distances
834 * @param printRootInfo
835 * explicitly write root distance
837 * @return new hampshire tree in a single line
839 public String print(boolean withbootstraps, boolean withdists,
840 boolean printRootInfo)
844 boolean rootinfo = printRootInfo;
845 this.printRootInfo = printRootInfo;
847 String rv = print(withbootstraps, withdists);
848 this.printRootInfo = rootinfo;
857 * @return DOCUMENT ME!
870 * @return DOCUMENT ME!
872 char setQuoteChar(char c)
874 char old = quoteChar;
886 * @return DOCUMENT ME!
888 private String nodeName(String name)
890 if (getRegex(REGEX_PERL_NODE_REQUIRE_QUOTE).search(name))
893 + getRegex(REGEX_PERL_NODE_ESCAPE_QUOTE).replaceAll(name)
898 return getRegex(REGEX_PERL_NODE_UNQUOTED_WHITESPACE).replaceAll(name);
908 * @return DOCUMENT ME!
910 private String printNodeField(SequenceNode c)
912 return ((c.getName() == null) ? "" : nodeName(c.getName()))
913 + ((HasBootstrap) ? ((c.getBootstrap() > -1)
914 ? ((c.getName() != null ? " " : "") + c.getBootstrap())
916 + ((HasDistances) ? (":" + c.dist) : "");
925 * @return DOCUMENT ME!
927 private String printRootField(SequenceNode root)
929 return (printRootInfo)
930 ? (((root.getName() == null) ? "" : nodeName(root.getName()))
932 ? ((root.getBootstrap() > -1)
933 ? ((root.getName() != null ? " " : "")
934 + +root.getBootstrap())
937 + ((RootHasDistance) ? (":" + root.dist) : ""))
941 // Non recursive call deals with root node properties
942 public void print(StringBuffer tf, SequenceNode root)
946 if (root.isLeaf() && printRootInfo)
948 tf.append(printRootField(root));
954 _print(tf, (SequenceNode) root.right());
955 _print(tf, (SequenceNode) root.left());
960 _print(tf, (SequenceNode) root.right());
962 if (root.left() != null)
967 _print(tf, (SequenceNode) root.left());
968 tf.append(")" + printRootField(root));
974 // Recursive call for non-root nodes
975 public void _print(StringBuffer tf, SequenceNode c)
981 tf.append(printNodeField(c));
987 _print(tf, (SequenceNode) c.left());
988 if (c.left() != null)
992 _print(tf, (SequenceNode) c.right());
997 _print(tf, (SequenceNode) c.right());
999 if (c.left() != null)
1004 _print(tf, (SequenceNode) c.left());
1005 tf.append(")" + printNodeField(c));
1016 public static void main(String[] args)
1020 if (args == null || args.length != 1)
1023 "Takes one argument - file name of a newick tree file.");
1027 File fn = new File(args[0]);
1029 StringBuffer newickfile = new StringBuffer();
1030 BufferedReader treefile = new BufferedReader(new FileReader(fn));
1033 while ((l = treefile.readLine()) != null)
1035 newickfile.append(l);
1039 System.out.println("Read file :\n");
1041 NewickFile trf = new NewickFile(args[0], DataSourceType.FILE);
1043 System.out.println("Original file :\n");
1045 Regex nonl = getRegex(REGEX_NO_LINES);// "\n+", "");
1046 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
1048 System.out.println("Parsed file.\n");
1049 System.out.println("Default output type for original input.\n");
1050 System.out.println(trf.print());
1051 System.out.println("Without bootstraps.\n");
1052 System.out.println(trf.print(false));
1053 System.out.println("Without distances.\n");
1054 System.out.println(trf.print(true, false));
1055 System.out.println("Without bootstraps but with distanecs.\n");
1056 System.out.println(trf.print(false, true));
1057 System.out.println("Without bootstraps or distanecs.\n");
1058 System.out.println(trf.print(false, false));
1059 System.out.println("With bootstraps and with distances.\n");
1060 System.out.println(trf.print(true, true));
1061 } catch (java.io.IOException e)
1063 System.err.println("Exception\n" + e);
1064 e.printStackTrace();