2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
24 // TODO: Implement Basic NHX tag parsing and preservation
25 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
26 // TODO: Extended SequenceNodeI to hold parsed NHX strings
29 import jalview.datamodel.SequenceNode;
30 import jalview.util.MessageManager;
32 import java.io.BufferedReader;
34 import java.io.FileReader;
35 import java.io.IOException;
36 import java.util.StringTokenizer;
39 * Parse a New Hampshire style tree. Caveats: NHX files are NOT supported and the
40 * tree distances and topology are unreliable when they are parsed. TODO: on
41 * this: NHX codes are appended in comments beginning with &&NHX. The codes are
42 * given below (from http://www.phylosoft.org/forester/NHX.html): Element Type
43 * Description Corresponding phyloXML element (parent element in parentheses) no
44 * tag string name of this node/clade (MUST BE FIRST, IF ASSIGNED)
45 * <name>(<clade>) : decimal branch length to parent node (MUST BE SECOND, IF
46 * ASSIGNED) <branch_length>(<clade>) :GN= string gene name <name>(<sequence>)
47 * :AC= string sequence accession <accession>(<sequence>) :ND= string node
48 * identifier - if this is being used, it has to be unique within each phylogeny
49 * <node_id>(<clade>) :B= decimal confidence value for parent branch
50 * <confidence>(<clade>) :D= 'T', 'F', or '?' 'T' if this node represents a
51 * duplication event - 'F' if this node represents a speciation event, '?' if
52 * this node represents an unknown event (D= tag should be replaced by Ev= tag)
53 * n/a :Ev=duplications>speciations>gene losses>event type>duplication type int
54 * int int string string event (replaces the =D tag), number of duplication,
55 * speciation, and gene loss events, type of event (transfer, fusion, root,
56 * unknown, other, speciation_duplication_loss, unassigned) <events>(<clade>)
57 * :E= string EC number at this node <annotation>(<sequence>) :Fu= string
58 * function at this node <annotation>(<sequence>)
59 * :DS=protein-length>from>to>support>name>from>... int int int double string
60 * int ... domain structure at this node <domain_architecture>(<sequence>) :S=
61 * string species name of the species/phylum at this node <taxonomy>(<clade>)
62 * :T= integer taxonomy ID of the species/phylum at this node <id>(<taxonomy>)
63 * :W= integer width of parent branch <width>(<clade>) :C=rrr.ggg.bbb
64 * integer.integer.integer color of parent branch <color>(<clade>) :Co= 'Y' or
65 * 'N' collapse this node when drawing the tree (default is not to collapse) n/a
66 * :XB= string custom data associated with a branch <property>(<clade>) :XN=
67 * string custom data associated with a node <property>(<clade>) :O= integer
68 * orthologous to this external node n/a :SN= integer subtree neighbors n/a :SO=
69 * integer super orthologous (no duplications on paths) to this external node
75 public class NewickFile extends FileParse
// flag: node fields carry bootstrap values (consulted by parse() and the print helpers)
79 private boolean HasBootstrap = false;
// flag: node fields carry branch distances
81 private boolean HasDistances = false;
// flag: the root node carries its own distance
83 private boolean RootHasDistance = false;
// when true, parse() swaps '_' for ' ' in unquoted node names
86 boolean ReplaceUnderscores = false;
// gates the output of printRootField() and the leaf-root case of print(StringBuffer, SequenceNode)
88 boolean printRootInfo = true;
// patterns used by nodeName(): [0] decides whether a name must be quoted,
// [1] is applied to names that are quoted, [2] to names that are not
90 private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[]
91 { new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for
94 new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote
// NOTE(review): the pattern below reads as a literal "/w" rather than a
// whitespace class - confirm whether \s was intended
96 new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unquoted whitespace
// character placed on both sides of a node name that needs quoting
100 char QuoteChar = '\'';
103 * Creates a new NewickFile object.
108 * @throws IOException
// Delegates to the FileParse constructor, passing inStr together with the
// source type "Paste".
111 public NewickFile(String inStr) throws IOException
113 super(inStr, "Paste");
117 * Creates a new NewickFile object.
124 * @throws IOException
127 public NewickFile(String inFile, String type) throws IOException
132 public NewickFile(FileParse source) throws IOException
138 * Creates a new NewickFile object.
143 public NewickFile(SequenceNode newtree)
149 * Creates a new NewickFile object.
// Constructor taking a tree node and a bootstrap flag; the flag is stored in
// HasBootstrap. What is done with newtree is not visible in this view.
156 public NewickFile(SequenceNode newtree, boolean bootstrap)
158 HasBootstrap = bootstrap;
163 * Creates a new NewickFile object.
// Constructor overload that records both the bootstrap flag (HasBootstrap)
// and the distances flag (HasDistances).
172 public NewickFile(SequenceNode newtree, boolean bootstrap,
176 HasBootstrap = bootstrap;
177 HasDistances = distances;
181 * Creates a new NewickFile object.
189 * @param rootdistance
// Constructor overload that records the bootstrap, distances and root
// distance flags in HasBootstrap, HasDistances and RootHasDistance.
192 public NewickFile(SequenceNode newtree, boolean bootstrap,
193 boolean distances, boolean rootdistance)
196 HasBootstrap = bootstrap;
197 HasDistances = distances;
198 RootHasDistance = rootdistance;
215 * @return DOCUMENT ME!
// Builds an error message. A null Error is treated as the empty string, and
// the visible tail appends the slice of s running from (p - r) to (p + r)
// followed by " )\n".
217 private String ErrorStringrange(String Error, String Er, int r, int p,
220 return ((Error == null) ? "" : Error)
// both ends of the slice are clamped so substring() stays within s
225 + s.substring(((p - r) < 0) ? 0 : (p - r),
226 ((p + r) > s.length()) ? s.length() : (p + r)) + " )\n";
230 // These are set automatically by the reader
231 public boolean HasBootstrap()
239 * @return DOCUMENT ME!
241 public boolean HasDistances()
// Accessor for the RootHasDistance flag.
246 public boolean HasRootDistance()
248 return RootHasDistance;
252 * parse the filesource as a newick file (new hampshire and/or extended)
254 * @throws IOException
255 * with a line number and character position for badly formatted NH
// Reads every line from nextLine() into one string and scans it with the
// majorsyms pattern, building SequenceNode objects beneath a temporary root
// node. Problems are accumulated in Error through ErrorStringrange(); the
// IOExceptions raised near the end carry either that text or the
// label.no_tree_read_in message. The temporary root is detached at the end.
258 public void parse() throws IOException
262 { // fill nf with complete tree file
264 StringBuffer file = new StringBuffer();
266 while ((nf = nextLine()) != null)
271 nf = file.toString();
// temporary root; it is replaced by its right child at the end of parse()
274 root = new SequenceNode();
276 SequenceNode realroot = null;
277 SequenceNode c = root;
281 // int flen = nf.length();
284 String nodename = null;
285 String commentString2 = null; // comments after simple node props
287 float DefDistance = (float) 0.001; // @param Default distance for a node -
289 int DefBootstrap = -1; // @param Default bootstrap for a node
291 float distance = DefDistance;
292 int bootstrap = DefBootstrap;
294 boolean ascending = false; // flag indicating that we are leaving the
297 com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(
302 boolean parsednodename = false;
// scanning continues while majorsyms still matches from cp and no error has
// been recorded
303 while (majorsyms.searchFrom(nf, cp) && (Error == null))
305 int fcp = majorsyms.matchedFrom();
// dispatch on the character found at the match position
307 switch (schar = nf.charAt(fcp))
311 // ascending should not be set
315 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
// a fresh child goes into the right slot when that slot is empty
323 if (c.right() == null)
325 c.setRight(new SequenceNode(null, c, null, DefDistance,
326 DefBootstrap, false));
327 c = (SequenceNode) c.right();
331 if (c.left() != null)
333 // Dummy node for polytomy - keeps c.left free for new node
334 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
335 tmpn.SetChildren(c.left(), c.right());
339 c.setLeft(new SequenceNode(null, c, null, DefDistance,
340 DefBootstrap, false));
341 c = (SequenceNode) c.left();
344 if (realroot == null)
350 distance = DefDistance;
351 bootstrap = DefBootstrap;
356 // Deal with quoted fields
359 com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(
362 if (qnodename.searchFrom(nf, fcp))
364 int nl = qnodename.stringMatched().length();
365 nodename = new String(qnodename.stringMatched().substring(1,
367 // unpack any escaped (doubled) quotes
368 com.stevesoft.pat.Regex xpandquotes = com.stevesoft.pat.Regex
369 .perlCode("s/''/'/");
370 String widernodename = xpandquotes.replaceAll(nodename);
371 nodename = widernodename;
372 // jump to after end of quoted nodename
373 nextcp = fcp + nl + 1;
374 parsednodename = true;
378 Error = ErrorStringrange(Error,
379 "Unterminated quotes for nodename", 7, fcp, nf);
389 Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d
392 // cp advanced at the end of default
396 // node string contains Comment or structured/extended NH format info
398 * if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1)) { // will
399 * process in remains System.err.println("skipped text:
400 * '"+nf.substring(cp,fcp)+"'"); }
402 // verify termination.
403 com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex("]");
404 if (comment.searchFrom(nf, fcp))
406 // Skip the comment field
407 nextcp = comment.matchedFrom() + 1;
408 warningMessage = "Tree file contained comments which may confuse input algorithm.";
411 // cp advanced at the end of default to nextcp, ncp is unchanged so
412 // any node info can be read.
416 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp,
422 // Parse simpler field strings
423 String fstring = nf.substring(ncp, fcp);
424 // remove any comments before we parse the node info
425 // TODO: test newick file with quoted square brackets in node name (is
// each pass cuts one bracketed comment out of fstring and keeps its text in
// commentString2, so only the last comment survives the loop
// NOTE(review): a closing bracket with no opening bracket leaves cstart at -1,
// which would make substring(0, cstart) throw - confirm this cannot happen
427 while (fstring.indexOf(']') > -1)
429 int cstart = fstring.indexOf('[');
430 int cend = fstring.indexOf(']');
431 commentString2 = fstring.substring(cstart + 1, cend);
432 fstring = fstring.substring(0, cstart)
433 + fstring.substring(cend + 1);
436 com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
437 "\\b([^' :;\\](),]+)");
438 com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
439 "\\s*([0-9+]+)\\s*:");
440 com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
444 && uqnodename.search(fstring)
445 && ((uqnodename.matchedFrom(1) == 0) || (fstring
446 .charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote
449 if (nodename == null)
451 if (ReplaceUnderscores)
453 nodename = uqnodename.stringMatched(1).replace('_', ' ');
457 nodename = uqnodename.stringMatched(1);
462 Error = ErrorStringrange(Error,
463 "File has broken algorithm - overwritten nodename", 10,
467 // get comment bootstraps
469 if (nbootstrap.search(fstring))
// a bootstrap match identical to the name match means the text was a
// bootstrap value, not a name
471 if (nbootstrap.stringMatched(1).equals(
472 uqnodename.stringMatched(1)))
474 nodename = null; // no nodename here.
477 || nodename.length() == 0
478 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) + uqnodename
479 .stringMatched().length()))
483 bootstrap = (new Integer(nbootstrap.stringMatched(1)))
486 } catch (Exception e)
488 Error = ErrorStringrange(Error,
489 "Can't parse bootstrap value", 4,
490 ncp + nbootstrap.matchedFrom(), nf);
495 boolean nodehasdistance = false;
497 if (ndist.search(fstring))
501 distance = (new Float(ndist.stringMatched(1))).floatValue();
503 nodehasdistance = true;
504 } catch (Exception e)
506 Error = ErrorStringrange(Error,
507 "Can't parse node distance value", 7,
508 ncp + ndist.matchedFrom(), nf);
514 // Write node info here
516 // Trees without distances still need a render distance
517 c.dist = (HasDistances) ? distance : DefDistance;
518 // be consistent for internal bootstrap defaults too
519 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
522 RootHasDistance = nodehasdistance; // JBPNote This is really
523 // UGLY!!! Ensure root node gets
524 // its given distance
526 parseNHXNodeProps(c, commentString2);
527 commentString2 = null;
531 // Find a place to put the leaf
532 SequenceNode newnode = new SequenceNode(null, c, nodename,
533 (HasDistances) ? distance : DefDistance,
534 (HasBootstrap) ? bootstrap : DefBootstrap, false);
// NOTE(review): the NHX properties are applied to c, not to newnode - confirm
// that the parent is the intended target for a leaf comment
535 parseNHXNodeProps(c, commentString2);
536 commentString2 = null;
538 if (c.right() == null)
544 if (c.left() == null)
550 // Insert a dummy node for polytomy
551 // dummy nodes have distances
552 SequenceNode newdummy = new SequenceNode(null, c, null,
553 (HasDistances ? 0 : DefDistance), 0, true);
554 newdummy.SetChildren(c.left(), newnode);
562 // move back up the tree from preceding closure
565 if ((d > -1) && (c == null))
567 Error = ErrorStringrange(
569 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
574 if (nf.charAt(fcp) == ')')
581 if (nf.charAt(fcp) == ',')
589 // Just advance focus, if we need to
590 if ((c.left() != null) && (!c.left().isLeaf()))
592 c = (SequenceNode) c.left();
598 // Reset new node properties to obvious fakes
600 distance = DefDistance;
601 bootstrap = DefBootstrap;
602 commentString2 = null;
603 parsednodename = false;
618 throw (new IOException(MessageManager.formatMessage(
619 "exception.newfile", new String[] { Error.toString() })));
623 throw (new IOException(MessageManager.formatMessage(
624 "exception.newfile", new String[] { MessageManager
625 .getString("label.no_tree_read_in") })));
627 // The next line is failing for topali trees - not sure why yet. if
628 // (root.right()!=null && root.isDummy())
// NOTE(review): root.right() is dereferenced here with no null check visible
// in this view
629 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
// a root without its own distance gets 0 or the default render distance
631 if (!RootHasDistance)
633 root.dist = (HasDistances) ? 0 : DefDistance;
638 * parse NHX codes in comment strings and update NewickFile state flags for
639 * distances and bootstraps, and add any additional properties onto the node.
642 * @param commentString
643 * @param commentString2
// Reads NHX code=value pairs out of a comment string. Only a non-null string
// that starts with "&&NHX" is tokenised; each token is split at its first '='
// into a code and a value.
645 private void parseNHXNodeProps(SequenceNode c, String commentString)
647 // TODO: store raw comment on the sequenceNode so it can be recovered when
649 if (commentString != null && commentString.startsWith("&&NHX"))
// the 5-character "&&NHX" prefix is dropped before tokenising
651 StringTokenizer st = new StringTokenizer(commentString.substring(5),
653 while (st.hasMoreTokens())
655 String tok = st.nextToken();
656 int colpos = tok.indexOf("=");
660 String code = tok.substring(0, colpos);
661 String value = tok.substring(colpos + 1);
664 // parse out code/value pairs
// the "b" code is matched case-insensitively; its value is read as a Float
// and truncated to an int
665 if (code.toLowerCase().equals("b"))
668 Float iv = new Float(value);
669 v = iv.intValue(); // jalview only does integer bootstraps
// parse failures are reported on stderr together with the stack trace
675 } catch (Exception e)
677 System.err.println("Couldn't parse code '" + code + "' = '"
679 e.printStackTrace(System.err);
690 * @return DOCUMENT ME!
692 public SequenceNode getTree()
698 * Generate a newick format tree according to internal flags for bootstraps,
699 * distances and root distances.
701 * @return new hampshire tree in a single line
// Collects the tree text in a StringBuffer and returns it with a trailing
// ";". The call that fills the buffer is not visible in this view.
703 public String print()
707 StringBuffer tf = new StringBuffer();
710 return (tf.append(";").toString());
717 * Generate a newick format tree according to internal flags for distances and
718 * root distances and user specified writing of bootstraps.
720 * @param withbootstraps
721 * controls if bootstrap values are explicitly written.
723 * @return new hampshire tree in a single line
// Temporarily replaces HasBootstrap with the caller's choice and puts the
// saved value back afterwards.
725 public String print(boolean withbootstraps)
// remember the current flag so it can be restored
729 boolean boots = this.HasBootstrap;
730 this.HasBootstrap = withbootstraps;
733 this.HasBootstrap = boots;
741 * Generate newick format tree according to internal flags for writing root
744 * @param withbootstraps
745 * explicitly write bootstrap values
747 * explicitly write distances
749 * @return new hampshire tree in a single line
751 public String print(boolean withbootstraps, boolean withdists)
755 boolean dists = this.HasDistances;
756 this.HasDistances = withdists;
758 String rv = print(withbootstraps);
759 this.HasDistances = dists;
766 * Generate newick format tree according to user specified flags
768 * @param withbootstraps
769 * explicitly write bootstrap values
771 * explicitly write distances
772 * @param printRootInfo
773 * explicitly write root distance
775 * @return new hampshire tree in a single line
777 public String print(boolean withbootstraps, boolean withdists,
778 boolean printRootInfo)
782 boolean rootinfo = printRootInfo;
783 this.printRootInfo = printRootInfo;
785 String rv = print(withbootstraps, withdists);
786 this.printRootInfo = rootinfo;
795 * @return DOCUMENT ME!
808 * @return DOCUMENT ME!
// Setter for QuoteChar; the current value is captured in old first.
810 char setQuoteChar(char c)
812 char old = QuoteChar;
824 * @return DOCUMENT ME!
826 private String nodeName(String name)
828 if (NodeSafeName[0].search(name))
830 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
834 return NodeSafeName[2].replaceAll(name);
844 * @return DOCUMENT ME!
846 private String printNodeField(SequenceNode c)
848 return ((c.getName() == null) ? "" : nodeName(c.getName()))
849 + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? ((c.getName() != null ? " "
850 : "") + c.getBootstrap())
852 : "") + ((HasDistances) ? (":" + c.dist) : "");
861 * @return DOCUMENT ME!
863 private String printRootField(SequenceNode root)
865 return (printRootInfo) ? (((root.getName() == null) ? ""
866 : nodeName(root.getName()))
867 + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? ((root
868 .getName() != null ? " " : "") + +root.getBootstrap())
869 : "") : "") + ((RootHasDistance) ? (":" + root.dist)
873 // Non recursive call deals with root node properties
// Appends the newick text for the tree below root to tf. Children are written
// through _print(); the root's own field comes from printRootField().
874 public void print(StringBuffer tf, SequenceNode root)
// a leaf root is written as just its root field, and only when printRootInfo is set
878 if (root.isLeaf() && printRootInfo)
880 tf.append(printRootField(root));
886 _print(tf, (SequenceNode) root.right());
887 _print(tf, (SequenceNode) root.left());
892 _print(tf, (SequenceNode) root.right());
894 if (root.left() != null)
899 _print(tf, (SequenceNode) root.left());
// the closing bracket is followed directly by the root field
900 tf.append(")" + printRootField(root));
906 // Recursive call for non-root nodes
// Appends the newick text for node c to tf, calling itself for the children
// of c and using printNodeField() for the node's own field.
907 public void _print(StringBuffer tf, SequenceNode c)
913 tf.append(printNodeField(c));
// NOTE(review): this branch visits left before right, while the one further
// down visits right before left; the selecting conditions are not visible here
919 _print(tf, (SequenceNode) c.left());
920 if (c.left() != null)
924 _print(tf, (SequenceNode) c.right());
929 _print(tf, (SequenceNode) c.right());
931 if (c.left() != null)
936 _print(tf, (SequenceNode) c.left());
// the closing bracket is followed directly by this node's field
937 tf.append(")" + printNodeField(c));
// Command-line check. Expects exactly one argument naming a newick file,
// reads that file into a buffer, builds a NewickFile on the same path, echoes
// the file text with newlines removed, and prints the tree with each
// combination of bootstrap and distance output.
944 public static void main(String[] args)
948 if (args == null || args.length != 1)
951 .println("Takes one argument - file name of a newick tree file.");
955 File fn = new File(args[0]);
957 StringBuffer newickfile = new StringBuffer();
// NOTE(review): no close() of treefile is visible in this view - confirm the
// reader is released
958 BufferedReader treefile = new BufferedReader(new FileReader(fn));
// lines are appended without their terminators
961 while ((l = treefile.readLine()) != null)
963 newickfile.append(l);
967 System.out.println("Read file :\n");
969 NewickFile trf = new NewickFile(args[0], "File");
971 System.out.println("Original file :\n");
// strips any runs of newline characters before echoing the input
973 com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");
974 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
976 System.out.println("Parsed file.\n");
977 System.out.println("Default output type for original input.\n");
978 System.out.println(trf.print());
979 System.out.println("Without bootstraps.\n");
980 System.out.println(trf.print(false));
981 System.out.println("Without distances.\n");
982 System.out.println(trf.print(true, false));
// NOTE(review): the two messages below spell distances as distanecs in the
// program output
983 System.out.println("Without bootstraps but with distanecs.\n");
984 System.out.println(trf.print(false, true));
985 System.out.println("Without bootstraps or distanecs.\n");
986 System.out.println(trf.print(false, false));
987 System.out.println("With bootstraps and with distances.\n");
988 System.out.println(trf.print(true, true));
989 } catch (java.io.IOException e)
991 System.err.println("Exception\n" + e);