2 * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
3 * Copyright (C) $$Year-Rel$$ The Jalview Authors
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3
10 * of the License, or (at your option) any later version.
12 * Jalview is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
19 * The Jalview Authors are detailed in the 'AUTHORS' file.
23 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
24 // TODO: Implement Basic NHX tag parsing and preservation
25 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
26 // TODO: Extended SequenceNodeI to hold parsed NHX strings
29 import jalview.datamodel.SequenceNode;
30 import jalview.jsdev.RegExp;
31 import jalview.jsdev.api.RegExpInterface;
32 import jalview.util.MessageManager;
34 import java.io.BufferedReader;
36 import java.io.FileReader;
37 import java.io.IOException;
38 import java.util.StringTokenizer;
40 import javajs.J2SIgnoreImport;
42 //import com.stevesoft.pat.Regex;
45 * Parse a New Hampshire style tree. Caveats: NHX files are NOT supported and the
46 * tree distances and topology are unreliable when they are parsed. TODO: on
47 * this: NHX codes are appended in comments beginning with &&NHX. The codes are
48 * given below (from http://www.phylosoft.org/forester/NHX.html): Element Type
49 * Description Corresponding phyloXML element (parent element in parentheses) no
50 * tag string name of this node/clade (MUST BE FIRST, IF ASSIGNED)
51 * <name>(<clade>) : decimal branch length to parent node (MUST BE SECOND, IF
52 * ASSIGNED) <branch_length>(<clade>) :GN= string gene name <name>(<sequence>)
53 * :AC= string sequence accession <accession>(<sequence>) :ND= string node
54 * identifier - if this is being used, it has to be unique within each phylogeny
55 * <node_id>(<clade>) :B= decimal confidence value for parent branch
56 * <confidence>(<clade>) :D= 'T', 'F', or '?' 'T' if this node represents a
57 * duplication event - 'F' if this node represents a speciation event, '?' if
58 * this node represents an unknown event (D= tag should be replaced by Ev= tag)
59 * n/a :Ev=duplications>speciations>gene losses>event type>duplication type int
60 * int int string string event (replaces the =D tag), number of duplication,
61 * speciation, and gene loss events, type of event (transfer, fusion, root,
62 * unknown, other, speciation_duplication_loss, unassigned) <events>(<clade>)
63 * :E= string EC number at this node <annotation>(<sequence>) :Fu= string
64 * function at this node <annotation>(<sequence>)
65 * :DS=protein-length>from>to>support>name>from>... int int int double string
66 * int ... domain structure at this node <domain_architecture>(<sequence>) :S=
67 * string species name of the species/phylum at this node <taxonomy>(<clade>)
68 * :T= integer taxonomy ID of the species/phylum at this node <id>(<taxonomy>)
69 * :W= integer width of parent branch <width>(<clade>) :C=rrr.ggg.bbb
70 * integer.integer.integer color of parent branch <color>(<clade>) :Co= 'Y' or
71 * 'N' collapse this node when drawing the tree (default is not to collapse) n/a
72 * :XB= string custom data associated with a branch <property>(<clade>) :XN=
73 * string custom data associated with a node <property>(<clade>) :O= integer
74 * orthologous to this external node n/a :SN= integer subtree neighbors n/a :SO=
75 * integer super orthologous (no duplications on paths) to this external node
81 @J2SIgnoreImport({java.io.File.class, java.io.FileReader.class})
82 public class NewickFile extends FileParse
// Output/parse flags: the boolean-argument constructors assign them, and
// printNodeField/printRootField consult them when writing node fields.
86 private boolean HasBootstrap = false;
88 private boolean HasDistances = false;
90 private boolean RootHasDistance = false;
// When true, parse() swaps underscores for spaces in unquoted node names.
93 boolean ReplaceUnderscores = false;
// Gates root-node output in printRootField and print(StringBuffer, SequenceNode).
95 boolean printRootInfo = true;
// Patterns used by nodeName(): [0] is searched to decide whether a name must be
// quoted, [1] rewrites names that are quoted, [2] rewrites names left unquoted.
// NOTE(review): the third pattern text reads as a literal slash followed by w,
// while its trailing comment talks about whitespace -- confirm whether a
// whitespace class was intended.
97 private RegExpInterface[] NodeSafeName = new RegExpInterface[]
98 { RegExp.perlCode("m/[\\[,:'()]/"), // test for
101 RegExp.perlCode("s/'/''/"), // escaping quote
103 RegExp.perlCode("s/\\/w/_/") // unquoted whitespace
// Character wrapped around node names that need quoting (see nodeName).
107 char QuoteChar = '\'';
110 * Creates a new NewickFile object.
115 * @throws IOException
// Hands inStr to the FileParse constructor with the source type Paste.
118 public NewickFile(String inStr) throws IOException
120 super(inStr, "Paste");
124 * Creates a new NewickFile object.
131 * @throws IOException
134 public NewickFile(String inFile, String type) throws IOException
139 public NewickFile(FileParse source) throws IOException
145 * Creates a new NewickFile object.
150 public NewickFile(SequenceNode newtree)
156 * Creates a new NewickFile object.
// Stores the bootstrap argument in HasBootstrap. How newtree is used is not
// visible in this excerpt.
163 public NewickFile(SequenceNode newtree, boolean bootstrap)
165 HasBootstrap = bootstrap;
170 * Creates a new NewickFile object.
// Stores the bootstrap and distances arguments in HasBootstrap and
// HasDistances. How newtree is used is not visible in this excerpt.
179 public NewickFile(SequenceNode newtree, boolean bootstrap,
183 HasBootstrap = bootstrap;
184 HasDistances = distances;
188 * Creates a new NewickFile object.
196 * @param rootdistance
// Stores all three flags: bootstrap into HasBootstrap, distances into
// HasDistances and rootdistance into RootHasDistance. How newtree is used is
// not visible in this excerpt.
199 public NewickFile(SequenceNode newtree, boolean bootstrap,
200 boolean distances, boolean rootdistance)
203 HasBootstrap = bootstrap;
204 HasDistances = distances;
205 RootHasDistance = rootdistance;
222 * @return DOCUMENT ME!
// Builds an error message. Any existing Error text comes first (empty when
// Error is null) and the message ends with an excerpt of s taken around
// position p. The middle of the concatenation is not visible in this excerpt.
224 private String ErrorStringrange(String Error, String Er, int r, int p,
227 return ((Error == null) ? "" : Error)
// The excerpt window runs from p - r to p + r, clamped to the bounds of s.
232 + s.substring(((p - r) < 0) ? 0 : (p - r),
233 ((p + r) > s.length()) ? s.length() : (p + r)) + " )\n";
237 // These are set automatically by the reader
238 public boolean HasBootstrap()
246 * @return DOCUMENT ME!
248 public boolean HasDistances()
// Returns the RootHasDistance flag, which parse() and the four-argument
// constructor assign.
253 public boolean HasRootDistance()
255 return RootHasDistance;
259 * parse the filesource as a newick file (new hampshire and/or extended)
261 * @throws IOException
262 * with a line number and character position for badly formatted NH
// Reads the source line by line via nextLine(), continues with the collected
// text in nf, and scans nf for the structural characters matched by majorsyms,
// building a tree of SequenceNode objects beneath a temporary root node.
// Scanning stops once Error is set. The method can throw IOException carrying
// either the Error text or the no-tree-read-in message. Finally the child on
// the right of the temporary root is detached and becomes the real root.
265 public void parse() throws IOException {
268 { // fill nf with complete tree file
270 StringBuffer file = new StringBuffer();
272 while ((nf = nextLine()) != null) {
276 nf = file.toString();
279 root = new SequenceNode();
281 SequenceNode realroot = null;
282 SequenceNode c = root;
286 // int flen = nf.length();
289 String nodename = null;
290 String commentString2 = null; // comments after simple node props
292 float DefDistance = (float) 0.001; // @param Default distance for a node -
294 int DefBootstrap = -1; // @param Default bootstrap for a node
296 float distance = DefDistance;
297 int bootstrap = DefBootstrap;
299 boolean ascending = false; // flag indicating that we are leaving the
// Characters that delimit newick tokens: open bracket, open square bracket,
// quote, close bracket, comma and semicolon.
302 RegExpInterface majorsyms = RegExp.newRegex("[(\\['),;]");
306 boolean parsednodename = false;
// Each pass handles the next delimiter found at or after cp.
307 while (majorsyms.searchFrom(nf, cp) && (Error == null)) {
308 int fcp = majorsyms.matchedFrom();
310 switch (schar = nf.charAt(fcp)) {
313 // ascending should not be set
316 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
// Descend: a new child node is created under c and becomes the current node.
324 if (c.right() == null) {
325 c.setRight(new SequenceNode(null, c, null, DefDistance, DefBootstrap,
327 c = (SequenceNode) c.right();
329 if (c.left() != null) {
330 // Dummy node for polytomy - keeps c.left free for new node
331 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
332 tmpn.SetChildren(c.left(), c.right());
336 c.setLeft(new SequenceNode(null, c, null, DefDistance, DefBootstrap,
338 c = (SequenceNode) c.left();
341 if (realroot == null) {
346 distance = DefDistance;
347 bootstrap = DefBootstrap;
352 // Deal with quoted fields
355 RegExpInterface qnodename = RegExp.newRegex("'([^']|'')+'");
357 if (qnodename.searchFrom(nf, fcp)) {
358 int nl = qnodename.stringMatched().length();
// Drop the enclosing quote characters from the matched text.
359 nodename = new String(qnodename.stringMatched().substring(1, nl - 1));
360 // unpack any escaped quotes (a doubled quote stands for a single one)
361 RegExpInterface xpandquotes = RegExp.perlCode("s/''/'/");
362 String widernodename = xpandquotes.replaceAll(nodename);
363 nodename = widernodename;
364 // jump to after end of quoted nodename
365 nextcp = fcp + nl + 1;
366 parsednodename = true;
368 Error = ErrorStringrange(Error, "Unterminated quotes for nodename",
377 Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d
380 // cp advanced at the end of default
383 // node string contains Comment or structured/extended NH format info
385 * if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1)) { // will
386 * process in remains System.err.println("skipped text:
387 * '"+nf.substring(cp,fcp)+"'"); }
389 // verify termination.
390 RegExpInterface comment = RegExp.newRegex("]");
391 if (comment.searchFrom(nf, fcp)) {
392 // Skip the comment field
393 nextcp = comment.matchedFrom() + 1;
394 warningMessage = "Tree file contained comments which may confuse input algorithm.";
397 // cp advanced at the end of default to nextcp, ncp is unchanged so
398 // any node info can be read.
400 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf);
405 // Parse simpler field strings
406 String fstring = nf.substring(ncp, fcp);
// The loop below cuts every square-bracket comment out of fstring and keeps
// the body of the last one in commentString2 for parseNHXNodeProps.
// NOTE(review): if fstring holds a closing square bracket with no opening one,
// cstart is -1 and substring(0, cstart) would throw -- confirm that such input
// cannot reach this point.
407 // remove any comments before we parse the node info
408 // TODO: test newick file with quoted square brackets in node name (is
410 while (fstring.indexOf(']') > -1) {
411 int cstart = fstring.indexOf('[');
412 int cend = fstring.indexOf(']');
413 commentString2 = fstring.substring(cstart + 1, cend);
414 fstring = fstring.substring(0, cstart) + fstring.substring(cend + 1);
417 RegExpInterface uqnodename = RegExp.newRegex("\\b([^' :;\\](),]+)");
418 RegExpInterface nbootstrap = RegExp.newRegex("\\s*([0-9+]+)\\s*:");
419 RegExpInterface ndist = RegExp.newRegex(":([-0-9Ee.+]+)");
422 && uqnodename.search(fstring)
423 && ((uqnodename.matchedFromI(1) == 0) || (fstring.charAt(uqnodename
424 .matchedFromI(1) - 1) != ':'))) // JBPNote
427 if (nodename == null) {
428 if (ReplaceUnderscores) {
429 nodename = uqnodename.stringMatchedI(1).replace('_', ' ');
431 nodename = uqnodename.stringMatchedI(1);
434 Error = ErrorStringrange(Error,
435 "File has broken algorithm - overwritten nodename", 10, fcp, nf);
438 // get comment bootstraps
440 if (nbootstrap.search(fstring)) {
// A bootstrap match identical to the node-name match means the text was a
// number, so it is not kept as a name.
441 if (nbootstrap.stringMatchedI(1).equals(uqnodename.stringMatchedI(1))) {
442 nodename = null; // no nodename here.
445 || nodename.length() == 0
446 || nbootstrap.matchedFromI(1) > (uqnodename.matchedFromI(1) + uqnodename
447 .stringMatched().length())) {
449 bootstrap = (new Integer(nbootstrap.stringMatchedI(1)))
452 } catch (Exception e) {
453 Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4,
454 ncp + nbootstrap.matchedFrom(), nf);
459 boolean nodehasdistance = false;
461 if (ndist.search(fstring)) {
463 distance = (new Float(ndist.stringMatchedI(1))).floatValue();
465 nodehasdistance = true;
466 } catch (Exception e) {
467 Error = ErrorStringrange(Error, "Can't parse node distance value",
468 7, ncp + ndist.matchedFrom(), nf);
473 // Write node info here
475 // Trees without distances still need a render distance
476 c.dist = (HasDistances) ? distance : DefDistance;
477 // be consistent for internal bootstrap defaults too
478 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
480 RootHasDistance = nodehasdistance; // JBPNote This is really
481 // UGLY!!! Ensure root node gets
482 // its given distance
484 parseNHXNodeProps(c, commentString2);
485 commentString2 = null;
487 // Find a place to put the leaf
488 SequenceNode newnode = new SequenceNode(null, c, nodename,
489 (HasDistances) ? distance : DefDistance,
490 (HasBootstrap) ? bootstrap : DefBootstrap, false);
// NOTE(review): the NHX properties are handed to c, the parent, rather than to
// the freshly created newnode -- confirm this is intended.
491 parseNHXNodeProps(c, commentString2);
492 commentString2 = null;
494 if (c.right() == null) {
497 if (c.left() == null) {
500 // Insert a dummy node for polytomy
501 // dummy nodes have distances
502 SequenceNode newdummy = new SequenceNode(null, c, null,
503 (HasDistances ? 0 : DefDistance), 0, true);
504 newdummy.SetChildren(c.left(), newnode);
511 // move back up the tree from preceding closure
514 if ((d > -1) && (c == null)) {
515 Error = ErrorStringrange(
517 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
522 if (nf.charAt(fcp) == ')') {
526 if (nf.charAt(fcp) == ',') {
530 // Just advance focus, if we need to
531 if ((c.left() != null) && (!c.left().isLeaf())) {
532 c = (SequenceNode) c.left();
538 // Reset new node properties to obvious fakes
540 distance = DefDistance;
541 bootstrap = DefBootstrap;
542 commentString2 = null;
543 parsednodename = false;
554 throw (new IOException(MessageManager.formatMessage("exception.newfile",
555 new String[] { Error.toString() })));
558 throw (new IOException(MessageManager.formatMessage("exception.newfile",
559 new String[] { MessageManager.getString("label.no_tree_read_in") })));
561 // The next line is failing for topali trees - not sure why yet. if
562 // (root.right()!=null && root.isDummy())
563 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
// When the input gave the root no distance, fall back to 0 (trees with
// distances) or the default render distance.
565 if (!RootHasDistance) {
566 root.dist = (HasDistances) ? 0 : DefDistance;
571 * parse NHX codes in comment strings and update NewickFile state flags for
572 * distances and bootstraps, and add any additional properties onto the node.
575 * @param commentString
576 * @param commentString2
// Interprets an NHX comment string. Nothing is done unless commentString is
// non-null and starts with the NHX marker. The text after the marker is
// tokenised and each token is split at its first equals sign into a code and a
// value. The only code handled in the visible lines is b (case-insensitive),
// which is read as a Float and truncated to an int.
// Parse failures are reported on System.err and not rethrown in the lines
// shown here.
578 private void parseNHXNodeProps(SequenceNode c, String commentString)
580 // TODO: store raw comment on the sequenceNode so it can be recovered when
582 if (commentString != null && commentString.startsWith("&&NHX"))
// Skip the five-character marker before tokenising.
584 StringTokenizer st = new StringTokenizer(commentString.substring(5),
586 while (st.hasMoreTokens())
588 String tok = st.nextToken();
589 int colpos = tok.indexOf("=");
593 String code = tok.substring(0, colpos);
594 String value = tok.substring(colpos + 1);
597 // parse out code/value pairs
598 if (code.toLowerCase().equals("b"))
601 Float iv = new Float(value);
602 v = iv.intValue(); // jalview only does integer bootstraps
608 } catch (Exception e)
610 System.err.println("Couldn't parse code '" + code + "' = '"
612 e.printStackTrace(System.err);
623 * @return DOCUMENT ME!
625 public SequenceNode getTree()
631 * Generate a newick format tree according to internal flags for bootstraps,
632 * distances and root distances.
634 * @return new hampshire tree in a single line
// Returns the contents of a fresh buffer terminated with a semicolon. The
// call that fills the buffer is not visible in this excerpt -- presumably
// print(tf, root); confirm.
636 public String print()
640 StringBuffer tf = new StringBuffer();
643 return (tf.append(";").toString());
650 * Generate a newick format tree according to internal flags for distances and
651 * root distances and user specified writing of bootstraps.
653 * @param withbootstraps
654 * controls if bootstrap values are explicitly written.
656 * @return new hampshire tree in a single line
// Temporarily replaces the HasBootstrap field with withbootstraps, then puts
// the saved value back. The printing call between the two assignments is not
// visible in this excerpt.
658 public String print(boolean withbootstraps)
662 boolean boots = this.HasBootstrap;
663 this.HasBootstrap = withbootstraps;
666 this.HasBootstrap = boots;
674 * Generate newick format tree according to internal flags for writing root
677 * @param withbootstraps
678 * explicitly write bootstrap values
680 * explicitly write distances
682 * @return new hampshire tree in a single line
// Temporarily replaces the HasDistances field with withdists, delegates to
// print(withbootstraps), then restores the saved field value.
684 public String print(boolean withbootstraps, boolean withdists)
688 boolean dists = this.HasDistances;
689 this.HasDistances = withdists;
691 String rv = print(withbootstraps);
692 this.HasDistances = dists;
699 * Generate newick format tree according to user specified flags
701 * @param withbootstraps
702 * explicitly write bootstrap values
704 * explicitly write distances
705 * @param printRootInfo
706 * explicitly write root distance
708 * @return new hampshire tree in a single line
// Sets the printRootInfo field from the argument, then delegates to
// print(withbootstraps, withdists).
710 public String print(boolean withbootstraps, boolean withdists,
711 boolean printRootInfo)
// NOTE(review): inside this method the bare name printRootInfo is the
// parameter, so rootinfo holds the new value rather than the previous field
// value, and the assignment after the print call leaves the field set to the
// parameter. The sibling overloads save this.HasBootstrap and
// this.HasDistances instead -- confirm whether this.printRootInfo was meant.
715 boolean rootinfo = printRootInfo;
716 this.printRootInfo = printRootInfo;
718 String rv = print(withbootstraps, withdists);
719 this.printRootInfo = rootinfo;
728 * @return DOCUMENT ME!
741 * @return DOCUMENT ME!
// Captures the current QuoteChar in old. The remaining statements are not
// visible in this excerpt -- presumably QuoteChar is set to c and old is
// returned; confirm.
743 char setQuoteChar(char c)
745 char old = QuoteChar;
757 * @return DOCUMENT ME!
// Makes a node name safe for output. When the first NodeSafeName pattern
// matches, the name is rewritten with the second pattern and wrapped in
// QuoteChar. Otherwise the third pattern is applied and the result is returned
// unquoted.
759 private String nodeName(String name)
761 if (NodeSafeName[0].search(name))
763 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
767 return NodeSafeName[2].replaceAll(name);
777 * @return DOCUMENT ME!
// Formats the text for one node. The parts are, in order:
// - the safe node name, or nothing when the name is null;
// - when HasBootstrap is set and the bootstrap is greater than -1, the
//   bootstrap value, preceded by a space if the node has a name;
// - when HasDistances is set, a colon followed by the node distance.
779 private String printNodeField(SequenceNode c)
781 return ((c.getName() == null) ? "" : nodeName(c.getName()))
782 + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? ((c.getName() != null ? " "
783 : "") + c.getBootstrap())
785 : "") + ((HasDistances) ? (":" + c.dist) : "");
794 * @return DOCUMENT ME!
// Formats the text for the root node when printRootInfo is set. The layout
// matches printNodeField, except that the distance part is controlled by
// RootHasDistance. The alternatives for the unset cases are not visible in
// this excerpt.
// The doubled plus before root.getBootstrap() is a unary plus on the value and
// does not change it.
796 private String printRootField(SequenceNode root)
798 return (printRootInfo) ? (((root.getName() == null) ? ""
799 : nodeName(root.getName()))
800 + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? ((root
801 .getName() != null ? " " : "") + +root.getBootstrap())
802 : "") : "") + ((RootHasDistance) ? (":" + root.dist)
806 // Non recursive call deals with root node properties
// Writes the tree below root into tf. A leaf root is written with
// printRootField when printRootInfo is set. Otherwise the children go through
// _print and the closing bracket is followed by the root field. The branch
// conditions separating the two groups of _print calls are not visible in this
// excerpt.
807 public void print(StringBuffer tf, SequenceNode root)
811 if (root.isLeaf() && printRootInfo)
813 tf.append(printRootField(root));
819 _print(tf, (SequenceNode) root.right());
820 _print(tf, (SequenceNode) root.left());
825 _print(tf, (SequenceNode) root.right());
827 if (root.left() != null)
832 _print(tf, (SequenceNode) root.left());
833 tf.append(")" + printRootField(root));
839 // Recursive call for non-root nodes
// Appends the text for node c and its descendants to tf. The node field comes
// from printNodeField. Child subtrees are written by recursive calls, and a
// closing bracket precedes the field of an internal node. The conditions that
// choose between the call sequences below are not visible in this excerpt.
840 public void _print(StringBuffer tf, SequenceNode c)
846 tf.append(printNodeField(c));
852 _print(tf, (SequenceNode) c.left());
853 if (c.left() != null)
857 _print(tf, (SequenceNode) c.right());
862 _print(tf, (SequenceNode) c.right());
864 if (c.left() != null)
869 _print(tf, (SequenceNode) c.left());
870 tf.append(")" + printNodeField(c));
// Command-line check. Requires exactly one argument, the name of a newick
// tree file. It reads the file text into a buffer, builds a NewickFile from the
// same path, echoes the original text and then prints the tree under each
// combination of the bootstrap and distance flags. An IOException is reported
// on System.err.
881 public static void main(String[] args)
885 if (args == null || args.length != 1)
888 .println("Takes one argument - file name of a newick tree file.");
892 File fn = new File(args[0]);
894 StringBuffer newickfile = new StringBuffer();
// NOTE(review): no close() call on treefile is visible in this excerpt --
// confirm that the reader is closed after the loop.
895 BufferedReader treefile = new BufferedReader(new FileReader(fn));
898 while ((l = treefile.readLine()) != null)
900 newickfile.append(l);
904 System.out.println("Read file :\n");
906 NewickFile trf = new NewickFile(args[0], "File");
908 System.out.println("Original file :\n");
910 RegExpInterface nonl = RegExp.newRegex("\n+", "");
911 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
913 System.out.println("Parsed file.\n");
914 System.out.println("Default output type for original input.\n");
915 System.out.println(trf.print());
916 System.out.println("Without bootstraps.\n");
917 System.out.println(trf.print(false));
918 System.out.println("Without distances.\n");
919 System.out.println(trf.print(true, false));
920 System.out.println("Without bootstraps but with distanecs.\n");
921 System.out.println(trf.print(false, true));
922 System.out.println("Without bootstraps or distanecs.\n");
923 System.out.println(trf.print(false, false));
924 System.out.println("With bootstraps and with distances.\n");
925 System.out.println(trf.print(true, true));
926 } catch (java.io.IOException e)
928 System.err.println("Exception\n" + e);