2 * Jalview - A Sequence Alignment Editor and Viewer (Version 2.6)
3 * Copyright (C) 2010 J Procter, AM Waterhouse, G Barton, M Clamp, S Searle
5 * This file is part of Jalview.
7 * Jalview is free software: you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 * Jalview is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along with Jalview. If not, see <http://www.gnu.org/licenses/>.
20 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
21 // TODO: Implement Basic NHX tag parsing and preservation
22 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
23 // TODO: Extended SequenceNodeI to hold parsed NHX strings
27 import java.util.StringTokenizer;
29 import jalview.datamodel.*;
32 * Parse a New Hampshire style tree. Caveats: NHX files are NOT supported and the
33 * tree distances and topology are unreliable when they are parsed. TODO: on
34 * this: NHX codes are appended in comments beginning with &&NHX. The codes are
35 * given below (from http://www.phylosoft.org/forester/NHX.html): Element Type
36 * Description Corresponding phyloXML element (parent element in parentheses) no
37 * tag string name of this node/clade (MUST BE FIRST, IF ASSIGNED)
38 * <name>(<clade>) : decimal branch length to parent node (MUST BE SECOND, IF
39 * ASSIGNED) <branch_length>(<clade>) :GN= string gene name <name>(<sequence>)
40 * :AC= string sequence accession <accession>(<sequence>) :ND= string node
41 * identifier - if this is being used, it has to be unique within each phylogeny
42 * <node_id>(<clade>) :B= decimal confidence value for parent branch
43 * <confidence>(<clade>) :D= 'T', 'F', or '?' 'T' if this node represents a
44 * duplication event - 'F' if this node represents a speciation event, '?' if
45 * this node represents an unknown event (D= tag should be replaced by Ev= tag)
46 * n/a :Ev=duplications>speciations>gene losses>event type>duplication type int
47 * int int string string event (replaces the =D tag), number of duplication,
48 * speciation, and gene loss events, type of event (transfer, fusion, root,
49 * unknown, other, speciation_duplication_loss, unassigned) <events>(<clade>)
50 * :E= string EC number at this node <annotation>(<sequence>) :Fu= string
51 * function at this node <annotation>(<sequence>)
52 * :DS=protein-length>from>to>support>name>from>... int int int double string
53 * int ... domain structure at this node <domain_architecture>(<sequence>) :S=
54 * string species name of the species/phylum at this node <taxonomy>(<clade>)
55 * :T= integer taxonomy ID of the species/phylum at this node <id>(<taxonomy>)
56 * :W= integer width of parent branch <width>(<clade>) :C=rrr.ggg.bbb
57 * integer.integer.integer color of parent branch <color>(<clade>) :Co= 'Y' or
58 * 'N' collapse this node when drawing the tree (default is not to collapse) n/a
59 * :XB= string custom data associated with a branch <property>(<clade>) :XN=
60 * string custom data associated with a node <property>(<clade>) :O= integer
61 * orthologous to this external node n/a :SN= integer subtree neighbors n/a :SO=
62 * integer super orthologous (no duplications on paths) to this external node
68 public class NewickFile extends FileParse
72 private boolean HasBootstrap = false;
74 private boolean HasDistances = false;
76 private boolean RootHasDistance = false;
79 boolean ReplaceUnderscores = false;
81 boolean printRootInfo = true;
83 private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[]
84 { new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for
87 new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote
89 new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unqoted whitespace
93 char QuoteChar = '\'';
/**
 * Creates a new NewickFile object from a string, handing it to the FileParse
 * constructor with the source type "Paste".
 *
 * @param inStr
 *          the string forwarded to FileParse (presumably the pasted tree
 *          text - confirm against FileParse)
 *
 * @throws IOException
 *           if the FileParse constructor throws it
 */
public NewickFile(String inStr) throws IOException
{
  super(inStr, "Paste");
}
110 * Creates a new NewickFile object.
117 * @throws IOException
120 public NewickFile(String inFile, String type) throws IOException
125 public NewickFile(FileParse source) throws IOException
131 * Creates a new NewickFile object.
136 public NewickFile(SequenceNode newtree)
142 * Creates a new NewickFile object.
149 public NewickFile(SequenceNode newtree, boolean bootstrap)
151 HasBootstrap = bootstrap;
156 * Creates a new NewickFile object.
165 public NewickFile(SequenceNode newtree, boolean bootstrap,
169 HasBootstrap = bootstrap;
170 HasDistances = distances;
174 * Creates a new NewickFile object.
182 * @param rootdistance
185 public NewickFile(SequenceNode newtree, boolean bootstrap,
186 boolean distances, boolean rootdistance)
189 HasBootstrap = bootstrap;
190 HasDistances = distances;
191 RootHasDistance = rootdistance;
208 * @return DOCUMENT ME!
210 private String ErrorStringrange(String Error, String Er, int r, int p,
213 return ((Error == null) ? "" : Error)
218 + s.substring(((p - r) < 0) ? 0 : (p - r),
219 ((p + r) > s.length()) ? s.length() : (p + r)) + " )\n";
223 // These are set automatically by the reader
224 public boolean HasBootstrap()
232 * @return DOCUMENT ME!
234 public boolean HasDistances()
239 public boolean HasRootDistance()
241 return RootHasDistance;
245 * parse the filesource as a newick file (new hampshire and/or extended)
247 * @throws IOException
248 * with a line number and character position for badly formatted NH
251 public void parse() throws IOException
255 { // fill nf with complete tree file
257 StringBuffer file = new StringBuffer();
259 while ((nf = nextLine()) != null)
264 nf = file.toString();
267 root = new SequenceNode();
269 SequenceNode realroot = null;
270 SequenceNode c = root;
274 // int flen = nf.length();
277 String nodename = null;
278 String commentString2 = null; // comments after simple node props
280 float DefDistance = (float) 0.001; // @param Default distance for a node -
282 int DefBootstrap = -1; // @param Default bootstrap for a node
284 float distance = DefDistance;
285 int bootstrap = DefBootstrap;
287 boolean ascending = false; // flag indicating that we are leaving the
290 com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(
295 while (majorsyms.searchFrom(nf, cp) && (Error == null))
297 int fcp = majorsyms.matchedFrom();
299 switch (schar = nf.charAt(fcp))
303 // ascending should not be set
307 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
315 if (c.right() == null)
317 c.setRight(new SequenceNode(null, c, null, DefDistance,
318 DefBootstrap, false));
319 c = (SequenceNode) c.right();
323 if (c.left() != null)
325 // Dummy node for polytomy - keeps c.left free for new node
326 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
327 tmpn.SetChildren(c.left(), c.right());
331 c.setLeft(new SequenceNode(null, c, null, DefDistance,
332 DefBootstrap, false));
333 c = (SequenceNode) c.left();
336 if (realroot == null)
342 distance = DefDistance;
343 bootstrap = DefBootstrap;
348 // Deal with quoted fields
351 com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(
354 if (qnodename.searchFrom(nf, fcp))
356 int nl = qnodename.stringMatched().length();
357 nodename = new String(qnodename.stringMatched().substring(0,
363 Error = ErrorStringrange(Error,
364 "Unterminated quotes for nodename", 7, fcp, nf);
374 Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d
377 // cp advanced at the end of default
381 // node string contains Comment or structured/extended NH format info
383 * if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1)) { // will
384 * process in remains System.err.println("skipped text:
385 * '"+nf.substring(cp,fcp)+"'"); }
387 // verify termination.
388 com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex("]");
389 if (comment.searchFrom(nf, fcp))
391 // Skip the comment field
392 nextcp = comment.matchedFrom() + 1;
393 warningMessage = "Tree file contained comments which may confuse input algorithm.";
396 // cp advanced at the end of default to nextcp, ncp is unchanged so
397 // any node info can be read.
401 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp,
407 // Parse simpler field strings
408 String fstring = nf.substring(ncp, fcp);
409 // remove any comments before we parse the node info
410 // TODO: test newick file with quoted square brackets in node name (is
412 while (fstring.indexOf(']') > -1)
414 int cstart = fstring.indexOf('[');
415 int cend = fstring.indexOf(']');
416 commentString2 = fstring.substring(cstart + 1, cend);
417 fstring = fstring.substring(0, cstart)
418 + fstring.substring(cend + 1);
421 com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
422 "\\b([^' :;\\](),]+)");
423 com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
424 "\\s*([0-9+]+)\\s*:");
425 com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
428 if (uqnodename.search(fstring)
429 && ((uqnodename.matchedFrom(1) == 0) || (fstring
430 .charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote
433 if (nodename == null)
435 if (ReplaceUnderscores)
437 nodename = uqnodename.stringMatched(1).replace('_', ' ');
441 nodename = uqnodename.stringMatched(1);
446 Error = ErrorStringrange(Error,
447 "File has broken algorithm - overwritten nodename", 10,
451 // get comment bootstraps
453 if (nbootstrap.search(fstring))
455 if (nbootstrap.stringMatched(1).equals(
456 uqnodename.stringMatched(1)))
458 nodename = null; // no nodename here.
461 || nodename.length() == 0
462 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) + uqnodename
463 .stringMatched().length()))
467 bootstrap = (new Integer(nbootstrap.stringMatched(1)))
470 } catch (Exception e)
472 Error = ErrorStringrange(Error,
473 "Can't parse bootstrap value", 4,
474 ncp + nbootstrap.matchedFrom(), nf);
479 boolean nodehasdistance = false;
481 if (ndist.search(fstring))
485 distance = (new Float(ndist.stringMatched(1))).floatValue();
487 nodehasdistance = true;
488 } catch (Exception e)
490 Error = ErrorStringrange(Error,
491 "Can't parse node distance value", 7,
492 ncp + ndist.matchedFrom(), nf);
498 // Write node info here
500 // Trees without distances still need a render distance
501 c.dist = (HasDistances) ? distance : DefDistance;
502 // be consistent for internal bootstrap defaults too
503 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
506 RootHasDistance = nodehasdistance; // JBPNote This is really
507 // UGLY!!! Ensure root node gets
508 // its given distance
510 parseNHXNodeProps(c, commentString2);
511 commentString2 = null;
515 // Find a place to put the leaf
516 SequenceNode newnode = new SequenceNode(null, c, nodename,
517 (HasDistances) ? distance : DefDistance,
518 (HasBootstrap) ? bootstrap : DefBootstrap, false);
519 parseNHXNodeProps(c, commentString2);
520 commentString2 = null;
522 if (c.right() == null)
528 if (c.left() == null)
534 // Insert a dummy node for polytomy
535 // dummy nodes have distances
536 SequenceNode newdummy = new SequenceNode(null, c, null,
537 (HasDistances ? 0 : DefDistance), 0, true);
538 newdummy.SetChildren(c.left(), newnode);
546 // move back up the tree from preceding closure
549 if ((d > -1) && (c == null))
551 Error = ErrorStringrange(
553 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
558 if (nf.charAt(fcp) == ')')
565 if (nf.charAt(fcp) == ',')
573 // Just advance focus, if we need to
574 if ((c.left() != null) && (!c.left().isLeaf()))
576 c = (SequenceNode) c.left();
582 // Reset new node properties to obvious fakes
584 distance = DefDistance;
585 bootstrap = DefBootstrap;
586 commentString2 = null;
601 throw (new IOException("NewickFile: " + Error + "\n"));
605 throw (new IOException("NewickFile: No Tree read in\n"));
607 // The next line is failing for topali trees - not sure why yet. if
608 // (root.right()!=null && root.isDummy())
609 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
611 if (!RootHasDistance)
613 root.dist = (HasDistances) ? 0 : DefDistance;
618 * parse NHX codes in comment strings and update NewickFile state flags for
619 * distances and bootstraps, and add any additional properties onto the node.
622 * @param commentString
623 * @param commentString2
625 private void parseNHXNodeProps(SequenceNode c, String commentString)
627 // TODO: store raw comment on the sequenceNode so it can be recovered when
629 if (commentString != null && commentString.startsWith("&&NHX"))
631 StringTokenizer st = new StringTokenizer(commentString.substring(5),
633 while (st.hasMoreTokens())
635 String tok = st.nextToken();
636 int colpos = tok.indexOf("=");
640 String code = tok.substring(0, colpos);
641 String value = tok.substring(colpos + 1);
644 // parse out code/value pairs
645 if (code.toLowerCase().equals("b"))
648 Float iv = new Float(value);
649 v = iv.intValue(); // jalview only does integer bootstraps
655 } catch (Exception e)
657 System.err.println("Couldn't parse code '" + code + "' = '"
659 e.printStackTrace(System.err);
670 * @return DOCUMENT ME!
672 public SequenceNode getTree()
678 * Generate a newick format tree according to internal flags for bootstraps,
679 * distances and root distances.
681 * @return new hampshire tree in a single line
683 public String print()
687 StringBuffer tf = new StringBuffer();
690 return (tf.append(";").toString());
697 * Generate a newick format tree according to internal flags for distances and
698 * root distances and user specificied writing of bootstraps.
700 * @param withbootstraps
701 * controls if bootstrap values are explicitly written.
703 * @return new hampshire tree in a single line
705 public String print(boolean withbootstraps)
709 boolean boots = this.HasBootstrap;
710 this.HasBootstrap = withbootstraps;
713 this.HasBootstrap = boots;
721 * Generate newick format tree according to internal flags for writing root
724 * @param withbootstraps
725 * explicitly write bootstrap values
727 * explicitly write distances
729 * @return new hampshire tree in a single line
731 public String print(boolean withbootstraps, boolean withdists)
735 boolean dists = this.HasDistances;
736 this.HasDistances = withdists;
738 String rv = print(withbootstraps);
739 this.HasDistances = dists;
746 * Generate newick format tree according to user specified flags
748 * @param withbootstraps
749 * explicitly write bootstrap values
751 * explicitly write distances
752 * @param printRootInfo
753 * explicitly write root distance
755 * @return new hampshire tree in a single line
757 public String print(boolean withbootstraps, boolean withdists,
758 boolean printRootInfo)
762 boolean rootinfo = printRootInfo;
763 this.printRootInfo = printRootInfo;
765 String rv = print(withbootstraps, withdists);
766 this.printRootInfo = rootinfo;
775 * @return DOCUMENT ME!
788 * @return DOCUMENT ME!
790 char setQuoteChar(char c)
792 char old = QuoteChar;
804 * @return DOCUMENT ME!
806 private String nodeName(String name)
808 if (NodeSafeName[0].search(name))
810 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
814 return NodeSafeName[2].replaceAll(name);
824 * @return DOCUMENT ME!
826 private String printNodeField(SequenceNode c)
828 return ((c.getName() == null) ? "" : nodeName(c.getName()))
829 + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? ((c.getName() != null ? " "
830 : "") + c.getBootstrap())
832 : "") + ((HasDistances) ? (":" + c.dist) : "");
841 * @return DOCUMENT ME!
843 private String printRootField(SequenceNode root)
845 return (printRootInfo) ? (((root.getName() == null) ? ""
846 : nodeName(root.getName()))
847 + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? ((root
848 .getName() != null ? " " : "") + +root.getBootstrap())
849 : "") : "") + ((RootHasDistance) ? (":" + root.dist)
853 // Non recursive call deals with root node properties
854 public void print(StringBuffer tf, SequenceNode root)
858 if (root.isLeaf() && printRootInfo)
860 tf.append(printRootField(root));
866 _print(tf, (SequenceNode) root.right());
867 _print(tf, (SequenceNode) root.left());
872 _print(tf, (SequenceNode) root.right());
874 if (root.left() != null)
879 _print(tf, (SequenceNode) root.left());
880 tf.append(")" + printRootField(root));
886 // Recursive call for non-root nodes
887 public void _print(StringBuffer tf, SequenceNode c)
893 tf.append(printNodeField(c));
899 _print(tf, (SequenceNode) c.left());
900 if (c.left() != null)
904 _print(tf, (SequenceNode) c.right());
909 _print(tf, (SequenceNode) c.right());
911 if (c.left() != null)
916 _print(tf, (SequenceNode) c.left());
917 tf.append(")" + printNodeField(c));
924 public static void main(String[] args)
928 if (args == null || args.length != 1)
931 .println("Takes one argument - file name of a newick tree file.");
935 File fn = new File(args[0]);
937 StringBuffer newickfile = new StringBuffer();
938 BufferedReader treefile = new BufferedReader(new FileReader(fn));
941 while ((l = treefile.readLine()) != null)
943 newickfile.append(l);
947 System.out.println("Read file :\n");
949 NewickFile trf = new NewickFile(args[0], "File");
951 System.out.println("Original file :\n");
953 com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");
954 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
956 System.out.println("Parsed file.\n");
957 System.out.println("Default output type for original input.\n");
958 System.out.println(trf.print());
959 System.out.println("Without bootstraps.\n");
960 System.out.println(trf.print(false));
961 System.out.println("Without distances.\n");
962 System.out.println(trf.print(true, false));
963 System.out.println("Without bootstraps but with distanecs.\n");
964 System.out.println(trf.print(false, true));
965 System.out.println("Without bootstraps or distanecs.\n");
966 System.out.println(trf.print(false, false));
967 System.out.println("With bootstraps and with distances.\n");
968 System.out.println(trf.print(true, true));
969 } catch (java.io.IOException e)
971 System.err.println("Exception\n" + e);