2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
23 // TODO: Implement Basic NHX tag parsing and preservation
24 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
25 // TODO: Extend SequenceNodeI to hold parsed NHX strings
29 import java.util.StringTokenizer;
31 import jalview.datamodel.*;
34 * Parse a New Hampshire style tree. Caveats: NHX files are NOT supported and the
35 * tree distances and topology are unreliable when they are parsed. TODO: on
36 * this: NHX codes are appended in comments beginning with &&NHX. The codes are
37 * given below (from http://www.phylosoft.org/forester/NHX.html): Element Type
38 * Description Corresponding phyloXML element (parent element in parentheses) no
39 * tag string name of this node/clade (MUST BE FIRST, IF ASSIGNED) <name>(<clade>) :
40 * decimal branch length to parent node (MUST BE SECOND, IF ASSIGNED)
41 * <branch_length>(<clade>) :GN= string gene name <name>(<sequence>) :AC=
42 * string sequence accession <accession>(<sequence>) :ND= string node
43 * identifier - if this is being used, it has to be unique within each phylogeny
44 * <node_id>(<clade>) :B= decimal confidence value for parent branch
45 * <confidence>(<clade>) :D= 'T', 'F', or '?' 'T' if this node represents a
46 * duplication event - 'F' if this node represents a speciation event, '?' if
47 * this node represents an unknown event (D= tag should be replaced by Ev= tag)
48 * n/a :Ev=duplications>speciations>gene losses>event type>duplication type int
49 * int int string string event (replaces the =D tag), number of duplication,
50 * speciation, and gene loss events, type of event (transfer, fusion, root,
51 * unknown, other, speciation_duplication_loss, unassigned) <events>(<clade>)
52 * :E= string EC number at this node <annotation>(<sequence>) :Fu= string
53 * function at this node <annotation>(<sequence>)
54 * :DS=protein-length>from>to>support>name>from>... int int int double string
55 * int ... domain structure at this node <domain_architecture>(<sequence>) :S=
56 * string species name of the species/phylum at this node <taxonomy>(<clade>)
57 * :T= integer taxonomy ID of the species/phylum at this node <id>(<taxonomy>)
58 * :W= integer width of parent branch <width>(<clade>) :C=rrr.ggg.bbb
59 * integer.integer.integer color of parent branch <color>(<clade>) :Co= 'Y' or
60 * 'N' collapse this node when drawing the tree (default is not to collapse) n/a
61 * :XB= string custom data associated with a branch <property>(<clade>) :XN=
62 * string custom data associated with a node <property>(<clade>) :O= integer
63 * orthologous to this external node n/a :SN= integer subtree neighbors n/a :SO=
64 * integer super orthologous (no duplications on paths) to this external node
// Reader/writer for Newick (New Hampshire) format trees, built on FileParse.
70 public class NewickFile extends FileParse
// Format flags: assigned from constructor arguments and consulted by parse()
// and the print helpers to decide whether bootstraps/distances are used.
74 private boolean HasBootstrap = false;
76 private boolean HasDistances = false;
// Whether the root node carries its own distance; see HasRootDistance().
78 private boolean RootHasDistance = false;
// When true, parse() replaces '_' with ' ' in unquoted node names.
81 boolean ReplaceUnderscores = false;
// Gates the output of printRootField() and the leaf-root case of
// print(StringBuffer, SequenceNode).
83 boolean printRootInfo = true;
// Patterns used by nodeName(): [0] is searched to decide whether a name must be
// quoted, [1] is applied to names that get quoted, [2] to names left unquoted.
85 private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[]
86 { new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for
89 new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote
// NOTE(review): as written this substitution appears to match the literal text
// "/w" rather than whitespace; the trailing comment suggests \s was intended -
// confirm before relying on it.
91 new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unquoted whitespace
// Character that nodeName() wraps around quoted names; see setQuoteChar().
95 char QuoteChar = '\'';
98 * Creates a new NewickFile object.
103 * @throws IOException
// Builds a parser over Newick text supplied directly as a string: the text is
// passed to the FileParse constructor with the source type "Paste".
106 public NewickFile(String inStr) throws IOException
108 super(inStr, "Paste");
112 * Creates a new NewickFile object.
119 * @throws IOException
// Builds a parser for the source named by inFile. The body is not visible in
// this excerpt; main() calls this as NewickFile(args[0], "File"), so 'type'
// presumably names the kind of source - confirm against FileParse.
122 public NewickFile(String inFile, String type) throws IOException
// Builds a parser from an existing FileParse source. The body is not visible
// in this excerpt; presumably it delegates to the superclass - confirm.
127 public NewickFile(FileParse source) throws IOException
133 * Creates a new NewickFile object.
// Takes an already-built tree instead of a text source. The body is not
// visible in this excerpt; presumably newtree becomes the tree that the print
// methods write out - confirm.
138 public NewickFile(SequenceNode newtree)
144 * Creates a new NewickFile object.
// Takes an already-built tree plus a flag saying whether it carries bootstrap
// values (how newtree itself is stored is on lines not visible here).
151 public NewickFile(SequenceNode newtree, boolean bootstrap)
// Recorded for the print helpers, which test HasBootstrap.
153 HasBootstrap = bootstrap;
158 * Creates a new NewickFile object.
// Takes an already-built tree plus flags for bootstraps and distances. The
// rest of the signature (declaring 'distances') is on a line not visible here.
167 public NewickFile(SequenceNode newtree, boolean bootstrap,
// Both flags are recorded for the print helpers.
171 HasBootstrap = bootstrap;
172 HasDistances = distances;
176 * Creates a new NewickFile object.
184 * @param rootdistance
// Takes an already-built tree plus all three format flags: bootstraps present,
// distances present, and whether the root node has its own distance.
187 public NewickFile(SequenceNode newtree, boolean bootstrap,
188 boolean distances, boolean rootdistance)
// The flags are copied into the matching fields read by the print helpers.
191 HasBootstrap = bootstrap;
192 HasDistances = distances;
193 RootHasDistance = rootdistance;
210 * @return DOCUMENT ME!
// Builds a parse-error message. The final parameter (s, the text being parsed)
// is declared on a signature line not visible in this excerpt.
212 private String ErrorStringrange(String Error, String Er, int r, int p,
// A null Error is treated as empty, so messages can be accumulated.
215 return ((Error == null) ? "" : Error)
// The middle of the message is on lines not visible here. It ends with the
// slice of s from p - r to p + r, clamped to the bounds of s, then " )\n".
220 + s.substring(((p - r) < 0) ? 0 : (p - r), ((p + r) > s
221 .length()) ? s.length() : (p + r)) + " )\n";
225 // These are set automatically by the reader
// Accessor named after the HasBootstrap flag; its body is not visible in this
// excerpt (presumably it returns that field - confirm).
226 public boolean HasBootstrap()
234 * @return DOCUMENT ME!
// Accessor named after the HasDistances flag; its body is not visible in this
// excerpt (presumably it returns that field - confirm).
236 public boolean HasDistances()
// Reports whether the root node carries its own distance.
241 public boolean HasRootDistance()
243 return RootHasDistance;
247 * parse the filesource as a newick file (new hampshire and/or extended)
249 * @throws IOException
250 * with a line number and character position for badly
251 * formatted NH strings
// Reads the whole source and parses it as a Newick tree hung under a temporary
// root, recording the first problem found in Error and raising it as an
// IOException. Several names used below (nf, cp, ncp, nextcp, d, schar, Error)
// are declared on lines that are not visible in this excerpt.
253 public void parse() throws IOException
257 { // fill nf with complete tree file
259 StringBuffer file = new StringBuffer();
261 while ((nf = nextLine()) != null)
266 nf = file.toString();
// Temporary root above the real tree; it is detached again at the end.
269 root = new SequenceNode();
271 SequenceNode realroot = null;
// c is the node currently being filled in.
272 SequenceNode c = root;
276 // int flen = nf.length();
279 String nodename = null;
280 String commentString2 = null; // comments after simple node props
282 float DefDistance = (float) 0.001; // @param Default distance for a node -
284 int DefBootstrap = -1; // @param Default bootstrap for a node
// Values for the node being read; reset to the defaults after each node.
286 float distance = DefDistance;
287 int bootstrap = DefBootstrap;
289 boolean ascending = false; // flag indicating that we are leaving the
// Pattern locating the next structural character (its text is not visible here).
292 com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(
// Main scan: move from one structural character to the next until nothing more
// matches or an error has been recorded.
297 while (majorsyms.searchFrom(nf, cp) && (Error == null))
299 int fcp = majorsyms.matchedFrom();
// Dispatch on the character found; the case labels are on lines not shown.
301 switch (schar = nf.charAt(fcp))
305 // ascending should not be set
309 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
// Open a new child and move into it. The right slot is used when it is free
// (the else/brace structure around these statements is not visible here).
317 if (c.right() == null)
319 c.setRight(new SequenceNode(null, c, null, DefDistance,
320 DefBootstrap, false));
321 c = (SequenceNode) c.right();
325 if (c.left() != null)
327 // Dummy node for polytomy - keeps c.left free for new node
328 SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
329 tmpn.SetChildren(c.left(), c.right());
333 c.setLeft(new SequenceNode(null, c, null, DefDistance,
334 DefBootstrap, false));
335 c = (SequenceNode) c.left();
// Presumably remembers the first real node as the tree's root - the assignment
// is on a line not visible here.
338 if (realroot == null)
344 distance = DefDistance;
345 bootstrap = DefBootstrap;
350 // Deal with quoted fields
353 com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(
356 if (qnodename.searchFrom(nf, fcp))
358 int nl = qnodename.stringMatched().length();
359 nodename = new String(qnodename.stringMatched().substring(0,
365 Error = ErrorStringrange(Error,
366 "Unterminated quotes for nodename", 7, fcp, nf);
376 Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d
379 // cp advanced at the end of default
383 // node string contains Comment or structured/extended NH format info
385 * if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1)) { // will
386 * process in remains System.err.println("skipped text:
387 * '"+nf.substring(cp,fcp)+"'"); }
389 // verify termination.
390 com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex("]");
391 if (comment.searchFrom(nf, fcp))
393 // Skip the comment field
394 nextcp = comment.matchedFrom() + 1;
395 warningMessage = "Tree file contained comments which may confuse input algorithm.";
398 // cp advanced at the end of default to nextcp, ncp is unchanged so
399 // any node info can be read.
403 Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp,
409 // Parse simpler field strings
410 String fstring = nf.substring(ncp, fcp);
411 // remove any comments before we parse the node info
412 // TODO: test newick file with quoted square brackets in node name (is
// Each pass cuts one [...] span out of fstring; the text of the last span cut
// is kept in commentString2.
// NOTE(review): if fstring holds a ']' with no '[' (cstart == -1), or a ']'
// before the first '[', the substring calls below look like they would throw
// StringIndexOutOfBoundsException - confirm whether such input can reach here.
414 while (fstring.indexOf(']') > -1)
416 int cstart = fstring.indexOf('[');
417 int cend = fstring.indexOf(']');
418 commentString2 = fstring.substring(cstart + 1, cend);
419 fstring = fstring.substring(0, cstart)
420 + fstring.substring(cend + 1);
// Patterns for the three simple fields: unquoted name, bootstrap (digits
// followed by ':'), and distance (pattern text not visible here).
423 com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
424 "\\b([^' :;\\](),]+)");
425 com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
426 "\\s*([0-9+]+)\\s*:");
427 com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
// A name match is only accepted when it is not directly preceded by ':'.
430 if (uqnodename.search(fstring)
431 && ((uqnodename.matchedFrom(1) == 0) || (fstring
432 .charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote
435 if (nodename == null)
437 if (ReplaceUnderscores)
439 nodename = uqnodename.stringMatched(1).replace('_', ' ');
443 nodename = uqnodename.stringMatched(1);
448 Error = ErrorStringrange(Error,
449 "File has broken algorithm - overwritten nodename", 10,
453 // get comment bootstraps
455 if (nbootstrap.search(fstring))
// If the bootstrap match is the same text as the name match, the field is
// treated as a bootstrap and the name is dropped.
457 if (nbootstrap.stringMatched(1).equals(
458 uqnodename.stringMatched(1)))
460 nodename = null; // no nodename here.
463 || nodename.length() == 0
464 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) + uqnodename
465 .stringMatched().length()))
469 bootstrap = (new Integer(nbootstrap.stringMatched(1)))
472 } catch (Exception e)
474 Error = ErrorStringrange(Error,
475 "Can't parse bootstrap value", 4, ncp
476 + nbootstrap.matchedFrom(), nf);
// Tracks whether this particular node supplied an explicit distance.
481 boolean nodehasdistance = false;
483 if (ndist.search(fstring))
487 distance = (new Float(ndist.stringMatched(1))).floatValue();
489 nodehasdistance = true;
490 } catch (Exception e)
492 Error = ErrorStringrange(Error,
493 "Can't parse node distance value", 7, ncp
494 + ndist.matchedFrom(), nf);
500 // Write node info here
502 // Trees without distances still need a render distance
503 c.dist = (HasDistances) ? distance : DefDistance;
504 // be consistent for internal bootstrap defaults too
505 c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap);
508 RootHasDistance = nodehasdistance; // JBPNote This is really
509 // UGLY!!! Ensure root node gets
510 // its given distance
512 parseNHXNodeProps(c, commentString2);
513 commentString2 = null;
517 // Find a place to put the leaf
518 SequenceNode newnode = new SequenceNode(null, c, nodename,
519 (HasDistances) ? distance : DefDistance,
520 (HasBootstrap) ? bootstrap : DefBootstrap, false);
// NOTE(review): the NHX properties are applied to c (the parent) rather than
// to newnode, the leaf just created - confirm which node was intended.
521 parseNHXNodeProps(c, commentString2);
522 commentString2 = null;
// The leaf goes into the first free child slot: right, then left; when both
// are taken a dummy node is inserted (the attaching statements are not shown).
524 if (c.right() == null)
530 if (c.left() == null)
536 // Insert a dummy node for polytomy
537 // dummy nodes have distances
538 SequenceNode newdummy = new SequenceNode(null, c, null,
539 (HasDistances ? 0 : DefDistance), 0, true);
540 newdummy.SetChildren(c.left(), newnode);
548 // move back up the tree from preceding closure
551 if ((d > -1) && (c == null))
553 Error = ErrorStringrange(
555 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
// What happens for ')' and ',' is on lines not visible in this excerpt.
560 if (nf.charAt(fcp) == ')')
567 if (nf.charAt(fcp) == ',')
575 // Just advance focus, if we need to
576 if ((c.left() != null) && (!c.left().isLeaf()))
578 c = (SequenceNode) c.left();
584 // Reset new node properties to obvious fakes
586 distance = DefDistance;
587 bootstrap = DefBootstrap;
// The conditions guarding these two throws are on lines not visible here.
603 throw (new IOException("NewickFile: " + Error + "\n"));
607 throw (new IOException("NewickFile: No Tree read in\n"));
609 // The next line is failing for topali trees - not sure why yet. if
610 // (root.right()!=null && root.isDummy())
611 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
// Without an explicit root distance, the root gets 0 when the tree has
// distances and the default render distance otherwise.
613 if (!RootHasDistance)
615 root.dist = (HasDistances) ? 0 : DefDistance;
620 * parse NHX codes in comment strings and update NewickFile state flags for
621 * distances and bootstraps, and add any additional properties onto the node.
624 * @param commentString
625 * @param commentString2
// Reads NHX code=value pairs from a comment string. What is done with a parsed
// value is on lines not visible in this excerpt.
627 private void parseNHXNodeProps(SequenceNode c, String commentString)
629 // TODO: store raw comment on the sequenceNode so it can be recovered when
// Only comments that start with the "&&NHX" marker are processed; null and
// any other comment text are ignored.
631 if (commentString != null && commentString.startsWith("&&NHX"))
// Drop the 5-character marker and split the remainder on ':'.
633 StringTokenizer st = new StringTokenizer(commentString.substring(5),":");
634 while (st.hasMoreTokens())
636 String tok = st.nextToken();
// Each token is split at its first '='. The check on colpos between here and
// the substring calls is not visible in this excerpt.
637 int colpos=tok.indexOf("=");
641 String code = tok.substring(0, colpos);
642 String value = tok.substring(colpos+1);
644 // parse out code/value pairs
// "B" (any case) is the bootstrap/confidence code.
645 if (code.toLowerCase().equals("b"))
// Parsed as a float, then truncated to an int.
648 Float iv = new Float(value);
649 v = iv.intValue(); // jalview only does integer bootstraps
// Values that cannot be parsed are reported on stderr instead of aborting.
658 System.err.println("Couldn't parse code '"+code+"' = '"+value+"'");
659 e.printStackTrace(System.err);
671 * @return DOCUMENT ME!
// Accessor for the tree; its body is not visible in this excerpt (presumably
// it returns the root node - confirm).
673 public SequenceNode getTree()
679 * Generate a newick format tree according to internal flags for bootstraps,
680 * distances and root distances.
682 * @return new hampshire tree in a single line
// Writes the tree as Newick text using the current flag settings.
684 public String print()
// Output buffer; the call that fills it is on a line not visible here.
688 StringBuffer tf = new StringBuffer();
// The tree text is terminated with ';'.
691 return (tf.append(";").toString());
698 * Generate a newick format tree according to internal flags for distances and
699 * root distances and user specified writing of bootstraps.
701 * @param withbootstraps
702 * controls if bootstrap values are explicitly written.
704 * @return new hampshire tree in a single line
// Writes the tree with bootstrap output forced on or off for this call only.
706 public String print(boolean withbootstraps)
// Save the current setting, override it for the print, then restore it. The
// print call and the return are on lines not visible in this excerpt.
710 boolean boots = this.HasBootstrap;
711 this.HasBootstrap = withbootstraps;
714 this.HasBootstrap = boots;
722 * Generate newick format tree according to internal flags for writing root
725 * @param withbootstraps
726 * explicitly write bootstrap values
728 * explicitly write distances
730 * @return new hampshire tree in a single line
// Writes the tree with bootstrap and distance output forced on or off for this
// call only.
732 public String print(boolean withbootstraps, boolean withdists)
// Save the distance setting, override it, and restore it after printing.
736 boolean dists = this.HasDistances;
737 this.HasDistances = withdists;
// The bootstrap override is handled by print(boolean).
739 String rv = print(withbootstraps);
740 this.HasDistances = dists;
747 * Generate newick format tree according to user specified flags
749 * @param withbootstraps
750 * explicitly write bootstrap values
752 * explicitly write distances
753 * @param printRootInfo
754 * explicitly write root distance
756 * @return new hampshire tree in a single line
// Writes the tree with bootstrap, distance and root-info output chosen by the
// caller.
758 public String print(boolean withbootstraps, boolean withdists,
759 boolean printRootInfo)
// NOTE(review): the parameter shadows the field, so 'rootinfo' captures the
// argument rather than the field's previous value. The "restore" below then
// leaves this.printRootInfo set to the argument instead of its prior value.
// The sibling overloads save this.HasBootstrap / this.HasDistances explicitly;
// 'this.printRootInfo' was probably intended here - confirm.
763 boolean rootinfo = printRootInfo;
764 this.printRootInfo = printRootInfo;
766 String rv = print(withbootstraps, withdists);
767 this.printRootInfo = rootinfo;
776 * @return DOCUMENT ME!
789 * @return DOCUMENT ME!
// Changes the character used to quote node names. The rest of the body is not
// visible in this excerpt; presumably it stores c and returns 'old' - confirm.
791 char setQuoteChar(char c)
// Remember the quote character in use before the change.
793 char old = QuoteChar;
805 * @return DOCUMENT ME!
// Returns the form of a node name that is written to the Newick output.
807 private String nodeName(String name)
// Names matching NodeSafeName[0] are wrapped in QuoteChar after NodeSafeName[1]
// has been applied to them.
809 if (NodeSafeName[0].search(name))
811 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
// All other names are written unquoted after NodeSafeName[2] has been applied.
815 return NodeSafeName[2].replaceAll(name);
825 * @return DOCUMENT ME!
// Builds the text written for a non-root node: optional name, optional
// bootstrap, optional ":distance".
827 private String printNodeField(SequenceNode c)
// A null name contributes nothing; otherwise it goes through nodeName().
829 return ((c.getName() == null) ? "" : nodeName(c.getName()))
// The bootstrap is written only when HasBootstrap is set and the value is above
// -1; a space separates it from the name when there is one.
830 + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? ((c.getName() != null ? " "
831 : "") + c.getBootstrap())
// The distance is written only when HasDistances is set.
833 : "") + ((HasDistances) ? (":" + c.dist) : "");
842 * @return DOCUMENT ME!
// Builds the text written for the root node. The whole expression is gated by
// printRootInfo (the alternative branch is on lines not visible here).
844 private String printRootField(SequenceNode root)
846 return (printRootInfo) ? (((root.getName() == null) ? ""
847 : nodeName(root.getName()))
// Bootstrap handling mirrors the non-root case. The doubled '+' is a unary
// plus applied to the int value.
848 + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? ((root
849 .getName() != null ? " " : "") + +root.getBootstrap())
// The root's distance is gated by RootHasDistance, not by HasDistances.
850 : "") : "") + ((RootHasDistance) ? (":" + root.dist)
854 // Non recursive call deals with root node properties
// Appends the Newick text for the tree rooted at 'root' to tf, handling the
// root itself here and delegating child nodes to _print().
855 public void print(StringBuffer tf, SequenceNode root)
// A root that is a leaf is written as just its root field, if root info is on.
859 if (root.isLeaf() && printRootInfo)
861 tf.append(printRootField(root));
// The conditions selecting between the next two forms, and any "(" or ","
// appended around them, are on lines not visible in this excerpt.
// First form: both children are printed with no root field visible after them.
867 _print(tf, (SequenceNode) root.right());
868 _print(tf, (SequenceNode) root.left());
// Second form: right child, then left child, then ")" and the root field.
873 _print(tf, (SequenceNode) root.right());
875 if (root.left() != null)
880 _print(tf, (SequenceNode) root.left());
881 tf.append(")" + printRootField(root));
887 // Recursive call for non-root nodes
// Appends the Newick text for node c and its subtree to tf, recursing into the
// children. The branch conditions and any "(" or "," separators are on lines
// not visible in this excerpt.
888 public void _print(StringBuffer tf, SequenceNode c)
// One branch writes only the node's own field.
894 tf.append(printNodeField(c));
// One branch prints the children left-first with no node field visible after.
900 _print(tf, (SequenceNode) c.left());
901 if (c.left() != null)
905 _print(tf, (SequenceNode) c.right());
// The remaining branch prints right then left, then ")" and the node field.
910 _print(tf, (SequenceNode) c.right());
912 if (c.left() != null)
917 _print(tf, (SequenceNode) c.left());
918 tf.append(")" + printNodeField(c));
// Command-line check: reads one Newick file and prints it back under each
// combination of the bootstrap/distance output flags.
925 public static void main(String[] args)
// Exactly one argument is expected; otherwise a usage message is printed.
929 if (args == null || args.length != 1)
932 .println("Takes one argument - file name of a newick tree file.");
936 File fn = new File(args[0]);
// The raw file text is collected here only so it can be echoed below; the tree
// itself is read separately by the NewickFile built from the same path.
938 StringBuffer newickfile = new StringBuffer();
// NOTE(review): no close() on this reader is visible in this excerpt - confirm
// it is released.
939 BufferedReader treefile = new BufferedReader(new FileReader(fn));
942 while ((l = treefile.readLine()) != null)
944 newickfile.append(l);
948 System.out.println("Read file :\n");
950 NewickFile trf = new NewickFile(args[0], "File");
952 System.out.println("Original file :\n");
// Echo the raw text with any newline runs removed.
954 com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");
955 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
957 System.out.println("Parsed file.\n");
958 System.out.println("Default output type for original input.\n");
959 System.out.println(trf.print());
960 System.out.println("Without bootstraps.\n");
961 System.out.println(trf.print(false));
962 System.out.println("Without distances.\n");
963 System.out.println(trf.print(true, false));
964 System.out.println("Without bootstraps but with distanecs.\n");
965 System.out.println(trf.print(false, true));
966 System.out.println("Without bootstraps or distanecs.\n");
967 System.out.println(trf.print(false, false));
968 System.out.println("With bootstraps and with distances.\n");
969 System.out.println(trf.print(true, true));
// I/O failures are reported on stderr.
970 } catch (java.io.IOException e)
972 System.err.println("Exception\n" + e);