2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
23 // TODO: Implement Basic NHX tag parsing and preservation
24 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
25 // TODO: Extended SequenceNodeI to hold parsed NHX strings
30 import jalview.datamodel.*;
// Reader/writer for Newick (New Hampshire) tree text: parse() builds a SequenceNode tree,
// the print(...) methods write one back out.
38 public class NewickFile
// Flags describing what the tree carries: bootstrap values, branch distances, a distance on the root.
42 private boolean HasBootstrap = false;
43 private boolean HasDistances = false;
44 private boolean RootHasDistance = false;
// When true, parse() rewrites '_' characters found in unquoted node names.
47 boolean ReplaceUnderscores = false;
// When false, printRootField() emits nothing for the root node.
48 boolean printRootInfo = false;
// Regexes used by nodeName() when writing names: [0] decides whether quoting is needed,
// [1] escapes quote characters inside a quoted name, [2] is applied to names written unquoted.
49 private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[]
51 new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for requiring quotes
52 new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote characters
// NOTE(review): after Java unescaping this is s/\/w/_/, which looks like it matches the literal
// text "/w" rather than whitespace (\s) — confirm the intended pattern.
53 new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unquoted whitespace transformation
// Character wrapped around names that need quoting (see nodeName()).
55 char QuoteChar = '\'';
58 * Creates a new NewickFile object.
60 * @param inStr tree text to read; handed to the superclass constructor with the source type "Paste"
62 * @throws IOException DOCUMENT ME!
// Reads a tree from in-memory text: inStr goes to the superclass constructor tagged as "Paste" input.
64 public NewickFile(String inStr)
67 super(inStr, "Paste");
71 * Creates a new NewickFile object.
73 * @param inFile DOCUMENT ME!
74 * @param type DOCUMENT ME!
76 * @throws IOException DOCUMENT ME!
// Two-argument form taking a source and a type string.
// NOTE(review): body not visible in this listing — presumably forwards both arguments to the
// superclass constructor, as the one-argument form does with "Paste"; confirm.
78 public NewickFile(String inFile, String type)
85 * Creates a new NewickFile object.
87 * @param newtree DOCUMENT ME!
// Constructor taking an already-built tree.
// NOTE(review): body not visible in this listing — presumably stores newtree as the root; confirm.
89 public NewickFile(SequenceNode newtree)
95 * Creates a new NewickFile object.
97 * @param newtree DOCUMENT ME!
98 * @param bootstrap value stored in the HasBootstrap flag
// Constructor taking an already-built tree; the visible statement copies `bootstrap`
// into the HasBootstrap flag.
100 public NewickFile(SequenceNode newtree, boolean bootstrap)
102 HasBootstrap = bootstrap;
107 * Creates a new NewickFile object.
109 * @param newtree DOCUMENT ME!
110 * @param bootstrap value stored in the HasBootstrap flag
111 * @param distances value stored in the HasDistances flag
// Constructor taking an already-built tree; the visible statements copy the two booleans
// into the HasBootstrap and HasDistances flags.
113 public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances)
116 HasBootstrap = bootstrap;
117 HasDistances = distances;
121 * Creates a new NewickFile object.
123 * @param newtree DOCUMENT ME!
124 * @param bootstrap value stored in the HasBootstrap flag
125 * @param distances value stored in the HasDistances flag
126 * @param rootdistance value stored in the RootHasDistance flag
// Constructor taking an already-built tree; the visible statements copy the three booleans
// into the HasBootstrap, HasDistances and RootHasDistance flags.
128 public NewickFile(SequenceNode newtree, boolean bootstrap,
129 boolean distances, boolean rootdistance)
132 HasBootstrap = bootstrap;
133 HasDistances = distances;
134 RootHasDistance = rootdistance;
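140 * @param Error error text accumulated so far; null is treated as an empty string
141 * @param Er description of the new problem, appended after Error
142 * @param r number of characters of context to show on each side of p
143 * @param p character position in s at which the problem was found
144 * @param s the string being parsed, from which the context excerpt is cut
146 * @return the combined message, including the position and an excerpt of s clamped to its bounds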
// Builds an error message: any earlier error text, the new description, the position,
// and a short excerpt of the input around that position.
148 private String ErrorStringrange(String Error, String Er, int r, int p,
// A null Error contributes nothing, so the first error starts the message.
151 return ( (Error == null) ? "" : Error) + Er + " at position " + p +
// Excerpt runs from p - r to p + r, with the start clamped to 0 and the end to s.length().
// NOTE(review): nothing here guards against p - r exceeding s.length() — confirm callers never pass such a p.
153 s.substring( ( (p - r) < 0) ? 0 : (p - r),
154 ( (p + r) > s.length()) ? s.length() : (p + r)) + " )\n";
158 // These are set automatically by the reader
// Accessor named after the HasBootstrap flag.
// NOTE(review): body not visible in this listing — presumably returns that flag; confirm.
159 public boolean HasBootstrap()
167 * @return DOCUMENT ME!
// Accessor named after the HasDistances flag.
// NOTE(review): body not visible in this listing — presumably returns that flag; confirm.
169 public boolean HasDistances()
// Reports the RootHasDistance flag, i.e. whether the root node carries its own distance.
174 public boolean HasRootDistance()
176 return RootHasDistance;
180 * parse the filesource as a newick file (new hampshire and/or extended)
182 * @throws IOException with a line number and character position for badly formatted NH strings
// Reads the whole input into one string, then scans it symbol by symbol, building a tree of
// SequenceNode objects beneath a temporary root that is detached again at the end.
// Problems are accumulated in Error and reported as an IOException once scanning stops.
189 { // fill nf with complete tree file
191 StringBuffer file = new StringBuffer();
// Keep pulling lines until nextLine() signals end of input with null.
193 while ( (nf = nextLine()) != null)
198 nf = file.toString();
// Temporary root; its right child is promoted to the real root at the end of this method.
201 root = new SequenceNode();
203 SequenceNode realroot = null;
// c is the node currently being filled in.
204 SequenceNode c = root;
208 //int flen = nf.length();
// Name collected for the node being read; null until one has been seen.
211 String nodename = null;
213 float DefDistance = (float) 0.001; // @param Default distance for a node - very very small
214 int DefBootstrap = 0; // @param Default bootstrap for a node
// Values for the node being read; reset to the defaults after each node is written.
216 float distance = DefDistance;
217 int bootstrap = DefBootstrap;
219 boolean ascending = false; // flag indicating that we are leaving the current node
// Regex locating the next structural symbol; its pattern is not visible in this listing.
221 com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(
// Scan from position cp; stop as soon as an error has been recorded.
224 while (majorsyms.searchFrom(nf, cp) && (Error == null))
// fcp is the index of the symbol just found; the text between cp and fcp is the pending field.
226 int fcp = majorsyms.matchedFrom();
228 switch (nf.charAt(fcp))
230 case '[': // Comment or structured/extended NH format info
232 com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex(
235 if (comment.searchFrom(nf, fcp))
237 // Skip the comment field
238 cp = 1 + comment.matchedFrom();
242 Error = ErrorStringrange(Error, "Unterminated comment", 3,
252 // ascending should not be set
256 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
// Descend: take the right child slot if it is free...
264 if (c.right() == null)
266 c.setRight(new SequenceNode(null, c, null, DefDistance,
267 DefBootstrap, false));
268 c = (SequenceNode) c.right();
// ...otherwise use the left slot, first pushing the existing children down into a dummy node
// when the left slot is already occupied.
272 if (c.left() != null)
274 // Dummy node for polytomy - keeps c.left free for new node
275 SequenceNode tmpn = new SequenceNode(null, c, null, 0,
277 tmpn.SetChildren(c.left(), c.right());
281 c.setLeft(new SequenceNode(null, c, null, DefDistance,
282 DefBootstrap, false));
283 c = (SequenceNode) c.left();
286 if (realroot == null)
// A new node starts with default properties.
292 distance = DefDistance;
293 bootstrap = DefBootstrap;
298 // Deal with quoted fields
301 com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(
304 if (qnodename.searchFrom(nf, fcp))
306 int nl = qnodename.stringMatched().length();
307 nodename = new String(qnodename.stringMatched().substring(0,
313 Error = ErrorStringrange(Error,
314 "Unterminated quotes for nodename", 7, fcp,
324 Error = ErrorStringrange(Error,
325 "Wayward semicolon (depth=" + d + ")", 7,
329 // cp advanced at the end of default
332 // Parse simpler field strings
// fstring is the raw text between the previous scan position and the current symbol.
333 String fstring = nf.substring(cp, fcp);
334 com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
335 "\\b([^' :;\\](),]+)");
336 com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
337 "\\S+([0-9+]+)\\S*:");
338 com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
// Accept an unquoted name only when the match starts the field or is not directly preceded
// by ':' — presumably so that the number after a colon is not taken for a name.
341 if (uqnodename.search(fstring) &&
342 ( (uqnodename.matchedFrom(1) == 0) ||
343 (fstring.charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote HACK!
345 if (nodename == null)
347 if (ReplaceUnderscores)
349 nodename = uqnodename.stringMatched(1).replace('_',
354 nodename = uqnodename.stringMatched(1);
359 Error = ErrorStringrange(Error,
360 "File has broken algorithm - overwritten nodename",
// A bootstrap is only recognised when it begins after the end of the name match.
365 if (nbootstrap.search(fstring) &&
366 (nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) +
367 uqnodename.stringMatched().length())))
// NOTE(review): new Integer(String) is deprecated; Integer.parseInt is the direct replacement.
371 bootstrap = (new Integer(nbootstrap.stringMatched(1))).intValue();
376 Error = ErrorStringrange(Error,
377 "Can't parse bootstrap value", 4,
378 cp + nbootstrap.matchedFrom(), nf);
// Tracks whether this particular field supplied a distance; copied to RootHasDistance below.
382 boolean nodehasdistance = false;
384 if (ndist.search(fstring))
// NOTE(review): new Float(String) is deprecated; Float.parseFloat is the direct replacement.
388 distance = (new Float(ndist.stringMatched(1))).floatValue();
390 nodehasdistance = true;
394 Error = ErrorStringrange(Error,
395 "Can't parse node distance value", 7,
396 cp + ndist.matchedFrom(), nf);
402 // Write node info here
404 // Trees without distances still need a render distance
405 c.dist = (HasDistances) ? distance : DefDistance;
406 // be consistent for internal bootstrap defaults too
407 c.setBootstrap( (HasBootstrap) ? bootstrap : DefBootstrap);
410 RootHasDistance = nodehasdistance; // JBPNote This is really UGLY!!! Ensure root node gets its given distance
415 // Find a place to put the leaf
416 SequenceNode newnode = new SequenceNode(null, c, nodename,
417 (HasDistances) ? distance : DefDistance,
418 (HasBootstrap) ? bootstrap : DefBootstrap, false);
// Slot order for the new leaf: right child first, then left, then a dummy node for a polytomy.
420 if (c.right() == null)
426 if (c.left() == null)
432 // Insert a dummy node for polytomy
433 // dummy nodes have distances
434 SequenceNode newdummy = new SequenceNode(null, c,
435 null, (HasDistances ? 0 : DefDistance), 0, true);
// The dummy adopts the existing left child together with the new leaf.
436 newdummy.SetChildren(c.left(), newnode);
444 // move back up the tree from preceding closure
// Losing the current node while still inside brackets means the input is unbalanced.
447 if ( (d > -1) && (c == null))
449 Error = ErrorStringrange(Error,
450 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
455 if (nf.charAt(fcp) == ')')
462 if (nf.charAt(fcp) == ',')
470 // Just advance focus, if we need to
471 if ( (c.left() != null) && (!c.left().isLeaf()))
473 c = (SequenceNode) c.left();
478 // else : We do nothing if ';' is encountered.
481 // Reset new node properties to obvious fakes
483 distance = DefDistance;
484 bootstrap = DefBootstrap;
// Report accumulated parse errors, or the absence of any tree, as IOExceptions.
492 throw (new IOException("NewickFile: " + Error + "\n"));
496 throw (new IOException("NewickFile: No Tree read in\n"));
498 // The next line is failing for topali trees - not sure why yet. if (root.right()!=null && root.isDummy())
// NOTE(review): no null check on root.right() is visible before this call — confirm it cannot be null here.
499 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
// Without an explicit root distance, the root gets 0 (trees with distances) or the default.
501 if (!RootHasDistance)
503 root.dist = (HasDistances) ? 0 : DefDistance;
510 * @return DOCUMENT ME!
// Accessor for the tree.
// NOTE(review): body not visible in this listing — presumably returns the root node; confirm.
512 public SequenceNode getTree()
518 * Generate a newick format tree according to internal flags
519 * for bootstraps, distances and root distances.
521 * @return new hampshire tree in a single line
// Writes the tree using the current flag settings; the text is accumulated in a StringBuffer
// and terminated with ';'.
523 public String print()
527 StringBuffer tf = new StringBuffer();
530 return (tf.append(";").toString());
537 * Generate a newick format tree according to internal flags
538 * for distances and root distances and user specified writing of
540 * @param withbootstraps controls if bootstrap values are explicitly written.
542 * @return new hampshire tree in a single line
// Writes the tree with the bootstrap setting chosen by the caller: the HasBootstrap field is
// saved, overridden for the duration of the write, then put back.
544 public String print(boolean withbootstraps)
548 boolean boots = this.HasBootstrap;
549 this.HasBootstrap = withbootstraps;
552 this.HasBootstrap = boots;
560 * Generate newick format tree according to internal flags
561 * for writing root node distances.
563 * @param withbootstraps explicitly write bootstrap values
564 * @param withdists explicitly write distances
566 * @return new hampshire tree in a single line
// Writes the tree with caller-chosen bootstrap and distance settings: the HasDistances field is
// saved, overridden while print(withbootstraps) runs, then put back.
568 public String print(boolean withbootstraps, boolean withdists)
572 boolean dists = this.HasDistances;
573 this.HasDistances = withdists;
575 String rv = print(withbootstraps);
576 this.HasDistances = dists;
583 * Generate newick format tree according to user specified flags
585 * @param withbootstraps explicitly write bootstrap values
586 * @param withdists explicitly write distances
587 * @param printRootInfo explicitly write root distance
589 * @return new hampshire tree in a single line
// Writes the tree with caller-chosen bootstrap, distance and root-info settings; the
// printRootInfo field is overridden only while print(withbootstraps, withdists) runs.
591 public String print(boolean withbootstraps, boolean withdists,
592 boolean printRootInfo)
// The parameter shadows the field of the same name, so the field's previous value has to be
// read through `this`; reading the bare name would save the argument and make the restore a no-op.
596 boolean rootinfo = this.printRootInfo;
597 this.printRootInfo = printRootInfo;
599 String rv = print(withbootstraps, withdists);
// Put the field back to what it was before this call.
600 this.printRootInfo = rootinfo;
609 * @return DOCUMENT ME!
619 * @param c DOCUMENT ME!
621 * @return DOCUMENT ME!
// Setter for QuoteChar; the visible statement captures the current value first.
// NOTE(review): the rest of the body is not visible in this listing — presumably assigns c
// and returns the captured old value; confirm.
623 char setQuoteChar(char c)
625 char old = QuoteChar;
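634 * @param name node name to be written
636 * @return name wrapped in QuoteChar with NodeSafeName[1] applied when NodeSafeName[0] matches it, otherwise name with NodeSafeName[2] applied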
// Turns a node name into text safe to write into the output.
638 private String nodeName(String name)
// NodeSafeName[0] matches any character that forces quoting.
640 if (NodeSafeName[0].search(name))
// Quoted form: escape embedded quotes via NodeSafeName[1], then wrap in QuoteChar.
642 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
// Unquoted form: only the NodeSafeName[2] substitution is applied.
646 return NodeSafeName[2].replaceAll(name);
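653 * @param c node whose name, bootstrap and distance are to be written
655 * @return the node's name (empty if null), optionally followed by its bootstrap, then ":" and its distance when HasDistances is set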
// Builds the text written for a non-root node: name, optional bootstrap, optional distance.
657 private String printNodeField(SequenceNode c)
// A null name contributes nothing; otherwise the name goes through nodeName() for quoting.
659 return ( (c.getName() == null) ? "" : nodeName(c.getName())) +
// Bootstrap is written after a space, and only when it is greater than -1.
// NOTE(review): the outer condition of this ternary is on a line not visible here — presumably HasBootstrap; confirm.
661 ? ( (c.getBootstrap() > -1) ? (" " + c.getBootstrap()) : "") : "") +
// Distance is written after ':' only when HasDistances is set.
662 ( (HasDistances) ? (":" + c.dist) : "");
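668 * @param root root node of the tree being written
670 * @return an empty string unless printRootInfo is set; otherwise the root's name (empty if null), optional bootstrap, and ":" plus its distance when RootHasDistance is set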
// Builds the text written for the root node; yields "" unless printRootInfo is set.
672 private String printRootField(SequenceNode root)
674 return (printRootInfo)
// A null name contributes nothing; otherwise the name goes through nodeName() for quoting.
675 ? ( ( (root.getName() == null) ? "" : nodeName(root.getName())) +
// Bootstrap is written after a space, and only when it is greater than -1.
// NOTE(review): the outer condition of this ternary is on a line not visible here — presumably HasBootstrap; confirm.
677 ? ( (root.getBootstrap() > -1) ? (" " + root.getBootstrap()) : "") :
// The root's distance is governed by RootHasDistance rather than HasDistances.
679 ( (RootHasDistance) ? (":" + root.dist) : "")) : "";
682 // Non recursive call deals with root node properties
// Appends the text for the whole tree to tf, starting at the root; children are delegated to _print().
// NOTE(review): several branch conditions and separators are on lines not visible in this listing.
683 public void print(StringBuffer tf, SequenceNode root)
// A root that is itself a leaf is written only when root info was requested.
687 if (root.isLeaf() && printRootInfo)
689 tf.append(printRootField(root));
// Children are visited right first, then left.
695 _print(tf, (SequenceNode) root.right());
696 _print(tf, (SequenceNode) root.left());
701 _print(tf, (SequenceNode) root.right());
703 if (root.left() != null)
708 _print(tf, (SequenceNode) root.left());
// Close the bracket and follow it with whatever printRootField() yields for the root.
709 tf.append(")" + printRootField(root));
715 // Recursive call for non-root nodes
// Appends the text for node c and everything below it to tf, calling itself for each child.
// NOTE(review): the conditions choosing between the branches below are on lines not visible in this listing.
716 public void _print(StringBuffer tf, SequenceNode c)
// Branch that writes only the node's own field.
722 tf.append(printNodeField(c));
// Branch that visits the left child before the right one.
728 _print(tf, (SequenceNode) c.left());
729 if (c.left() != null)
733 _print(tf, (SequenceNode) c.right());
// Branch that visits the right child before the left one, then closes the bracket.
738 _print(tf, (SequenceNode) c.right());
740 if (c.left() != null)
745 _print(tf, (SequenceNode) c.left());
// The closing bracket is followed by this node's own name/bootstrap/distance field.
746 tf.append(")" + printNodeField(c));
// Command-line self-test: reads the named Newick file, echoes it, parses it, and prints the
// parsed tree under each combination of bootstrap/distance output settings.
753 public static void main(String[] args)
// Exactly one argument (the file name) is required.
757 if (args == null || args.length != 1)
760 "Takes one argument - file name of a newick tree file.");
764 File fn = new File(args[0]);
// First pass: slurp the raw text so the original can be echoed next to the parsed output.
766 StringBuffer newickfile = new StringBuffer();
767 BufferedReader treefile = new BufferedReader(new FileReader(fn));
770 while ( (l = treefile.readLine()) != null)
772 newickfile.append(l);
776 System.out.println("Read file :\n");
// Second pass: the same path is handed to NewickFile as a "File" source.
778 NewickFile trf = new NewickFile(args[0], "File");
780 System.out.println("Original file :\n");
// Strip newlines so the original prints as a single line.
782 com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");
783 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
785 System.out.println("Parsed file.\n");
786 System.out.println("Default output type for original input.\n");
787 System.out.println(trf.print());
788 System.out.println("Without bootstraps.\n");
789 System.out.println(trf.print(false));
790 System.out.println("Without distances.\n");
791 System.out.println(trf.print(true, false));
792 System.out.println("Without bootstraps but with distances.\n");
793 System.out.println(trf.print(false, true));
794 System.out.println("Without bootstraps or distances.\n");
795 System.out.println(trf.print(false, false));
796 System.out.println("With bootstraps and with distances.\n");
797 System.out.println(trf.print(true, true));
// I/O and parse failures are reported on stderr rather than propagated.
799 catch (java.io.IOException e)
801 System.err.println("Exception\n" + e);