2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
22 // http://evolution.genetics.washington.edu/phylip/newick_doc.html
23 // TODO: Implement Basic NHX tag parsing and preservation
24 // TODO: http://evolution.genetics.wustl.edu/eddy/forester/NHX.html
25 // TODO: Extended SequenceNodeI to hold parsed NHX strings
30 import jalview.datamodel.*;
33 * Parse a new hampshire style tree
34 * Caveats: NHX files are NOT supported and the tree distances and topology are unreliable when they are parsed.
38 public class NewickFile
// Flags describing which optional fields the tree carries. The flag-taking
// constructors assign them; HasBootstrap and HasDistances are also overridden
// temporarily by the print(boolean...) overloads, and the parser assigns
// RootHasDistance.
42 private boolean HasBootstrap = false;
43 private boolean HasDistances = false;
44 private boolean RootHasDistance = false;
// When true, the parser replaces underscores in unquoted node names.
47 boolean ReplaceUnderscores = false;
// When false, printRootField returns an empty string for the root node.
48 boolean printRootInfo = true;
// Patterns used by nodeName: [0] detects names that need quoting,
// [1] is applied to names that get quoted, [2] to names that do not.
49 private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[]
51 new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for requiring quotes
52 new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote characters
// NOTE(review): the pattern text below reads as an escaped slash followed by
// the letter w rather than a whitespace class - confirm it does what the
// trailing comment says.
53 new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unquoted whitespace transformation
// Character that nodeName places around names that need quoting.
55 char QuoteChar = '\'';
58 * Creates a new NewickFile object.
60 * @param inStr input string handed to the superclass constructor together with the source type "Paste"
62 * @throws IOException DOCUMENT ME!
// Constructs the object from a string by delegating to the superclass
// constructor, labelling the source type as Paste.
64 public NewickFile(String inStr)
67 super(inStr, "Paste");
71 * Creates a new NewickFile object.
73 * @param inFile DOCUMENT ME!
74 * @param type DOCUMENT ME!
76 * @throws IOException DOCUMENT ME!
78 public NewickFile(String inFile, String type)
83 public NewickFile(FileParse source) throws IOException
88 * Creates a new NewickFile object.
90 * @param newtree DOCUMENT ME!
92 public NewickFile(SequenceNode newtree)
98 * Creates a new NewickFile object.
100 * @param newtree DOCUMENT ME!
101 * @param bootstrap value assigned to the HasBootstrap flag
// Constructor variant that records the given bootstrap flag in HasBootstrap.
103 public NewickFile(SequenceNode newtree, boolean bootstrap)
105 HasBootstrap = bootstrap;
110 * Creates a new NewickFile object.
112 * @param newtree DOCUMENT ME!
113 * @param bootstrap value assigned to the HasBootstrap flag
114 * @param distances value assigned to the HasDistances flag
// Constructor variant that records the given bootstrap and distance flags.
116 public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances)
119 HasBootstrap = bootstrap;
120 HasDistances = distances;
124 * Creates a new NewickFile object.
126 * @param newtree DOCUMENT ME!
127 * @param bootstrap value assigned to the HasBootstrap flag
128 * @param distances value assigned to the HasDistances flag
129 * @param rootdistance value assigned to the RootHasDistance flag
// Constructor variant that records all three flags: bootstrap, distances and
// whether the root node has a distance.
131 public NewickFile(SequenceNode newtree, boolean bootstrap,
132 boolean distances, boolean rootdistance)
135 HasBootstrap = bootstrap;
136 HasDistances = distances;
137 RootHasDistance = rootdistance;
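143 * @param Error error text accumulated so far; null is treated as an empty string
144 * @param Er description of the new error
145 * @param r number of characters of context to show on each side of p
146 * @param p position in s that the error refers to
147 * @param s the string the excerpt is taken from
149 * @return the accumulated text followed by Er, the position, and an excerpt of s around p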
// Builds an error message: any existing Error text (empty when null), then Er
// and the position p, then an excerpt of s. The excerpt bounds p - r and
// p + r are clamped to 0 and s.length() so the substring call stays in range.
151 private String ErrorStringrange(String Error, String Er, int r, int p,
154 return ( (Error == null) ? "" : Error) + Er + " at position " + p +
156 s.substring( ( (p - r) < 0) ? 0 : (p - r),
157 ( (p + r) > s.length()) ? s.length() : (p + r)) + " )\n";
161 // These are set automatically by the reader
162 public boolean HasBootstrap()
170 * @return DOCUMENT ME!
172 public boolean HasDistances()
// Accessor for the RootHasDistance flag.
177 public boolean HasRootDistance()
179 return RootHasDistance;
183 * parse the filesource as a newick file (new hampshire and/or extended)
185 * @throws IOException with a line number and character position for badly formatted NH strings
// Body of the tree parser (its signature is not part of this excerpt).
// Reads lines with nextLine(), takes the text of the file buffer as nf, then
// scans nf with the majorsyms pattern, creating SequenceNode objects below
// root as it goes. Problems are accumulated in Error via ErrorStringrange.
192 { // fill nf with complete tree file
194 StringBuffer file = new StringBuffer();
196 while ( (nf = nextLine()) != null)
201 nf = file.toString();
204 root = new SequenceNode();
206 SequenceNode realroot = null;
// c is the node currently being filled in; it starts at the new root.
207 SequenceNode c = root;
211 //int flen = nf.length();
214 String nodename = null;
216 float DefDistance = (float) 0.001; // @param Default distance for a node - very very small
217 int DefBootstrap = -1; // @param Default bootstrap for a node
219 float distance = DefDistance;
220 int bootstrap = DefBootstrap;
222 boolean ascending = false; // flag indicating that we are leaving the current node
224 com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(
// Main scan: keep going while majorsyms matches again from cp and no error
// has been recorded.
229 while (majorsyms.searchFrom(nf, cp) && (Error == null))
// fcp is the position of the matched symbol; the switch dispatches on it.
231 int fcp = majorsyms.matchedFrom();
233 switch (schar=nf.charAt(fcp))
237 // ascending should not be set
241 Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
// Descend: use the right child slot first, creating it with default values.
249 if (c.right() == null)
251 c.setRight(new SequenceNode(null, c, null, DefDistance,
252 DefBootstrap, false));
253 c = (SequenceNode) c.right();
257 if (c.left() != null)
259 // Dummy node for polytomy - keeps c.left free for new node
260 SequenceNode tmpn = new SequenceNode(null, c, null, 0,
262 tmpn.SetChildren(c.left(), c.right());
266 c.setLeft(new SequenceNode(null, c, null, DefDistance,
267 DefBootstrap, false));
268 c = (SequenceNode) c.left();
271 if (realroot == null)
277 distance = DefDistance;
278 bootstrap = DefBootstrap;
283 // Deal with quoted fields
286 com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(
289 if (qnodename.searchFrom(nf, fcp))
291 int nl = qnodename.stringMatched().length();
292 nodename = new String(qnodename.stringMatched().substring(0,
298 Error = ErrorStringrange(Error,
299 "Unterminated quotes for nodename", 7, fcp,
310 Error = ErrorStringrange(Error,
311 "Wayward semicolon (depth=" + d + ")", 7,
314 // cp advanced at the end of default
318 // node string contains Comment or structured/extended NH format info
319 /* if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1))
321 // will process in remains System.err.println("skipped text: '"+nf.substring(cp,fcp)+"'");
324 // verify termination.
325 com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex(
327 if (comment.searchFrom(nf, fcp))
329 // Skip the comment field
330 nextcp=comment.matchedFrom()+1;
331 warningMessage = "Tree file contained comments which may confuse input algorithm.";
334 // cp advanced at the end of default to nextcp, ncp is unchanged so any node info can be read.
338 Error = ErrorStringrange(Error, "Unterminated comment", 3,
344 // Parse simpler field strings
345 String fstring = nf.substring(ncp, fcp);
346 // remove any comments before we parse the node info
347 // TODO: test newick file with quoted square brackets in node name (is this allowed?)
// NOTE(review): if fstring holds a closing bracket with no opening bracket,
// cstart is -1 and substring(0, cstart) below would throw, unless code not
// shown here guards against it - confirm.
348 while (fstring.indexOf(']')>-1)
350 int cstart=fstring.indexOf('[');
351 int cend=fstring.indexOf(']');
352 String comment = fstring.substring(cstart+1,cend);
353 fstring = fstring.substring(0, cstart)+fstring.substring(cend+1);
// Patterns for the three parts of a plain node field: unquoted name,
// bootstrap value (digits followed by a colon) and distance.
356 com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
357 "\\b([^' :;\\](),]+)");
358 com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
359 "\\s*([0-9+]+)\\s*:");
360 com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
// Accept the unquoted-name match only when it starts the field or is not
// directly preceded by a colon.
363 if (uqnodename.search(fstring) &&
364 ( (uqnodename.matchedFrom(1) == 0) ||
365 (fstring.charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote HACK!
367 if (nodename == null)
369 if (ReplaceUnderscores)
371 nodename = uqnodename.stringMatched(1).replace('_',
376 nodename = uqnodename.stringMatched(1);
381 Error = ErrorStringrange(Error,
382 "File has broken algorithm - overwritten nodename",
387 if (nbootstrap.search(fstring))
// A bootstrap match identical to the name match means the field held only a
// number, so it is not treated as a name.
389 if (nbootstrap.stringMatched(1).equals(uqnodename.stringMatched(1)))
391 nodename=null; // no nodename here.
393 if (nodename==null || nodename.length()==0 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) +
394 uqnodename.stringMatched().length()))
// NOTE(review): the Integer(String) constructor here and the Float(String)
// constructor further down are deprecated in current Java; Integer.parseInt
// and Float.parseFloat are the direct replacements.
398 bootstrap = (new Integer(nbootstrap.stringMatched(1))).intValue();
403 Error = ErrorStringrange(Error,
404 "Can't parse bootstrap value", 4,
405 ncp + nbootstrap.matchedFrom(), nf);
410 boolean nodehasdistance = false;
412 if (ndist.search(fstring))
416 distance = (new Float(ndist.stringMatched(1))).floatValue();
418 nodehasdistance = true;
422 Error = ErrorStringrange(Error,
423 "Can't parse node distance value", 7,
424 ncp + ndist.matchedFrom(), nf);
430 // Write node info here
432 // Trees without distances still need a render distance
433 c.dist = (HasDistances) ? distance : DefDistance;
434 // be consistent for internal bootstrap defaults too
435 c.setBootstrap( (HasBootstrap) ? bootstrap : DefBootstrap);
438 RootHasDistance = nodehasdistance; // JBPNote This is really UGLY!!! Ensure root node gets its given distance
443 // Find a place to put the leaf
444 SequenceNode newnode = new SequenceNode(null, c, nodename,
445 (HasDistances) ? distance : DefDistance,
446 (HasBootstrap) ? bootstrap : DefBootstrap, false);
448 if (c.right() == null)
454 if (c.left() == null)
460 // Insert a dummy node for polytomy
461 // dummy nodes have distances
462 SequenceNode newdummy = new SequenceNode(null, c,
463 null, (HasDistances ? 0 : DefDistance), 0, true);
464 newdummy.SetChildren(c.left(), newnode);
472 // move back up the tree from preceding closure
475 if ( (d > -1) && (c == null))
477 Error = ErrorStringrange(Error,
478 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
483 if (nf.charAt(fcp) == ')')
490 if (nf.charAt(fcp) == ',')
498 // Just advance focus, if we need to
499 if ( (c.left() != null) && (!c.left().isLeaf()))
501 c = (SequenceNode) c.left();
507 // Reset new node properties to obvious fakes
509 distance = DefDistance;
510 bootstrap = DefBootstrap;
// Failures are reported to the caller as IOException: one carrying the
// accumulated Error text, one for the case where no tree was read.
524 throw (new IOException("NewickFile: " + Error + "\n"));
528 throw (new IOException("NewickFile: No Tree read in\n"));
530 // The next line is failing for topali trees - not sure why yet. if (root.right()!=null && root.isDummy())
531 root = (SequenceNode) root.right().detach(); // remove the imaginary root.
// Without an explicit root distance, fall back to 0 or the default distance.
533 if (!RootHasDistance)
535 root.dist = (HasDistances) ? 0 : DefDistance;
542 * @return DOCUMENT ME!
544 public SequenceNode getTree()
550 * Generate a newick format tree according to internal flags
551 * for bootstraps, distances and root distances.
553 * @return new hampshire tree in a single line
// Builds the output in a local buffer and returns its contents with a
// terminating semicolon appended.
555 public String print()
559 StringBuffer tf = new StringBuffer();
562 return (tf.append(";").toString());
569 * Generate a newick format tree according to internal flags
570 * for distances and root distances and user specified writing of
572 * @param withbootstraps controls if bootstrap values are explicitly written.
574 * @return new hampshire tree in a single line
// Prints with the bootstrap flag forced to withbootstraps, then puts the
// previous flag value back.
576 public String print(boolean withbootstraps)
// Save the current flag so it can be restored afterwards.
580 boolean boots = this.HasBootstrap;
581 this.HasBootstrap = withbootstraps;
// Restore the saved flag.
584 this.HasBootstrap = boots;
592 * Generate newick format tree according to internal flags
593 * for writing root node distances.
595 * @param withbootstraps explicitly write bootstrap values
596 * @param withdists explicitly write distances
598 * @return new hampshire tree in a single line
// Prints with the distance flag forced to withdists, delegating the bootstrap
// choice to print(boolean), then puts the previous distance flag back.
600 public String print(boolean withbootstraps, boolean withdists)
// Save the current flag so it can be restored afterwards.
604 boolean dists = this.HasDistances;
605 this.HasDistances = withdists;
607 String rv = print(withbootstraps);
// Restore the saved flag.
608 this.HasDistances = dists;
615 * Generate newick format tree according to user specified flags
617 * @param withbootstraps explicitly write bootstrap values
618 * @param withdists explicitly write distances
619 * @param printRootInfo explicitly write root distance
621 * @return new hampshire tree in a single line
// Prints with explicit choices for bootstraps, distances and root info, then
// puts the previous printRootInfo setting back. The printRootInfo parameter
// shadows the field of the same name, so the field has to be read through
// this when saving its previous value; reading the bare name would save the
// argument and leave the field permanently changed after the call.
623 public String print(boolean withbootstraps, boolean withdists,
624 boolean printRootInfo)
628 boolean rootinfo = this.printRootInfo;
629 this.printRootInfo = printRootInfo;
631 String rv = print(withbootstraps, withdists);
// Restore the field to the value it had before this call.
632 this.printRootInfo = rootinfo;
641 * @return DOCUMENT ME!
651 * @param c DOCUMENT ME!
653 * @return DOCUMENT ME!
// Copies the current QuoteChar into a local before the rest of the method
// runs. Presumably the field is then set to c and the old value returned -
// TODO confirm, the remaining lines are not visible here.
655 char setQuoteChar(char c)
657 char old = QuoteChar;
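666 * @param name node name to be written
668 * @return the name wrapped in QuoteChar (after applying NodeSafeName[1]) when NodeSafeName[0] matches it, otherwise the result of applying NodeSafeName[2]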
// Makes a node name safe for output using the NodeSafeName patterns.
670 private String nodeName(String name)
// Names matched by pattern [0] are quoted, with pattern [1] applied first.
672 if (NodeSafeName[0].search(name))
674 return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
// Other names are returned unquoted after pattern [2] is applied.
678 return NodeSafeName[2].replaceAll(name);
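685 * @param c node whose name, bootstrap and distance fields are formatted
687 * @return the node name (empty when null), optionally the bootstrap value (only values greater than -1 are written), then ":" and the distance when HasDistances is set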
// Formats one non-root node as text: the name passed through nodeName (or
// nothing when it is null), an optional bootstrap value, and a colon plus the
// distance when HasDistances is set.
689 private String printNodeField(SequenceNode c)
691 return ( (c.getName() == null) ? "" : nodeName(c.getName())) +
// Only bootstrap values above -1 are written; a space separates the value
// from a preceding name.
693 ? ( (c.getBootstrap() > -1) ? ((c.getName()!=null ? " " : "")+ c.getBootstrap()) : "") : "") +
694 ( (HasDistances) ? (":" + c.dist) : "");
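700 * @param root the root node
702 * @return the root's name, optional bootstrap and, when RootHasDistance is set, ":" and its distance; an empty string when printRootInfo is false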
// Formats the root node as text, or returns an empty string when
// printRootInfo is false. The distance part depends on RootHasDistance
// rather than HasDistances.
704 private String printRootField(SequenceNode root)
706 return (printRootInfo)
707 ? ( ( (root.getName() == null) ? "" : nodeName(root.getName())) +
709 ? ( (root.getBootstrap() > -1) ? ((root.getName()!=null ? " " : "")+
// NOTE(review): the previous line already ends with a plus, so the leading
// plus here is presumably parsed as a unary plus on the bootstrap value; the
// output would be unchanged but it looks unintentional - confirm.
710 + root.getBootstrap()) : "") :
712 ( (RootHasDistance) ? (":" + root.dist) : "")) : "";
715 // Non recursive call deals with root node properties
// Writes the tree starting at root into tf. Children are written through
// _print; the fields of the root itself come from printRootField.
716 public void print(StringBuffer tf, SequenceNode root)
// A leaf root is written as just its root field, and only when root info is
// enabled.
720 if (root.isLeaf() && printRootInfo)
722 tf.append(printRootField(root));
728 _print(tf, (SequenceNode) root.right());
729 _print(tf, (SequenceNode) root.left());
734 _print(tf, (SequenceNode) root.right());
736 if (root.left() != null)
741 _print(tf, (SequenceNode) root.left());
// Close the group and append the fields of the root node.
742 tf.append(")" + printRootField(root));
748 // Recursive call for non-root nodes
// Writes node c and its descendants into tf, calling itself for the left and
// right children and using printNodeField for the fields of c.
749 public void _print(StringBuffer tf, SequenceNode c)
755 tf.append(printNodeField(c));
761 _print(tf, (SequenceNode) c.left());
762 if (c.left() != null)
766 _print(tf, (SequenceNode) c.right());
771 _print(tf, (SequenceNode) c.right());
773 if (c.left() != null)
778 _print(tf, (SequenceNode) c.left());
// Close the group and append the fields of this node.
779 tf.append(")" + printNodeField(c));
// Command line test driver: expects exactly one argument, the name of a
// newick tree file. It reads the file text, builds a NewickFile from the same
// file name, then prints the tree with each combination of flags.
786 public static void main(String[] args)
790 if (args == null || args.length != 1)
793 "Takes one argument - file name of a newick tree file.");
797 File fn = new File(args[0]);
// Read the raw file text line by line so the original can be echoed below.
799 StringBuffer newickfile = new StringBuffer();
800 BufferedReader treefile = new BufferedReader(new FileReader(fn));
803 while ( (l = treefile.readLine()) != null)
805 newickfile.append(l);
809 System.out.println("Read file :\n");
// Build the tree from the same file, with the source type given as File.
811 NewickFile trf = new NewickFile(args[0], "File");
813 System.out.println("Original file :\n");
// Strip newlines from the raw text before echoing it.
815 com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");
816 System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");
818 System.out.println("Parsed file.\n");
819 System.out.println("Default output type for original input.\n");
820 System.out.println(trf.print());
821 System.out.println("Without bootstraps.\n");
822 System.out.println(trf.print(false));
823 System.out.println("Without distances.\n");
824 System.out.println(trf.print(true, false));
// NOTE(review): distanecs in the next two messages is a typo for distances;
// left unchanged here because it is runtime output.
825 System.out.println("Without bootstraps but with distanecs.\n");
826 System.out.println(trf.print(false, true));
827 System.out.println("Without bootstraps or distanecs.\n");
828 System.out.println(trf.print(false, false));
829 System.out.println("With bootstraps and with distances.\n");
830 System.out.println(trf.print(true, true));
// Read or parse failures are reported on standard error.
832 catch (java.io.IOException e)
834 System.err.println("Exception\n" + e);