Formatted source
[jalview.git] / src / jalview / io / NewickFile.java
1 /*
2 * Jalview - A Sequence Alignment Editor and Viewer
3 * Copyright (C) 2005 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version 2
8 * of the License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
18 */\r
19 \r
20 // NewickFile.java\r
21 // Tree I/O\r
22 // http://evolution.genetics.washington.edu/phylip/newick_doc.html\r
23 package jalview.io;\r
24 \r
25 import jalview.datamodel.*;\r
26 \r
27 import java.io.*;\r
28 \r
29 import java.util.*;\r
30 \r
31 \r
32 public class NewickFile extends FileParse {\r
33     SequenceNode root;\r
34     private boolean HasBootstrap = false;\r
35     private boolean HasDistances = false;\r
36     private boolean RootHasDistance = false;\r
37 \r
38     // File IO Flags\r
39     boolean ReplaceUnderscores = false;\r
40     boolean printRootInfo = false;\r
41     private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[] {\r
42             new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for requiring quotes\r
43             new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote characters\r
44             new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unqoted whitespace transformation\r
45         };\r
46     char QuoteChar = '\'';\r
47 \r
48     public NewickFile(String inStr) throws IOException {\r
49         super(inStr, "Paste");\r
50     }\r
51 \r
52     public NewickFile(String inFile, String type) throws IOException {\r
53         super(inFile, type);\r
54     }\r
55 \r
56     public NewickFile(SequenceNode newtree) {\r
57         root = newtree;\r
58     }\r
59 \r
60     public NewickFile(SequenceNode newtree, boolean bootstrap) {\r
61         HasBootstrap = bootstrap;\r
62         root = newtree;\r
63     }\r
64 \r
65     public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances) {\r
66         root = newtree;\r
67         HasBootstrap = bootstrap;\r
68         HasDistances = distances;\r
69     }\r
70 \r
71     public NewickFile(SequenceNode newtree, boolean bootstrap,\r
72         boolean distances, boolean rootdistance) {\r
73         root = newtree;\r
74         HasBootstrap = bootstrap;\r
75         HasDistances = distances;\r
76         RootHasDistance = rootdistance;\r
77     }\r
78 \r
79     private String ErrorStringrange(String Error, String Er, int r, int p,\r
80         String s) {\r
81         return ((Error == null) ? "" : Error) + Er + " at position " + p +\r
82         " ( " +\r
83         s.substring(((p - r) < 0) ? 0 : (p - r),\r
84             ((p + r) > s.length()) ? s.length() : (p + r)) + " )\n";\r
85     }\r
86 \r
87     // @tree annotations\r
88     // These are set automatically by the reader\r
89     public boolean HasBootstrap() {\r
90         return HasBootstrap;\r
91     }\r
92 \r
93     public boolean HasDistances() {\r
94         return HasDistances;\r
95     }\r
96 \r
97     public void parse() throws IOException {\r
98         String nf;\r
99 \r
100         { // fill nf with complete tree file\r
101 \r
102             StringBuffer file = new StringBuffer();\r
103 \r
104             while ((nf = nextLine()) != null) {\r
105                 file.append(nf);\r
106             }\r
107 \r
108             nf = file.toString();\r
109         }\r
110 \r
111         root = new SequenceNode();\r
112 \r
113         SequenceNode realroot = null;\r
114         SequenceNode c = root;\r
115 \r
116         int d = -1;\r
117         int cp = 0;\r
118         int flen = nf.length();\r
119 \r
120         String Error = null;\r
121         String nodename = null;\r
122 \r
123         float DefDistance = (float) 0.00001; // @param Default distance for a node - very very small\r
124         int DefBootstrap = 0; // @param Default bootstrap for a node\r
125 \r
126         float distance = DefDistance;\r
127         int bootstrap = DefBootstrap;\r
128 \r
129         boolean ascending = false; // flag indicating that we are leaving the current node\r
130 \r
131         com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(\r
132                 "[(\\['),;]");\r
133 \r
134         while (majorsyms.searchFrom(nf, cp) && (Error == null)) {\r
135             int fcp = majorsyms.matchedFrom();\r
136 \r
137             switch (nf.charAt(fcp)) {\r
138             case '[': // Comment or structured/extended NH format info\r
139 \r
140                 com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex(\r
141                         "]");\r
142 \r
143                 if (comment.searchFrom(nf, fcp)) {\r
144                     // Skip the comment field\r
145                     cp = 1 + comment.matchedFrom();\r
146                 } else {\r
147                     Error = ErrorStringrange(Error, "Unterminated comment", 3,\r
148                             fcp, nf);\r
149                 }\r
150 \r
151                 ;\r
152 \r
153                 break;\r
154 \r
155             case '(':\r
156 \r
157                 // ascending should not be set\r
158                 // New Internal node\r
159                 if (ascending) {\r
160                     Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);\r
161 \r
162                     continue;\r
163                 }\r
164 \r
165                 ;\r
166                 d++;\r
167 \r
168                 if (c.right() == null) {\r
169                     c.setRight(new SequenceNode(null, c, null, DefDistance,\r
170                             DefBootstrap, false));\r
171                     c = (SequenceNode) c.right();\r
172                 } else {\r
173                     if (c.left() != null) {\r
174                         // Dummy node for polytomy - keeps c.left free for new node\r
175                         SequenceNode tmpn = new SequenceNode(null, c, null, 0,\r
176                                 0, true);\r
177                         tmpn.SetChildren(c.left(), c.right());\r
178                         c.setRight(tmpn);\r
179                     }\r
180 \r
181                     c.setLeft(new SequenceNode(null, c, null, DefDistance,\r
182                             DefBootstrap, false));\r
183                     c = (SequenceNode) c.left();\r
184                 }\r
185 \r
186                 if (realroot == null) {\r
187                     realroot = c;\r
188                 }\r
189 \r
190                 nodename = null;\r
191                 distance = DefDistance;\r
192                 bootstrap = DefBootstrap;\r
193                 cp = fcp + 1;\r
194 \r
195                 break;\r
196 \r
197             // Deal with quoted fields\r
198             case '\'':\r
199 \r
200                 com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(\r
201                         "([^']|'')+'");\r
202 \r
203                 if (qnodename.searchFrom(nf, fcp)) {\r
204                     int nl = qnodename.stringMatched().length();\r
205                     nodename = new String(qnodename.stringMatched().substring(0,\r
206                                 nl - 1));\r
207                     cp = fcp + nl + 1;\r
208                 } else {\r
209                     Error = ErrorStringrange(Error,\r
210                             "Unterminated quotes for nodename", 7, fcp, nf);\r
211                 }\r
212 \r
213                 break;\r
214 \r
215             case ';':\r
216 \r
217                 if (d != -1) {\r
218                     Error = ErrorStringrange(Error,\r
219                             "Wayward semicolon (depth=" + d + ")", 7, fcp, nf);\r
220                 }\r
221 \r
222             // cp advanced at the end of default\r
223             default:\r
224 \r
225                 // Parse simpler field strings\r
226                 String fstring = nf.substring(cp, fcp);\r
227                 com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(\r
228                         "\\b([^' :;\\](),]+)");\r
229                 com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(\r
230                         "\\S+([0-9+]+)\\S*:");\r
231                 com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(\r
232                         ":([-0-9.+]+)");\r
233 \r
234                 if (uqnodename.search(fstring) &&\r
235                         ((uqnodename.matchedFrom(1) == 0) ||\r
236                         (fstring.charAt(uqnodename.matchedFrom(1) - 1) != ':'))) // JBPNote HACK!\r
237                  {\r
238                     if (nodename == null) {\r
239                         if (ReplaceUnderscores) {\r
240                             nodename = uqnodename.stringMatched(1).replace('_',\r
241                                     ' ');\r
242                         } else {\r
243                             nodename = uqnodename.stringMatched(1);\r
244                         }\r
245                     } else {\r
246                         Error = ErrorStringrange(Error,\r
247                                 "File has broken algorithm - overwritten nodename",\r
248                                 10, fcp, nf);\r
249                     }\r
250                 }\r
251 \r
252                 if (nbootstrap.search(fstring) &&\r
253                         (nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) +\r
254                         uqnodename.stringMatched().length()))) {\r
255                     try {\r
256                         bootstrap = (new Integer(nbootstrap.stringMatched(1))).intValue();\r
257                         HasBootstrap = true;\r
258                     } catch (Exception e) {\r
259                         Error = ErrorStringrange(Error,\r
260                                 "Can't parse bootstrap value", 4,\r
261                                 cp + nbootstrap.matchedFrom(), nf);\r
262                     }\r
263                 }\r
264 \r
265                 boolean nodehasdistance = false;\r
266 \r
267                 if (ndist.search(fstring)) {\r
268                     try {\r
269                         distance = (new Float(ndist.stringMatched(1))).floatValue();\r
270                         HasDistances = true;\r
271                         nodehasdistance = true;\r
272                     } catch (Exception e) {\r
273                         Error = ErrorStringrange(Error,\r
274                                 "Can't parse node distance value", 7,\r
275                                 cp + ndist.matchedFrom(), nf);\r
276                     }\r
277                 }\r
278 \r
279                 if (ascending) {\r
280                     // Write node info here\r
281                     c.setName(nodename);\r
282                     c.dist = (HasDistances) ? distance : 0;\r
283                     c.setBootstrap((HasBootstrap) ? bootstrap : 0);\r
284 \r
285                     if (c == realroot) {\r
286                         RootHasDistance = nodehasdistance; // JBPNote This is really UGLY!!!\r
287                     }\r
288                 } else {\r
289                     // Find a place to put the leaf\r
290                     SequenceNode newnode = new SequenceNode(null, c, nodename,\r
291                             (HasDistances) ? distance : DefDistance,\r
292                             (HasBootstrap) ? bootstrap : DefBootstrap, false);\r
293 \r
294                     if (c.right() == null) {\r
295                         c.setRight(newnode);\r
296                     } else {\r
297                         if (c.left() == null) {\r
298                             c.setLeft(newnode);\r
299                         } else {\r
300                             // Insert a dummy node for polytomy\r
301                             SequenceNode newdummy = new SequenceNode(null, c,\r
302                                     null, 0, 0, true);\r
303                             newdummy.SetChildren(c.left(), newnode);\r
304                             c.setLeft(newdummy);\r
305                         }\r
306                     }\r
307                 }\r
308 \r
309                 if (ascending) {\r
310                     // move back up the tree from preceding closure\r
311                     c = c.AscendTree();\r
312 \r
313                     if ((d > -1) && (c == null)) {\r
314                         Error = ErrorStringrange(Error,\r
315                                 "File broke algorithm: Lost place in tree (is there an extra ')' ?)",\r
316                                 7, fcp, nf);\r
317                     }\r
318                 }\r
319 \r
320                 if (nf.charAt(fcp) == ')') {\r
321                     d--;\r
322                     ascending = true;\r
323                 } else {\r
324                     if (nf.charAt(fcp) == ',') {\r
325                         if (ascending) {\r
326                             ascending = false;\r
327                         } else {\r
328                             // Just advance focus, if we need to\r
329                             if ((c.left() != null) && (!c.left().isLeaf())) {\r
330                                 c = (SequenceNode) c.left();\r
331                             }\r
332                         }\r
333                     }\r
334                      // else : We do nothing if ';' is encountered.\r
335                 }\r
336 \r
337                 // Reset new node properties to obvious fakes\r
338                 nodename = null;\r
339                 distance = DefDistance;\r
340                 bootstrap = DefBootstrap;\r
341 \r
342                 cp = fcp + 1;\r
343             }\r
344         }\r
345 \r
346         if (Error != null) {\r
347             throw (new IOException("NewickFile: " + Error + "\n"));\r
348         }\r
349 \r
350         root = (SequenceNode) root.right().detach(); // remove the imaginary root.\r
351 \r
352         if (!RootHasDistance) {\r
353             root.dist = 0;\r
354         }\r
355     }\r
356 \r
357     public SequenceNode getTree() {\r
358         return root;\r
359     }\r
360 \r
361     public String print() {\r
362         synchronized (this) {\r
363             StringBuffer tf = new StringBuffer();\r
364             print(tf, root);\r
365 \r
366             return (tf.append(";").toString());\r
367         }\r
368     }\r
369 \r
370     public String print(boolean withbootstraps) {\r
371         synchronized (this) {\r
372             boolean boots = this.HasBootstrap;\r
373             this.HasBootstrap = withbootstraps;\r
374 \r
375             String rv = print();\r
376             this.HasBootstrap = boots;\r
377 \r
378             return rv;\r
379         }\r
380     }\r
381 \r
382     public String print(boolean withbootstraps, boolean withdists) {\r
383         synchronized (this) {\r
384             boolean dists = this.HasDistances;\r
385             this.HasDistances = withdists;\r
386 \r
387             String rv = print(withbootstraps);\r
388             this.HasDistances = dists;\r
389 \r
390             return rv;\r
391         }\r
392     }\r
393 \r
394     public String print(boolean withbootstraps, boolean withdists,\r
395         boolean printRootInfo) {\r
396         synchronized (this) {\r
397             boolean rootinfo = printRootInfo;\r
398             this.printRootInfo = printRootInfo;\r
399 \r
400             String rv = print(withbootstraps, withdists);\r
401             this.printRootInfo = rootinfo;\r
402 \r
403             return rv;\r
404         }\r
405     }\r
406 \r
407     char getQuoteChar() {\r
408         return QuoteChar;\r
409     }\r
410 \r
411     char setQuoteChar(char c) {\r
412         char old = QuoteChar;\r
413         QuoteChar = c;\r
414 \r
415         return old;\r
416     }\r
417 \r
418     private String nodeName(String name) {\r
419         if (NodeSafeName[0].search(name)) {\r
420             return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;\r
421         } else {\r
422             return NodeSafeName[2].replaceAll(name);\r
423         }\r
424     }\r
425 \r
426     private String printNodeField(SequenceNode c) {\r
427         return ((c.getName() == null) ? "" : nodeName(c.getName())) +\r
428         ((HasBootstrap)\r
429         ? ((c.getBootstrap() > -1) ? (" " + c.getBootstrap()) : "") : "") +\r
430         ((HasDistances) ? (":" + c.dist) : "");\r
431     }\r
432 \r
433     private String printRootField(SequenceNode root) {\r
434         return (printRootInfo)\r
435         ? (((root.getName() == null) ? "" : nodeName(root.getName())) +\r
436         ((HasBootstrap)\r
437         ? ((root.getBootstrap() > -1) ? (" " + root.getBootstrap()) : "") : "") +\r
438         ((RootHasDistance) ? (":" + root.dist) : "")) : "";\r
439     }\r
440 \r
441     // Non recursive call deals with root node properties\r
442     public void print(StringBuffer tf, SequenceNode root) {\r
443         if (root != null) {\r
444             if (root.isLeaf() && printRootInfo) {\r
445                 tf.append(printRootField(root));\r
446             } else {\r
447                 if (root.isDummy()) {\r
448                     _print(tf, (SequenceNode) root.right());\r
449                     _print(tf, (SequenceNode) root.left());\r
450                 } else {\r
451                     tf.append("(");\r
452                     _print(tf, (SequenceNode) root.right());\r
453 \r
454                     if (root.left() != null) {\r
455                         tf.append(",");\r
456                     }\r
457 \r
458                     _print(tf, (SequenceNode) root.left());\r
459                     tf.append(")" + printRootField(root));\r
460                 }\r
461             }\r
462         }\r
463     }\r
464 \r
465     // Recursive call for non-root nodes\r
466     public void _print(StringBuffer tf, SequenceNode c) {\r
467         if (c != null) {\r
468             if (c.isLeaf()) {\r
469                 tf.append(printNodeField(c));\r
470             } else {\r
471                 if (c.isDummy()) {\r
472                     _print(tf, (SequenceNode) c.right());\r
473                     _print(tf, (SequenceNode) c.left());\r
474                 } else {\r
475                     tf.append("(");\r
476                     _print(tf, (SequenceNode) c.right());\r
477 \r
478                     if (c.left() != null) {\r
479                         tf.append(",");\r
480                     }\r
481 \r
482                     _print(tf, (SequenceNode) c.left());\r
483                     tf.append(")" + printNodeField(c));\r
484                 }\r
485             }\r
486         }\r
487     }\r
488 \r
489     // Test\r
490     public static void main(String[] args) {\r
491         try {\r
492             File fn = new File(args[0]);\r
493 \r
494             StringBuffer newickfile = new StringBuffer();\r
495             BufferedReader treefile = new BufferedReader(new FileReader(fn));\r
496             String l;\r
497 \r
498             while ((l = treefile.readLine()) != null) {\r
499                 newickfile.append(l);\r
500             }\r
501 \r
502             treefile.close();\r
503             System.out.println("Read file :\n");\r
504 \r
505             NewickFile trf = new NewickFile(args[0], "File");\r
506             trf.parse();\r
507             System.out.println("Original file :\n");\r
508 \r
509             com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");\r
510             System.out.println(nonl.replaceAll(newickfile.toString()) + "\n");\r
511 \r
512             System.out.println("Parsed file.\n");\r
513             System.out.println("Default output type for original input.\n");\r
514             System.out.println(trf.print());\r
515             System.out.println("Without bootstraps.\n");\r
516             System.out.println(trf.print(false));\r
517             System.out.println("Without distances.\n");\r
518             System.out.println(trf.print(true, false));\r
519             System.out.println("Without bootstraps but with distanecs.\n");\r
520             System.out.println(trf.print(false, true));\r
521             System.out.println("Without bootstraps or distanecs.\n");\r
522             System.out.println(trf.print(false, false));\r
523             System.out.println("With bootstraps and with distances.\n");\r
524             System.out.println(trf.print(true, true));\r
525         } catch (java.io.IOException e) {\r
526             System.err.println("Exception\n" + e);\r
527             e.printStackTrace();\r
528         }\r
529     }\r
530 }\r