Newick/New Hampshire tree format parser and generator.
authorjprocter <Jim Procter>
Thu, 17 Mar 2005 16:13:18 +0000 (16:13 +0000)
committerjprocter <Jim Procter>
Thu, 17 Mar 2005 16:13:18 +0000 (16:13 +0000)
src/jalview/io/NewickFile.java [new file with mode: 0755]

diff --git a/src/jalview/io/NewickFile.java b/src/jalview/io/NewickFile.java
new file mode 100755 (executable)
index 0000000..997903c
--- /dev/null
@@ -0,0 +1,490 @@
+// NewickFile.java
+// Tree I/O
+// http://evolution.genetics.washington.edu/phylip/newick_doc.html
+package jalview.io;
+
+import java.io.*;
+import java.util.*;
+import jalview.datamodel.*;
+
+public class NewickFile extends FileParse
+{
+  SequenceNode root;
+
+  private boolean HasBootstrap = false;
+  private boolean HasDistances = false;
+  private boolean RootHasDistance = false;
+  private String ErrorStringrange(String Error, String Er, int r, int p, String s) {
+    return ((Error==null) ? "" : Error)
+        + Er +
+        " at position "+p+" ( "
+        + s.substring((p-r)<0 ? 0 : (p-r),
+                      (p+r)>s.length() ? s.length() : (p+r))
+        + " )\n";
+  }
+
+  // @tree annotations
+  // These are set automatically by the reader
+  public boolean HasBootstrap() {
+    return HasBootstrap;
+  }
+
+  public boolean HasDistances() {
+    return HasDistances;
+  }
+  public NewickFile(String inStr) throws IOException
+  {
+    super(inStr, "Paste");
+  }
+
+  public NewickFile(String inFile, String type)
+      throws IOException
+  {
+
+    super(inFile, type);
+  }
+
+  public void parse() throws IOException
+  {
+    String nf;
+
+    { // fill nf with complete tree file
+      StringBuffer file = new StringBuffer();
+      while ( (nf = nextLine()) != null)
+      {
+        file.append(nf);
+      }
+      nf = file.toString();
+    }
+
+    root = new SequenceNode();
+    SequenceNode c = root;
+    int d = -1;
+    int cp = 0;
+    int flen = nf.length();
+    String Error = null;
+    String nodename = null;
+    float distance=-99999;
+    int bootstrap=-99999;
+    boolean ascending = false; // flag indicating that we are leaving the current node
+    com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex("[(\\['),;]");
+    while (majorsyms.searchFrom(nf, cp) && Error==null) {
+      int fcp = majorsyms.matchedFrom();
+      switch (nf.charAt(fcp)) {
+
+        case '[': // Comment or structured/extended NH format info
+          com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex("]");
+          if (comment.searchFrom(nf, fcp))
+          {
+            // Skip the comment field
+            cp = 1 + comment.matchedFrom();
+          }
+          else
+          {
+            Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf);
+          }
+          ;
+          break;
+
+        case '(':
+
+          // ascending should not be set
+          // New Internal node
+          if (ascending)
+          {
+            Error = ErrorStringrange(Error, "Unexpected '('", 7, fcp, nf);
+            continue;
+          }
+          ;
+          d++;
+          if (c.right() == null)
+          {
+            c.setRight(new SequenceNode(null, c, 0, null));
+            c = (SequenceNode) c.right();
+          }
+          else
+          {
+            if (c.left() != null)
+            {
+              // Dummy node for polytomy - keeps c.left free for new node
+              SequenceNode tmpn = new SequenceNode(null, c, null, 0, 0, true);
+              tmpn.SetChildren(c.left(), c.right());
+              c.setRight(tmpn);
+            }
+            c.setLeft(new SequenceNode(null, c, 0, null));
+            c = (SequenceNode) c.left();
+          }
+          nodename = null;
+          distance = -99999;
+          bootstrap = -99999;
+          cp = fcp + 1;
+          break;
+
+          // Deal with quoted fields
+        case '\'':
+          com.stevesoft.pat.Regex qnodename = new com.stevesoft.pat.Regex(
+              "([^']|'')+'");
+          if (qnodename.searchFrom(nf, fcp))
+          {
+            int nl = qnodename.stringMatched().length();
+            nodename = new String(qnodename.stringMatched().substring(0, nl - 1));
+            cp = fcp + nl + 1;
+          }
+          else
+          {
+            Error = ErrorStringrange(Error, "Unterminated quotes for nodename",
+                                     7, fcp, nf);
+          }
+          break;
+
+        case ';':
+          if (d != -1)
+          {
+            Error = ErrorStringrange(Error,
+                                     "Wayward semicolon (depth=" + d + ")", 7,
+                                     fcp, nf);
+          }
+          // cp advanced at the end of default
+
+        default:
+
+          // Parse simpler field strings
+          String fstring = nf.substring(cp, fcp);
+          com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
+              "\\b([^' :;\\](),]+)");
+          com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
+              "\\S+([0-9+]+)\\S*:");
+          com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
+              ":([-0-9.+]+)");
+          if (uqnodename.search(fstring)
+              && (uqnodename.matchedFrom(1) == 0
+                  || fstring.charAt(uqnodename.matchedFrom(1) - 1) != ':')) // JBPNote HACK!
+
+          {
+            if (nodename == null)
+            {
+              nodename = uqnodename.stringMatched(1).replace('_', ' ');
+            }
+            else
+            {
+              Error = ErrorStringrange(Error,
+                                       "File has broken algorithm - overwritten nodename",
+                                       10, fcp,
+                                       nf);
+            }
+          }
+          if (nbootstrap.search(fstring)
+              &&
+              nbootstrap.matchedFrom(1)
+              > uqnodename.matchedFrom(1) + uqnodename.stringMatched().length())
+          {
+            try
+            {
+              bootstrap = (new Integer(nbootstrap.stringMatched(1))).intValue();
+              HasBootstrap = true;
+            }
+            catch (Exception e)
+            {
+              Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4,
+                                       cp + nbootstrap.matchedFrom(), nf);
+            }
+          }
+
+          if (ndist.search(fstring))
+          {
+            try
+            {
+              distance = (new Float(ndist.stringMatched(1))).floatValue();
+              HasDistances = true;
+              if (c.parent()==root) {
+                RootHasDistance = true;
+              }
+            }
+            catch (Exception e)
+            {
+              Error = ErrorStringrange(Error, "Can't parse node distance value",
+                                       7, cp + ndist.matchedFrom(), nf);
+            }
+          }
+
+          if (ascending)
+          {
+            // Write node info here
+            c.setName(nodename);
+            c.dist = (HasDistances) ? distance : 0;
+            c.setBootstrap((HasBootstrap) ? bootstrap : 0);
+          }
+          else
+          {
+            // Find a place to put the leaf
+            SequenceNode newnode = new SequenceNode(null, c,
+                (HasDistances) ? distance : 0,
+                nodename);
+
+            newnode.setBootstrap(HasBootstrap ? bootstrap : 0);
+
+            if (c.right() == null)
+            {
+              c.setRight(newnode);
+            }
+            else
+            {
+              if (c.left() == null)
+              {
+                c.setLeft(newnode);
+              }
+              else
+              {
+                // Insert a dummy node for polytomy
+                SequenceNode newdummy = new SequenceNode(null, c, null, 0, 0, true);
+                newdummy.SetChildren(c.left(), newnode);
+                c.setLeft(newdummy);
+              }
+            }
+          }
+          if (ascending)
+          {
+            // move back up the tree from preceding closure
+            c = c.AscendTree();
+            if (d > -1 && c == null)
+            {
+              Error = ErrorStringrange(Error,
+                                       "File broke algorithm: Lost place in tree (is there an extra ')' ?)",
+                                       7, fcp, nf);
+            }
+          }
+
+          if (nf.charAt(fcp) == ')') {
+            d--;
+            ascending = true;
+          } else {
+            if (nf.charAt(fcp) == ',') {
+              if (ascending) {
+               ascending = false;
+             } else {
+               // Just advance focus, if we need to
+               if (c.left()!=null && (!c.left().isLeaf())) {
+                 c = (SequenceNode) c.left();
+               }
+             }
+            } // else : We do nothing if ';' is encountered.
+          }
+          // Reset new node properties to obvious fakes
+          nodename = null;
+          distance = -99999;
+          bootstrap = -99999;
+
+          cp=fcp+1;
+      }
+    }
+
+    if (Error!=null) {
+      throw(new IOException("NewickFile: "+Error+"\n"));
+    }
+
+    root = (SequenceNode) root.right().detach(); // remove the imaginary root.
+  }
+
+  public NewickFile(SequenceNode newtree) {
+    root = newtree;
+  }
+
+  public NewickFile(SequenceNode newtree, boolean bootstrap) {
+    HasBootstrap = bootstrap;
+    root=newtree;
+  }
+  public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances) {
+    root = newtree;
+    HasBootstrap = bootstrap;
+    HasDistances = distances;
+  }
+
+  public NewickFile(SequenceNode newtree, boolean bootstrap, boolean distances, boolean rootdistance) {
+      root = newtree;
+      HasBootstrap = bootstrap;
+      HasDistances = distances;
+      RootHasDistance = rootdistance;
+    }
+
+  public SequenceNode getTree() {
+    return root;
+  }
+
+  public String print() {
+    synchronized (this) {
+      StringBuffer tf = new StringBuffer();
+      print(tf, root);
+      return (tf.append(";").toString());
+    }
+  }
+
+  public String print(boolean withbootstraps) {
+    synchronized(this) {
+      boolean boots = this.HasBootstrap;
+      this.HasBootstrap = withbootstraps;
+      String rv = print();
+      this.HasBootstrap = boots;
+      return rv;
+    }
+  }
+
+  public String print(boolean withbootstraps, boolean withdists) {
+    synchronized(this) {
+      boolean dists = this.HasDistances;
+      this.HasDistances = withdists;
+      String rv = print(withbootstraps);
+      this.HasDistances = dists;
+      return rv;
+    }
+  }
+  boolean printRootInfo = false;
+
+
+  public String print(boolean withbootstraps, boolean withdists, boolean printRootInfo) {
+    synchronized(this) {
+      boolean rootinfo = printRootInfo;
+      this.printRootInfo = printRootInfo;
+      String rv = print(withbootstraps, withdists);
+      this.printRootInfo = rootinfo;
+      return rv;
+    }
+  }
+  private com.stevesoft.pat.Regex[] NodeSafeName =
+      new com.stevesoft.pat.Regex[]
+      { new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for requiring quotes
+      new com.stevesoft.pat.Regex().perlCode("s/'/''/"), // escaping quote characters
+      new com.stevesoft.pat.Regex().perlCode("s/\\/w/_/") // unqoted whitespace transformation
+  };
+  char QuoteChar='\'';
+  char getQuoteChar() {
+    return QuoteChar;
+  }
+
+  char setQuoteChar(char c) {
+    char old = QuoteChar;
+    QuoteChar = c;
+    return old;
+  }
+
+  private String nodeName(String name) {
+    if (NodeSafeName[0].search(name)) {
+      return QuoteChar + NodeSafeName[1].replaceAll(name) + QuoteChar;
+    } else {
+      return NodeSafeName[2].replaceAll(name);
+    }
+  }
+  private String printNodeField(SequenceNode c) {
+
+    return ( (c.getName() == null) ? ""
+             : nodeName(c.getName()))
+        + ( (HasBootstrap)
+            ? ( (c.getBootstrap() > -1)
+                ? " " + c.getBootstrap()
+                : "")
+            : "")
+        + ( (HasDistances)
+            ? ":" + c.dist : "");
+  }
+  private String printRootField(SequenceNode root) {
+
+      return (printRootInfo)
+          ? (( (root.getName() == null) ? ""
+               : nodeName(root.getName()))
+             + ( (HasBootstrap)
+                 ? ( (root.getBootstrap() > -1)
+                     ? " " + root.getBootstrap()
+                     : "")
+                 : "")
+             + ( (RootHasDistance)
+                 ? ":" + root.dist : ""))
+          : "";
+    }
+
+// Non recursive call deals with root node properties
+  public void print(StringBuffer tf, SequenceNode root) {
+    if (root!=null) {
+      if (root.isLeaf() && printRootInfo) {
+        tf.append(printRootField(root));
+      } else {
+        if (root.isDummy()) {
+          _print(tf, (SequenceNode) root.right());
+          _print(tf, (SequenceNode) root.left());
+        } else {
+          tf.append("(");
+          _print(tf, (SequenceNode) root.right());
+          if (root.left() != null) {
+            tf.append(",");
+          }
+          _print(tf, (SequenceNode) root.left());
+          tf.append(")" + printRootField(root));
+        }
+      }
+    }
+  }
+
+  // Recursive call for non-root nodes
+  public void _print(StringBuffer tf, SequenceNode c) {
+    if (c!=null) {
+      if (c.isLeaf()) {
+        tf.append(printNodeField(c));
+      } else {
+        if (c.isDummy()) {
+          _print(tf, (SequenceNode) c.right());
+          _print(tf, (SequenceNode) c.left());
+        } else {
+          tf.append("(");
+          _print(tf, (SequenceNode) c.right());
+          if (c.left() != null) {
+            tf.append(",");
+          }
+          _print(tf, (SequenceNode) c.left());
+          tf.append(")" + printNodeField(c));
+        }
+      }
+    }
+  }
+
+// Test
+  public static void main(String[] args)
+{
+  try
+  {
+    File fn = new File(args[0]);
+
+    StringBuffer newickfile =  new StringBuffer();
+    BufferedReader treefile =
+        new BufferedReader(new FileReader(fn));
+    String l;
+    while ((l = treefile.readLine())!=null) {
+      newickfile.append(l);
+    }
+    treefile.close();
+    System.out.println("Read file :\n");
+    NewickFile trf = new NewickFile(args[0], "File");
+    trf.parse();
+    System.out.println("Original file :\n");
+    com.stevesoft.pat.Regex nonl = new com.stevesoft.pat.Regex("\n+", "");
+    System.out.println(nonl.replaceAll(newickfile.toString())+"\n");
+
+    System.out.println("Parsed file.\n");
+    System.out.println("Default output type for original input.\n");
+    System.out.println(trf.print());
+    System.out.println("Without bootstraps.\n");
+    System.out.println(trf.print(false));
+    System.out.println("Without distances.\n");
+    System.out.println(trf.print(true,false));
+    System.out.println("Without bootstraps but with distanecs.\n");
+    System.out.println(trf.print(false, true));
+    System.out.println("Without bootstraps or distanecs.\n");
+    System.out.println(trf.print(false, false));
+    System.out.println("With bootstraps and with distances.\n");
+    System.out.println(trf.print(true, true));
+  }
+  catch (java.io.IOException e)
+  {
+    System.out.println("Exception\n" + e);
+    e.printStackTrace();
+  }
+}
+
+}