From b13bd9e539bc62536bcd723b0682914a133b0c5a Mon Sep 17 00:00:00 2001 From: jprocter Date: Wed, 22 Aug 2007 12:50:47 +0000 Subject: [PATCH] fixed regex and made 'improper' support for parsing NHX style trees TODO: FIX THIS git-svn-id: https://svn.lifesci.dundee.ac.uk/svn/repository/trunk@442 be28352e-c001-0410-b1a7-c7978e42abec --- .../ac/vamsas/objects/utils/trees/NewickFile.java | 301 ++++++++++---------- 1 file changed, 146 insertions(+), 155 deletions(-) diff --git a/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java b/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java index f8e49aa..3c41744 100644 --- a/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java +++ b/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java @@ -27,7 +27,6 @@ // TODO: Extended SequenceNodeI to hold parsed NHX strings package uk.ac.vamsas.objects.utils.trees; - import java.io.*; import java.util.Enumeration; import java.util.Hashtable; @@ -55,7 +54,7 @@ public class NewickFile { private boolean RootHasDistance = false; // File IO Flags - boolean ReplaceUnderscores = false; + boolean ReplaceUnderscores = true; boolean printRootInfo = false; @@ -82,11 +81,14 @@ public class NewickFile { newickFile = inStr; parse(); } + public NewickFile(File inFile) throws IOException { - errormessage = "Problem's reading file "+inFile; - dataIn = new java.io.BufferedReader(new InputStreamReader(new java.io.FileInputStream(inFile))); + errormessage = "Problem's reading file " + inFile; + dataIn = new java.io.BufferedReader(new InputStreamReader( + new java.io.FileInputStream(inFile))); parse(); } + /** * Creates a new NewickFile object. * @@ -196,24 +198,29 @@ public class NewickFile { public boolean HasRootDistance() { return RootHasDistance; } + /* * hacked out of jalview code */ boolean error; + String errormessage; - java.io.BufferedReader dataIn=null; + + java.io.BufferedReader dataIn = null; + public String nextLine() throws IOException { - if (dataIn==null && newickFile==null) - throw new IOException("IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string."); - if (dataIn==null) - { + if (dataIn == null && newickFile == null) + throw new IOException( + "IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string."); + if (dataIn == null) { dataIn = new BufferedReader(new StringReader(newickFile)); - error=false; + error = false; } if (!error) return dataIn.readLine(); throw new IOException("Invalid Source Stream:" + errormessage); } + /** * call this to convert the newick string into a binary node linked tree * @@ -222,8 +229,7 @@ public class NewickFile { */ public void parse() throws IOException { String nf; - if (newickFile==null) - { + if (newickFile == null) { // fill nf with complete tree file StringBuffer file = new StringBuffer(); @@ -233,11 +239,9 @@ public class NewickFile { } nf = file.toString(); - } else - { + } else { nf = newickFile; } - root = new SequenceNode(); @@ -252,37 +256,23 @@ public class NewickFile { String nodename = null; float DefDistance = (float) 0.001; // @param Default distance for a node - - // very very small + // very very small int DefBootstrap = 0; // @param Default bootstrap for a node float distance = DefDistance; int bootstrap = DefBootstrap; boolean ascending = false; // flag indicating that we are leaving the - // current node + // current node - Pattern majorsyms = Pattern.compile( - "[(\\['),;]"); + Pattern majorsyms = Pattern.compile("[(\\['),;]"); Matcher mjsyms = majorsyms.matcher(nf); + char schar; while (mjsyms.find(cp) && (Error == null)) { int fcp = mjsyms.start(); - switch (nf.charAt(fcp)) { - case '[': // Comment or structured/extended NH format info - - - if (nf.indexOf(']',fcp)>-1) { - // Skip the comment field - cp = nf.indexOf(']',fcp); - } else { - Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf); - } - - ; - - break; - + switch (schar = nf.charAt(fcp)) { case '(': // ascending should not be set @@ -338,19 +328,34 @@ public class NewickFile { break; - case ';': + default: + int nextcp = 0; + // Skip Comment or structured/extended NH format info + if (schar == '[') { + if ((nextcp=nf.indexOf(']', fcp)) > -1) { + // Skip the comment field + // should advance fcp too here + nextcp++; + //fcp = nextcp; + //schar = nf.charAt(fcp); + } else { + Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf); + nextcp = 0; + break; + } + ; + } - if (d != -1) { + // Reached termininating root node label. + if (schar == ';' && d != -1) { Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d + ")", 7, fcp, nf); } - // cp advanced at the end of default - default: - // Parse simpler field strings String fstring = nf.substring(cp, fcp); - Matcher uqnodename = Pattern.compile("\\b([^' :;\\](),]+)").matcher(fstring); + Matcher uqnodename = Pattern.compile("^([^' :;\\](),]+).*").matcher( + fstring); if (uqnodename.matches() && ((uqnodename.start(1) == 0) || (fstring.charAt(uqnodename .start(1) - 1) != ':'))) // JBPNote HACK! @@ -366,12 +371,11 @@ public class NewickFile { "File has broken algorithm - overwritten nodename", 10, fcp, nf); } } - - Matcher nbootstrap = Pattern.compile("\\S+([0-9+]+)\\S*:").matcher(fstring); + Matcher nbootstrap = Pattern.compile("\\S+([0-9+]+)\\S*:").matcher( + fstring); - if (nbootstrap.matches() - && (nbootstrap.start(1) > uqnodename.end(1))) { + if (nbootstrap.matches() && (nbootstrap.start(1) > uqnodename.end(1))) { try { bootstrap = (new Integer(nbootstrap.group(1))).intValue(); HasBootstrap = true; @@ -381,8 +385,7 @@ public class NewickFile { } } - Matcher ndist = Pattern.compile( - ":([-0-9Ee.+]+)").matcher(fstring); + Matcher ndist = Pattern.compile(":([-0-9Ee.+]+)").matcher(fstring); boolean nodehasdistance = false; if (ndist.matches()) { @@ -405,8 +408,8 @@ public class NewickFile { c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap); if (c == realroot) { RootHasDistance = nodehasdistance; // JBPNote This is really - // UGLY!!! Ensure root node gets - // its given distance + // UGLY!!! Ensure root node gets + // its given distance } } else { // Find a place to put the leaf @@ -464,8 +467,10 @@ public class NewickFile { nodename = null; distance = DefDistance; bootstrap = DefBootstrap; - - cp = fcp + 1; + if (nextcp == 0) + cp = fcp + 1; + else + cp = nextcp; } } @@ -488,12 +493,14 @@ public class NewickFile { public SequenceNode getTree() { return root; } - public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames(String[] names, Vobject[] boundObjects) - { + + public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames( + String[] names, Vobject[] boundObjects) { // todo! // also - need to reconstruct a names object id mapping (or BInaryNode) mapping for the parsed tree file return null; } + /** * Generate a newick format tree according to internal flags for bootstraps, * distances and root distances. @@ -615,7 +622,8 @@ public class NewickFile { */ private String nodeName(String name) { if (NodeSafeName[0].matcher(name).find()) { - return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + QuoteChar; // quite + return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + + QuoteChar; // quite } else { return NodeSafeName[2].matcher(name).replaceAll("_"); // whitespace } @@ -632,7 +640,7 @@ public class NewickFile { private String printNodeField(SequenceNode c) { return //c.getNewickNodeName() ((c.getName() == null) ? "" : nodeName(c.getName())) - + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (" " + c.getBootstrap()) + + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (" " + c.getBootstrap()) : "") : "") + ((HasDistances) ? (":" + c.dist) : ""); } @@ -729,7 +737,9 @@ public class NewickFile { // trf.parse(); System.out.println("Original file :\n"); - System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()).replaceAll("") + "\n"); + System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()) + .replaceAll("") + + "\n"); System.out.println("Parsed file.\n"); System.out.println("Default output type for original input.\n"); @@ -747,20 +757,19 @@ public class NewickFile { System.out.println("leaves.\n"); Vector lvs = new Vector(); trf.findLeaves(trf.root, lvs); - Enumeration lv =lvs.elements(); - while (lv.hasMoreElements()) - { + Enumeration lv = lvs.elements(); + while (lv.hasMoreElements()) { BinaryNode leave = (BinaryNode) lv.nextElement(); - if (leave.getName()!=null) - { - System.out.println("Node:'"+leave.getName()+"'"); + if (leave.getName() != null) { + System.out.println("Node:'" + leave.getName() + "'"); } - } + } } catch (java.io.IOException e) { System.err.println("Exception\n" + e); e.printStackTrace(); } } + /** * Search for leaf nodes. * @@ -769,27 +778,23 @@ public class NewickFile { * * @return Vector of leaf nodes on binary tree */ - public Vector findLeaves(SequenceNode node, Vector leaves) - { - if (node == null) - { + public Vector findLeaves(SequenceNode node, Vector leaves) { + if (node == null) { return leaves; } - if ( (node.left() == null) && (node.right() == null)) // Interior node detection + if ((node.left() == null) && (node.right() == null)) // Interior node detection { leaves.addElement(node); return leaves; - } - else - { -/* TODO: Identify internal nodes... if (node.isSequenceLabel()) - { - leaves.addElement(node); - }*/ - findLeaves( (SequenceNode) node.left(), leaves); - findLeaves( (SequenceNode) node.right(), leaves); + } else { + /* TODO: Identify internal nodes... if (node.isSequenceLabel()) + { + leaves.addElement(node); + }*/ + findLeaves((SequenceNode) node.left(), leaves); + findLeaves((SequenceNode) node.right(), leaves); } return leaves; @@ -803,30 +808,26 @@ public class NewickFile { public Treenode[] makeTreeNodes() { return makeTreeNodes(true); } + /** * make treenode vector for a parsed tree with/out leaf node associations * @param ignoreplaceholders if true means only associated nodes are returned * @return treenode vector for associated or all leaves */ - public Treenode[] makeTreeNodes(boolean ignoreplaceholders) { + public Treenode[] makeTreeNodes(boolean ignoreplaceholders) { Vector leaves = new Vector(); findLeaves(root, leaves); Vector tnv = new Vector(); Enumeration l = leaves.elements(); Hashtable nodespecs = new Hashtable(); - while (l.hasMoreElements()) - { + while (l.hasMoreElements()) { BinaryNode tnode = (BinaryNode) l.nextElement(); - if (tnode instanceof SequenceNode) - { - if (!(ignoreplaceholders && ((SequenceNode) tnode).isPlaceholder())) - { + if (tnode instanceof SequenceNode) { + if (!(ignoreplaceholders && ((SequenceNode) tnode).isPlaceholder())) { Object assocseq = ((SequenceNode) tnode).element(); - if (assocseq instanceof Vobject) - { + if (assocseq instanceof Vobject) { Vobject vobj = (Vobject) assocseq; - if (vobj!=null) - { + if (vobj != null) { Treenode node = new Treenode(); node.setNodespec(makeNodeSpec(nodespecs, tnode)); node.setName(tnode.getName()); @@ -834,35 +835,35 @@ public class NewickFile { vr.addRefs(vobj); node.addVref(vr); tnv.addElement(node); - } - else - { - System.err.println("WARNING: Unassociated treeNode "+tnode.element().toString()+" " - +((tnode.getName()!=null) ? " label "+tnode.getName() : "")); + } else { + System.err.println("WARNING: Unassociated treeNode " + + tnode.element().toString() + + " " + + ((tnode.getName() != null) ? " label " + tnode.getName() + : "")); } } } } } - if (tnv.size()>0) - { + if (tnv.size() > 0) { Treenode[] tn = new Treenode[tnv.size()]; - tnv.copyInto(tn); + tnv.copyInto(tn); return tn; } return new Treenode[] {}; } - private String makeNodeSpec(Hashtable nodespecs, BinaryNode tnode) - { + + private String makeNodeSpec(Hashtable nodespecs, BinaryNode tnode) { String nname = new String(tnode.getName()); Integer nindx = (Integer) nodespecs.get(nname); - if (nindx==null) - { + if (nindx == null) { nindx = new Integer(1); } - nname = nindx.toString()+" "+nname; + nname = nindx.toString() + " " + nname; return nname; } + /** * call to match up Treenode specs to NJTree parsed from document object. * @@ -871,34 +872,30 @@ public class NewickFile { * as returned from NJTree.findLeaves( .., ..) .. * @return */ - private BinaryNode findNodeSpec(String nodespec, Vector leaves) - { - int occurence=-1; - String nspec = nodespec.substring(nodespec.indexOf(' ')+1); + private BinaryNode findNodeSpec(String nodespec, Vector leaves) { + int occurence = -1; + String nspec = nodespec.substring(nodespec.indexOf(' ') + 1); String oval = nodespec.substring(0, nodespec.indexOf(' ')); try { occurence = new Integer(oval).intValue(); - } - catch (Exception e) - { - System.err.println("Invalid nodespec '"+nodespec+"'"); + } catch (Exception e) { + System.err.println("Invalid nodespec '" + nodespec + "'"); return null; } BinaryNode bn = null; - + int nocc = 0; Enumeration en = leaves.elements(); - while (en.hasMoreElements() && nocc