X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=src%2Fuk%2Fac%2Fvamsas%2Fobjects%2Futils%2Ftrees%2FNewickFile.java;h=d38c99aa381db696c3fbfcb9c7f63982e60638ed;hb=844ccad5a3fcbedec17b2af66d460f31abc7cff1;hp=2e1e893522d15fed6e7235853fe59c1e1ee27d0a;hpb=c5b324d5bb93fd421347f87ffe59bd6efe076d1e;p=vamsas.git diff --git a/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java b/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java index 2e1e893..d38c99a 100644 --- a/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java +++ b/src/uk/ac/vamsas/objects/utils/trees/NewickFile.java @@ -1,24 +1,24 @@ /* - * originally from + * This file is part of the Vamsas Client version 0.1. + * Copyright 2009 by Jim Procter, Iain Milne, Pierre Marguerite, + * Andrew Waterhouse and Dominik Lindner. * - * Jalview - A Sequence Alignment Editor and Viewer - * Copyright (C) 2007 AM Waterhouse, J Procter, G Barton, M Clamp, S Searle - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, + * Earlier versions have also been incorporated into Jalview version 2.4 + * since 2008, and TOPALi version 2 since 2007. + * + * The Vamsas Client is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * The Vamsas Client is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with the Vamsas Client. If not, see . */ - // NewickFile.java // Tree I/O // http://evolution.genetics.washington.edu/phylip/newick_doc.html @@ -27,7 +27,6 @@ // TODO: Extended SequenceNodeI to hold parsed NHX strings package uk.ac.vamsas.objects.utils.trees; - import java.io.*; import java.util.Enumeration; import java.util.Hashtable; @@ -55,14 +54,14 @@ public class NewickFile { private boolean RootHasDistance = false; // File IO Flags - boolean ReplaceUnderscores = false; + boolean ReplaceUnderscores = true; - boolean printRootInfo = false; + boolean printRootInfo = true; // in case root has anything to preserve private Pattern[] NodeSafeName = new Pattern[] { Pattern.compile("[\\[,:'()]"), // test for requiring quotes Pattern.compile("'"), // escaping quote characters - Pattern.compile("/w") // unqoted whitespace transformation + Pattern.compile("\\s") // unqoted whitespace transformation }; char QuoteChar = '\''; @@ -70,23 +69,26 @@ public class NewickFile { String newickFile = null; /** - * Creates a new NewickFile object. + * Creates a new NewickFile object * * @param inStr - * DOCUMENT ME! + * Newick style tree string * * @throws IOException - * DOCUMENT ME! + * if string is not a valid newick file */ public NewickFile(String inStr) throws IOException { newickFile = inStr; parse(); } + public NewickFile(File inFile) throws IOException { - errormessage = "Problem's reading file "+inFile; - dataIn = new java.io.BufferedReader(new InputStreamReader(new java.io.FileInputStream(inFile))); + errormessage = "Problem's reading file " + inFile; + dataIn = new java.io.BufferedReader(new InputStreamReader( + new java.io.FileInputStream(inFile))); parse(); } + /** * Creates a new NewickFile object. * @@ -196,34 +198,40 @@ public class NewickFile { public boolean HasRootDistance() { return RootHasDistance; } + /* * hacked out of jalview code */ boolean error; + String errormessage; - java.io.BufferedReader dataIn=null; + + java.io.BufferedReader dataIn = null; + public String nextLine() throws IOException { - if (dataIn==null) - { + if (dataIn == null && newickFile == null) + throw new IOException( + "IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string."); + if (dataIn == null) { dataIn = new BufferedReader(new StringReader(newickFile)); - error=false; + error = false; } - else - throw new IOException("IMPLEMENTATION ERROR: NewickFile has not been initialised for reading a newick string."); if (!error) return dataIn.readLine(); throw new IOException("Invalid Source Stream:" + errormessage); } + /** - * call this to convert the newick string into a binary node linked tree + * call this to convert the newick string into a binary node linked tree Note: + * this is automatically called by the constructors, so you normally wouldn't + * need to use this. * * @throws IOException * if the newick string cannot be parsed. */ public void parse() throws IOException { String nf; - if (newickFile==null) - { + if (newickFile == null) { // fill nf with complete tree file StringBuffer file = new StringBuffer(); @@ -233,11 +241,9 @@ public class NewickFile { } nf = file.toString(); - } else - { + } else { nf = newickFile; } - root = new SequenceNode(); @@ -252,37 +258,25 @@ public class NewickFile { String nodename = null; float DefDistance = (float) 0.001; // @param Default distance for a node - - // very very small - int DefBootstrap = 0; // @param Default bootstrap for a node + // very very small + int DefBootstrap = -1; // @param Default bootstrap for a node float distance = DefDistance; int bootstrap = DefBootstrap; boolean ascending = false; // flag indicating that we are leaving the - // current node + // current node - Pattern majorsyms = Pattern.compile( - "[(\\['),;]"); + Pattern majorsyms = Pattern.compile("[(\\['),;]"); Matcher mjsyms = majorsyms.matcher(nf); + char schar; + int nextcp = 0; + int ncp = cp; while (mjsyms.find(cp) && (Error == null)) { int fcp = mjsyms.start(); - switch (nf.charAt(fcp)) { - case '[': // Comment or structured/extended NH format info - - - if (nf.indexOf(']',fcp)>-1) { - // Skip the comment field - cp = nf.indexOf(']',fcp); - } else { - Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf); - } - - ; - - break; - + switch (schar = nf.charAt(fcp)) { case '(': // ascending should not be set @@ -338,19 +332,42 @@ public class NewickFile { break; - case ';': - - if (d != -1) { + default: + // Reached termininating root node label. + if (schar == ';' && d != -1) { Error = ErrorStringrange(Error, "Wayward semicolon (depth=" + d + ")", 7, fcp, nf); } - // cp advanced at the end of default - default: + // Skip Comment or structured/extended NH format info + if (schar == '[') { + if ((nextcp = nf.indexOf(']', fcp)) > -1) { + // verified that comment is properly terminated. + // now skip the comment field + nextcp++; + break; // go and search for the next node separator, leaving ncp at + // beginning of node info + } else { + Error = ErrorStringrange(Error, "Unterminated comment", 3, fcp, nf); + nextcp = 0; + break; + } + } - // Parse simpler field strings - String fstring = nf.substring(cp, fcp); - Matcher uqnodename = Pattern.compile("\\b([^' :;\\](),]+)").matcher(fstring); + // Parse simpler field strings from substring between ncp and node + // separator + String fstring = nf.substring(ncp, fcp); + // extract any comments from the nodeinfo. + while (fstring.indexOf(']') > -1) { + int cstart = fstring.indexOf('['); + int cend = fstring.indexOf(']'); + String comment = fstring.substring(cstart + 1, cend); // TODO: put + // this + // somewhere ? + fstring = fstring.substring(0, cstart) + fstring.substring(cend + 1); + } + Matcher uqnodename = Pattern.compile("^([^' :;\\](),]+).*").matcher( + fstring); if (uqnodename.matches() && ((uqnodename.start(1) == 0) || (fstring.charAt(uqnodename .start(1) - 1) != ':'))) // JBPNote HACK! @@ -366,23 +383,27 @@ public class NewickFile { "File has broken algorithm - overwritten nodename", 10, fcp, nf); } } - - Matcher nbootstrap = Pattern.compile("\\S+([0-9+]+)\\S*:").matcher(fstring); + Matcher nbootstrap = Pattern.compile("\\s*([+0-9]+)\\s*:.*").matcher( + fstring); - if (nbootstrap.matches() - && (nbootstrap.start(1) > uqnodename.end(1))) { - try { - bootstrap = (new Integer(nbootstrap.group(1))).intValue(); - HasBootstrap = true; - } catch (Exception e) { - Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4, - cp + nbootstrap.start(0), nf); + if (nbootstrap.matches()) { + if (nodename != null && nbootstrap.group(1).equals(nodename)) { + nodename = null; // empty nodename - only bootstrap value + } + if ((nodename == null || nodename.length() == 0) + || nbootstrap.start(1) >= uqnodename.end(1)) { + try { + bootstrap = (new Integer(nbootstrap.group(1))).intValue(); + HasBootstrap = true; + } catch (Exception e) { + Error = ErrorStringrange(Error, "Can't parse bootstrap value", 4, + ncp + nbootstrap.start(0), nf); + } } } - Matcher ndist = Pattern.compile( - ":([-0-9Ee.+]+)").matcher(fstring); + Matcher ndist = Pattern.compile(".*:([-0-9Ee.+]+)").matcher(fstring); boolean nodehasdistance = false; if (ndist.matches()) { @@ -392,7 +413,7 @@ public class NewickFile { nodehasdistance = true; } catch (Exception e) { Error = ErrorStringrange(Error, "Can't parse node distance value", - 7, cp + ndist.start(0), nf); + 7, ncp + ndist.start(0), nf); } } @@ -405,8 +426,8 @@ public class NewickFile { c.setBootstrap((HasBootstrap) ? bootstrap : DefBootstrap); if (c == realroot) { RootHasDistance = nodehasdistance; // JBPNote This is really - // UGLY!!! Ensure root node gets - // its given distance + // UGLY!!! Ensure root node gets + // its given distance } } else { // Find a place to put the leaf @@ -456,16 +477,19 @@ public class NewickFile { } } } - - // else : We do nothing if ';' is encountered. } // Reset new node properties to obvious fakes nodename = null; distance = DefDistance; bootstrap = DefBootstrap; - - cp = fcp + 1; + } + // Advance character pointers if necessary + if (nextcp == 0) { + ncp = cp = fcp + 1; + } else { + cp = nextcp; + nextcp = 0; } } @@ -488,12 +512,15 @@ public class NewickFile { public SequenceNode getTree() { return root; } - public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames(String[] names, Vobject[] boundObjects) - { + + public uk.ac.vamsas.objects.core.Treenode[] matchTreeNodeNames( + String[] names, Vobject[] boundObjects) { // todo! - // also - need to reconstruct a names object id mapping (or BInaryNode) mapping for the parsed tree file + // also - need to reconstruct a names object id mapping (or BInaryNode) + // mapping for the parsed tree file return null; } + /** * Generate a newick format tree according to internal flags for bootstraps, * distances and root distances. @@ -615,7 +642,8 @@ public class NewickFile { */ private String nodeName(String name) { if (NodeSafeName[0].matcher(name).find()) { - return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + QuoteChar; // quite + return QuoteChar + NodeSafeName[1].matcher(name).replaceAll("''") + + QuoteChar; // quite } else { return NodeSafeName[2].matcher(name).replaceAll("_"); // whitespace } @@ -630,10 +658,12 @@ public class NewickFile { * @return DOCUMENT ME! */ private String printNodeField(SequenceNode c) { - return //c.getNewickNodeName() + return // c.getNewickNodeName() ((c.getName() == null) ? "" : nodeName(c.getName())) - + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (" " + c.getBootstrap()) - : "") : "") + ((HasDistances) ? (":" + c.dist) : ""); + + ((HasBootstrap) ? ((c.getBootstrap() > -1) ? (((c.getName() == null) ? " " + : "") + c.getBootstrap()) + : "") + : "") + ((HasDistances) ? (":" + c.dist) : ""); } /** @@ -647,9 +677,10 @@ public class NewickFile { private String printRootField(SequenceNode root) { return (printRootInfo) ? (((root.getName() == null) ? "" : nodeName(root .getName())) - + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? (" " + root - .getBootstrap()) : "") : "") + ((RootHasDistance) ? (":" + root.dist) - : "")) + + ((HasBootstrap) ? ((root.getBootstrap() > -1) ? ((root.getName() != null ? " " + : "") + root.getBootstrap()) + : "") + : "") + ((RootHasDistance) ? (":" + root.dist) : "")) : ""; } @@ -729,7 +760,9 @@ public class NewickFile { // trf.parse(); System.out.println("Original file :\n"); - System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()).replaceAll("") + "\n"); + System.out.println(Pattern.compile("\n+").matcher(newickfile.toString()) + .replaceAll("") + + "\n"); System.out.println("Parsed file.\n"); System.out.println("Default output type for original input.\n"); @@ -744,69 +777,87 @@ public class NewickFile { System.out.println(trf.print(false, false)); System.out.println("With bootstraps and with distances.\n"); System.out.println(trf.print(true, true)); + System.out.println("leaves.\n"); + Vector lvs = new Vector(); + trf.findLeaves(trf.root, lvs); + Enumeration lv = lvs.elements(); + while (lv.hasMoreElements()) { + BinaryNode leave = (BinaryNode) lv.nextElement(); + if (leave.getName() != null) { + System.out.println("Node:'" + leave.getName() + "'"); + } + } } catch (java.io.IOException e) { System.err.println("Exception\n" + e); e.printStackTrace(); } } + /** * Search for leaf nodes. - * - * @param node root node to search from - * @param leaves Vector of leaves to add leaf node objects too. - * + * + * @param node + * root node to search from + * @param leaves + * Vector of leaves to add leaf node objects too. + * * @return Vector of leaf nodes on binary tree */ - public Vector findLeaves(SequenceNode node, Vector leaves) - { - if (node == null) - { + public Vector findLeaves(SequenceNode node, Vector leaves) { + if (node == null) { return leaves; } - if ( (node.left() == null) && (node.right() == null)) // Interior node detection + if ((node.left() == null) && (node.right() == null)) // Interior node + // detection { leaves.addElement(node); return leaves; - } - else - { -/* TODO: Identify internal nodes... if (node.isSequenceLabel()) - { - leaves.addElement(node); - }*/ - findLeaves( (SequenceNode) node.left(), leaves); - findLeaves( (SequenceNode) node.right(), leaves); + } else { + /* + * TODO: Identify internal nodes... if (node.isSequenceLabel()) { + * leaves.addElement(node); } + */ + findLeaves((SequenceNode) node.left(), leaves); + findLeaves((SequenceNode) node.right(), leaves); } return leaves; } /** - * make tree node vector from a newick tree structure with associated vamsas objects + * make tree node vector from a newick tree structure with associated vamsas + * objects + * * @param ntree - * @return + * @return Treenode definitions for nodes with associated objects + */ + public Treenode[] makeTreeNodes() { + return makeTreeNodes(true); + } + + /** + * make treenode vector for a parsed tree with/out leaf node associations + * + * @param ignoreplaceholders + * if true means only associated nodes are returned + * @return treenode vector for associated or all leaves */ - public Treenode[] makeTreeNodes() { + public Treenode[] makeTreeNodes(boolean ignoreplaceholders) { Vector leaves = new Vector(); findLeaves(root, leaves); Vector tnv = new Vector(); Enumeration l = leaves.elements(); Hashtable nodespecs = new Hashtable(); - while (l.hasMoreElements()) - { + while (l.hasMoreElements()) { BinaryNode tnode = (BinaryNode) l.nextElement(); - if (tnode instanceof SequenceNode) - { - if (!((SequenceNode) tnode).isPlaceholder()) - { + if (tnode instanceof SequenceNode) { + if (!(ignoreplaceholders && ((SequenceNode) tnode).isPlaceholder())) { Object assocseq = ((SequenceNode) tnode).element(); - if (assocseq instanceof Vobject) - { + if (assocseq instanceof Vobject) { Vobject vobj = (Vobject) assocseq; - if (vobj!=null) - { + if (vobj != null) { Treenode node = new Treenode(); node.setNodespec(makeNodeSpec(nodespecs, tnode)); node.setName(tnode.getName()); @@ -814,35 +865,35 @@ public class NewickFile { vr.addRefs(vobj); node.addVref(vr); tnv.addElement(node); - } - else - { - System.err.println("WARNING: Unassociated treeNode "+tnode.element().toString()+" " - +((tnode.getName()!=null) ? " label "+tnode.getName() : "")); + } else { + System.err.println("WARNING: Unassociated treeNode " + + tnode.element().toString() + + " " + + ((tnode.getName() != null) ? " label " + tnode.getName() + : "")); } } } } } - if (tnv.size()>0) - { + if (tnv.size() > 0) { Treenode[] tn = new Treenode[tnv.size()]; - tnv.copyInto(tn); + tnv.copyInto(tn); return tn; } return new Treenode[] {}; } - private String makeNodeSpec(Hashtable nodespecs, BinaryNode tnode) - { + + private String makeNodeSpec(Hashtable nodespecs, BinaryNode tnode) { String nname = new String(tnode.getName()); Integer nindx = (Integer) nodespecs.get(nname); - if (nindx==null) - { + if (nindx == null) { nindx = new Integer(1); } - nname = nindx.toString()+" "+nname; + nname = nindx.toString() + " " + nname; return nname; } + /** * call to match up Treenode specs to NJTree parsed from document object. * @@ -851,85 +902,78 @@ public class NewickFile { * as returned from NJTree.findLeaves( .., ..) .. * @return */ - private BinaryNode findNodeSpec(String nodespec, Vector leaves) - { - int occurence=-1; - String nspec = nodespec.substring(nodespec.indexOf(' ')+1); + private BinaryNode findNodeSpec(String nodespec, Vector leaves) { + int occurence = -1; + String nspec = nodespec.substring(nodespec.indexOf(' ') + 1); String oval = nodespec.substring(0, nodespec.indexOf(' ')); try { occurence = new Integer(oval).intValue(); - } - catch (Exception e) - { - System.err.println("Invalid nodespec '"+nodespec+"'"); + } catch (Exception e) { + System.err.println("Invalid nodespec '" + nodespec + "'"); return null; } BinaryNode bn = null; - + int nocc = 0; Enumeration en = leaves.elements(); - while (en.hasMoreElements() && nocc