import jalview.datamodel.*;
/**
- * DOCUMENT ME!
- *
- * @author $author$
+ * Parse a New Hampshire style tree
+ * Caveats: NHX files are NOT supported and the tree distances and topology are unreliable when they are parsed.
+ * @author Jim Procter
* @version $Revision$
*/
public class NewickFile
// File IO Flags
boolean ReplaceUnderscores = false;
- boolean printRootInfo = false;
+ boolean printRootInfo = true;
private com.stevesoft.pat.Regex[] NodeSafeName = new com.stevesoft.pat.Regex[]
{
new com.stevesoft.pat.Regex().perlCode("m/[\\[,:'()]/"), // test for requiring quotes
}
/**
- * DOCUMENT ME!
+ * Parse the file source as a Newick file (New Hampshire and/or extended)
*
- * @throws IOException DOCUMENT ME!
+ * @throws IOException with a line number and character position for badly formatted NH strings
*/
public void parse()
throws IOException
String nodename = null;
float DefDistance = (float) 0.001; // @param Default distance for a node - very very small
- int DefBootstrap = 0; // @param Default bootstrap for a node
+ int DefBootstrap = -1; // @param Default bootstrap for a node
float distance = DefDistance;
int bootstrap = DefBootstrap;
com.stevesoft.pat.Regex majorsyms = new com.stevesoft.pat.Regex(
"[(\\['),;]");
+ int nextcp=0;
+ int ncp = cp;
while (majorsyms.searchFrom(nf, cp) && (Error == null))
{
int fcp = majorsyms.matchedFrom();
-
- switch (nf.charAt(fcp))
+ char schar;
+ switch (schar=nf.charAt(fcp))
{
- case '[': // Comment or structured/extended NH format info
-
- com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex(
- "]");
-
- if (comment.searchFrom(nf, fcp))
- {
- // Skip the comment field
- cp = 1 + comment.matchedFrom();
- }
- else
- {
- Error = ErrorStringrange(Error, "Unterminated comment", 3,
- fcp, nf);
- }
-
- ;
-
- break;
-
case '(':
// ascending should not be set
break;
- case ';':
-
- if (d != -1)
+ default:
+ if (schar==';')
{
- Error = ErrorStringrange(Error,
+ if (d != -1)
+ {
+ Error = ErrorStringrange(Error,
"Wayward semicolon (depth=" + d + ")", 7,
fcp, nf);
+ }
+ // cp advanced at the end of default
}
+ if (schar == '[')
+ {
+ // node string contains Comment or structured/extended NH format info
+ /* if ((fcp-cp>1 && nf.substring(cp,fcp).trim().length()>1))
+ {
+ // will process in remains System.err.println("skipped text: '"+nf.substring(cp,fcp)+"'");
+ }
+ */
+ // verify termination.
+ com.stevesoft.pat.Regex comment = new com.stevesoft.pat.Regex(
+ "]");
+ if (comment.searchFrom(nf, fcp))
+ {
+ // Skip the comment field
+ nextcp=comment.matchedFrom()+1;
+ warningMessage = "Tree file contained comments which may confuse input algorithm.";
+ break;
+
+ // cp advanced at the end of default to nextcp, ncp is unchanged so any node info can be read.
+ }
+ else
+ {
+ Error = ErrorStringrange(Error, "Unterminated comment", 3,
+ fcp, nf);
+ }
- // cp advanced at the end of default
- default:
-
+ ;
+ }
// Parse simpler field strings
- String fstring = nf.substring(cp, fcp);
+ String fstring = nf.substring(ncp, fcp);
+ // remove any comments before we parse the node info
+ // TODO: test newick file with quoted square brackets in node name (is this allowed?)
+ while (fstring.indexOf(']')>-1)
+ {
+ int cstart=fstring.indexOf('[');
+ int cend=fstring.indexOf(']');
+ String comment = fstring.substring(cstart+1,cend);
+ fstring = fstring.substring(0, cstart)+fstring.substring(cend+1);
+
+ }
com.stevesoft.pat.Regex uqnodename = new com.stevesoft.pat.Regex(
"\\b([^' :;\\](),]+)");
com.stevesoft.pat.Regex nbootstrap = new com.stevesoft.pat.Regex(
- "\\S+([0-9+]+)\\S*:");
+ "\\s*([0-9+]+)\\s*:");
com.stevesoft.pat.Regex ndist = new com.stevesoft.pat.Regex(
":([-0-9Ee.+]+)");
}
}
- if (nbootstrap.search(fstring) &&
- (nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) +
- uqnodename.stringMatched().length())))
+ if (nbootstrap.search(fstring))
+ {
+ if (nbootstrap.stringMatched(1).equals(uqnodename.stringMatched(1)))
+ {
+ nodename=null; // no nodename here.
+ }
+ if (nodename==null || nodename.length()==0 || nbootstrap.matchedFrom(1) > (uqnodename.matchedFrom(1) +
+ uqnodename.stringMatched().length()))
{
try
{
{
Error = ErrorStringrange(Error,
"Can't parse bootstrap value", 4,
- cp + nbootstrap.matchedFrom(), nf);
+ ncp + nbootstrap.matchedFrom(), nf);
}
}
+ }
boolean nodehasdistance = false;
{
Error = ErrorStringrange(Error,
"Can't parse node distance value", 7,
- cp + ndist.matchedFrom(), nf);
+ ncp + ndist.matchedFrom(), nf);
}
}
}
}
}
-
- // else : We do nothing if ';' is encountered.
}
// Reset new node properties to obvious fakes
nodename = null;
distance = DefDistance;
bootstrap = DefBootstrap;
-
- cp = fcp + 1;
+ }
+ if (nextcp==0)
+ {
+ ncp = cp = fcp + 1;
+ }
+ else {
+ cp=nextcp;
+ nextcp=0;
}
}
{
return ( (c.getName() == null) ? "" : nodeName(c.getName())) +
( (HasBootstrap)
- ? ( (c.getBootstrap() > -1) ? (" " + c.getBootstrap()) : "") : "") +
+ ? ( (c.getBootstrap() > -1) ? ((c.getName()!=null ? " " : "")+ c.getBootstrap()) : "") : "") +
( (HasDistances) ? (":" + c.dist) : "");
}
return (printRootInfo)
? ( ( (root.getName() == null) ? "" : nodeName(root.getName())) +
( (HasBootstrap)
- ? ( (root.getBootstrap() > -1) ? (" " + root.getBootstrap()) : "") :
+ ? ( (root.getBootstrap() > -1) ? ((root.getName()!=null ? " " : "")+
+ + root.getBootstrap()) : "") :
"") +
( (RootHasDistance) ? (":" + root.dist) : "")) : "";
}