// Copyright (C) 2008-2009 Christian M. Zmasek
// Copyright (C) 2008-2009 Burnham Institute for Medical Research
// All rights reserved
-//
+//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
-//
+//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
import java.util.regex.Pattern;
import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
+import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Accession;
import org.forester.phylogeny.data.Annotation;
+import org.forester.phylogeny.data.Confidence;
import org.forester.phylogeny.data.DomainArchitecture;
import org.forester.phylogeny.data.Event;
import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.PhylogenyDataUtil;
import org.forester.phylogeny.data.PropertiesMap;
import org.forester.phylogeny.data.Property;
import org.forester.phylogeny.data.Sequence;
public final class NHXParser implements PhylogenyParser {
- public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true;
- public static final ForesterUtil.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = ForesterUtil.TAXONOMY_EXTRACTION.NO;
- final static private boolean GUESS_ROOTEDNESS_DEFAULT = true;
- final static private boolean GUESS_IF_SUPPORT_VALUES = true;
- final static private boolean IGNORE_QUOTES_DEFAULT = false;
- final static public boolean REPLACE_UNDERSCORES_DEFAULT = false;
- private boolean _saw_closing_paren;
- final static private byte STRING = 0;
- final static private byte STRING_BUFFER = 1;
- final static private byte CHAR_ARRAY = 2;
- final static private byte BUFFERED_READER = 3;
- private boolean _guess_rootedness;
- private boolean _has_next;
- private boolean _ignore_quotes;
- private byte _input_type;
- private int _source_length;
- private PhylogenyNode _current_node;
- private StringBuilder _current_anotation;
- private Object _nhx_source;
- private int _clade_level;
- private List<Phylogeny> _phylogenies;
- private Phylogeny _current_phylogeny;
- private ForesterUtil.TAXONOMY_EXTRACTION _taxonomy_extraction;
- private boolean _replace_underscores;
- public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern
- .compile( "^[A-Z0-9]+$" );
- public final static Pattern NUMBERS_ONLY_PATTERN = Pattern
- .compile( "^[0-9]+$" );
+ // Constant naming the default taxonomy-extraction mode: NO.
+ public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
+ final static private boolean GUESS_ROOTEDNESS_DEFAULT = true;
+ // When true, finishPhylogeny() moves branch lengths that look like bootstrap values
+ // (per isBranchLengthsLikeBootstrapValues) into confidence values.
+ final static private boolean GUESS_IF_SUPPORT_VALUES = true;
+ final static private boolean IGNORE_QUOTES_DEFAULT = false;
+ final static public boolean REPLACE_UNDERSCORES_DEFAULT = false;
+ private boolean _saw_closing_paren;
+ // Codes for the kind of input source; presumably the values held in _input_type -- confirm against setSource().
+ final static private byte STRING = 0;
+ final static private byte STRING_BUFFER = 1;
+ final static private byte CHAR_ARRAY = 2;
+ final static private byte BUFFERED_READER = 3;
+ private boolean _guess_rootedness;
+ private boolean _has_next;
+ private boolean _ignore_quotes;
+ private byte _input_type;
+ private int _source_length;
+ private PhylogenyNode _current_node;
+ // Presumably the text returned by getCurrentAnotation(), which is handed to parseNHX() when a node is finished.
+ private StringBuilder _current_anotation;
+ private Object _nhx_source;
+ private int _clade_level;
+ private List<Phylogeny> _phylogenies;
+ private Phylogeny _current_phylogeny;
+ // Mode passed to parseNHX(); read and written through getTaxonomyExtraction()/setTaxonomyExtraction().
+ private PhylogenyMethods.TAXONOMY_EXTRACTION _taxonomy_extraction;
+ private boolean _replace_underscores;
+ // Matches a non-empty string made up solely of upper-case ASCII letters and digits.
+ public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern
+ .compile( "^[A-Z0-9]+$" );
+ // Matches a non-empty string made up solely of digits and dots. NOTE(review): this also
+ // accepts strings that are not numbers, such as "." or "1.2.3" -- confirm callers cope.
+ public final static Pattern NUMBERS_ONLY_PATTERN = Pattern
+ .compile( "^[0-9\\.]+$" );
+ // The three MrBayes patterns below capture everything after the key up to the next comma
+ // or the end of the input; they are used by processMrBayes3Data().
+ public final static Pattern MB_PROB_PATTERN = Pattern
+ .compile( "prob=([^,]+)" );
+ public final static Pattern MB_PROB_SD_PATTERN = Pattern
+ .compile( "prob_stddev=([^,]+)" );
+ public final static Pattern MB_BL_PATTERN = Pattern
+ .compile( "length_median=([^,]+)" );
public NHXParser() {
init();
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException {
+ private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
setCladeLevel( 0 );
if ( getCurrentPhylogeny() != null ) {
parseNHX( getCurrentAnotation().toString(),
isReplaceUnderscores() );
if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) {
if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) {
- NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() );
+ NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() );
}
}
if ( isGuessRootedness() ) {
}
}
- private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException {
+ private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException,
+ PhyloXmlDataFormatException {
setCladeLevel( 0 );
final PhylogenyNode new_node = new PhylogenyNode();
parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
return _source_length;
}
- public ForesterUtil.TAXONOMY_EXTRACTION getTaxonomyExtraction() {
+ // Returns the taxonomy-extraction mode currently stored in _taxonomy_extraction;
+ // this is the value the node-finishing methods pass on to parseNHX().
+ public PhylogenyMethods.TAXONOMY_EXTRACTION getTaxonomyExtraction() {
return _taxonomy_extraction;
}
* @throws NHXFormatException
* @throws PhylogenyParserException
*/
+ @Override
public Phylogeny[] parse() throws IOException, NHXFormatException {
setHasNext( false );
boolean in_comment = false;
boolean saw_colon = false;
boolean saw_open_bracket = false;
+ boolean in_open_bracket = false;
boolean in_double_quote = false;
boolean in_single_quote = false;
setPhylogenies( new ArrayList<Phylogeny>() );
&& ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) {
saw_colon = false;
}
+ if ( in_open_bracket && ( c == ']' ) ) {
+ in_open_bracket = false;
+ }
}
// \n\t is always ignored,
// as is " (34) and ' (39) (space is 32):
}
else if ( c == '[' ) {
saw_open_bracket = true;
+ in_open_bracket = true;
}
else if ( saw_open_bracket ) {
if ( c != ']' ) {
// comment consisting just of "[]":
saw_open_bracket = false;
}
- else if ( c == '(' ) {
+ else if ( ( c == '(' ) && !in_open_bracket ) {
processOpenParen();
}
- else if ( c == ')' ) {
+ else if ( ( c == ')' ) && !in_open_bracket ) {
processCloseParen();
}
- else if ( c == ',' ) {
+ else if ( ( c == ',' ) && !in_open_bracket ) {
processComma();
}
else {
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void processCloseParen() throws PhylogenyParserException, NHXFormatException {
+ private void processCloseParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
decreaseCladeLevel();
if ( !isSawClosingParen() ) {
final PhylogenyNode new_node = new PhylogenyNode();
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void processComma() throws PhylogenyParserException, NHXFormatException {
+ private void processComma() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
if ( !isSawClosingParen() ) {
final PhylogenyNode new_node = new PhylogenyNode();
parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void processOpenParen() throws PhylogenyParserException, NHXFormatException {
+ private void processOpenParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
final PhylogenyNode new_node = new PhylogenyNode();
if ( getCladeLevel() == 0 ) {
if ( getCurrentPhylogeny() != null ) {
* @throws IOException
* @throws PhylogenyParserException
*/
+ @Override
public void setSource( final Object nhx_source ) throws PhylogenyParserException, IOException {
if ( nhx_source == null ) {
throw new PhylogenyParserException( getClass() + ": attempt to parse null object." );
_source_length = source_length;
}
- public void setTaxonomyExtraction( final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction ) {
+ // Stores the taxonomy-extraction mode without validating it. Note that parseNHX()
+ // throws IllegalArgumentException when a mode other than NO is combined with
+ // underscore replacement.
+ public void setTaxonomyExtraction( final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction ) {
_taxonomy_extraction = taxonomy_extraction;
}
return true;
}
- private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) {
+ // Visits every node of p in post-order, hands the node's distance to parent to
+ // PhylogenyMethods.setBootstrapConfidence(), and then resets the distance to
+ // PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT. Called from finishPhylogeny() when
+ // isBranchLengthsLikeBootstrapValues() reports true for the phylogeny.
+ private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) {
final PhylogenyNodeIterator it = p.iteratorPostorder();
while ( it.hasNext() ) {
final PhylogenyNode n = it.next();
PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() );
- n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT );
+ n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT );
}
}
public static void parseNHX( String s,
final PhylogenyNode node_to_annotate,
- final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction,
- final boolean replace_underscores ) throws NHXFormatException {
- if ( ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
+ final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction,
+ final boolean replace_underscores ) throws NHXFormatException,
+ PhyloXmlDataFormatException {
+ if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" );
}
if ( ( s != null ) && ( s.length() > 0 ) ) {
if ( replace_underscores ) {
s = s.replaceAll( "_+", " " );
}
- int ob = 0;
- int cb = 0;
- String a = "";
- String b = "";
- StringTokenizer t = null;
boolean is_nhx = false;
- ob = s.indexOf( "[" );
- cb = s.indexOf( "]" );
+ final int ob = s.indexOf( "[" );
if ( ob > -1 ) {
- a = "";
- b = "";
+ String b = "";
is_nhx = true;
+ final int cb = s.indexOf( "]" );
if ( cb < 0 ) {
- throw new NHXFormatException( "error in NHX formatted data: no closing \"]\"" );
+ throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" );
}
if ( s.indexOf( "&&NHX" ) == ( ob + 1 ) ) {
b = s.substring( ob + 6, cb );
if ( numbers_only.matches() ) {
b = ":" + NHXtags.SUPPORT + bracketed;
}
+ else if ( s.indexOf( "prob=" ) > -1 ) {
+ processMrBayes3Data( s, node_to_annotate );
+ }
}
- a = s.substring( 0, ob );
- s = a + b;
+ s = s.substring( 0, ob ) + b;
if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) {
throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" );
}
}
- t = new StringTokenizer( s, ":" );
- if ( t.countTokens() >= 1 ) {
+ final StringTokenizer t = new StringTokenizer( s, ":" );
+ if ( t.countTokens() > 0 ) {
if ( !s.startsWith( ":" ) ) {
node_to_annotate.setName( t.nextToken() );
if ( !replace_underscores
- && ( !is_nhx && ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.NO ) ) ) {
- final String tax = ForesterUtil
- .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
- LIMIT_SPECIES_NAMES_TO_FIVE_CHARS,
- taxonomy_extraction );
+ && ( !is_nhx && ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) ) ) {
+ final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
+ taxonomy_extraction );
if ( !ForesterUtil.isEmpty( tax ) ) {
if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
if ( !node_to_annotate.getNodeData().isHasSequence() ) {
node_to_annotate.getNodeData().setSequence( new Sequence() );
}
- node_to_annotate.getNodeData().getSequence().setDomainArchitecture( new DomainArchitecture( s
- .substring( 3 ) ) );
+ node_to_annotate.getNodeData().getSequence()
+ .setDomainArchitecture( new DomainArchitecture( s.substring( 3 ) ) );
}
else if ( s.startsWith( NHXtags.NODE_IDENTIFIER ) ) {
node_to_annotate.getNodeData().setNodeIdentifier( new Identifier( s.substring( 3 ) ) );
node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) );
}
else if ( s.indexOf( '=' ) < 0 ) {
- if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) {
+ if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) {
throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
+ "\"" + s + "\"" );
}
}
}
+    /**
+     * Extracts MrBayes 3 annotations from a node's annotation string and stores
+     * them on the node.
+     * <p>
+     * Looks for "prob=", "prob_stddev=" and "length_median=" entries (see
+     * MB_PROB_PATTERN, MB_PROB_SD_PATTERN and MB_BL_PATTERN). A non-negative
+     * probability is added to the branch data as a "posterior probability"
+     * confidence, together with its standard deviation when a non-negative one
+     * is present. A non-negative median branch length becomes the node's
+     * distance to parent. Entries that are absent or negative are ignored.
+     *
+     * @param s
+     *            the annotation text to search; parseNHX() passes it with the
+     *            enclosing square brackets still attached
+     * @param node_to_annotate
+     *            the node receiving the confidence and the branch length
+     * @throws NHXFormatException
+     *             if an entry is present but its value is not a parsable double
+     */
+    private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate )
+            throws NHXFormatException {
+        // Order matters: the standard deviation is parsed (and may fail) before the
+        // probability, and the confidence is added before the branch length is parsed.
+        final double sd = parseMrBayesValue( MB_PROB_SD_PATTERN, s, "probability standard deviation" );
+        final double prob = parseMrBayesValue( MB_PROB_PATTERN, s, "probability" );
+        if ( prob >= 0.0 ) {
+            if ( sd >= 0.0 ) {
+                node_to_annotate.getBranchData()
+                        .addConfidence( new Confidence( prob, "posterior probability", sd ) );
+            }
+            else {
+                node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) );
+            }
+        }
+        final double bl = parseMrBayesValue( MB_BL_PATTERN, s, "median branch length" );
+        if ( bl >= 0.0 ) {
+            node_to_annotate.setDistanceToParent( bl );
+        }
+    }
+
+    /**
+     * Finds the first match of pattern in s and parses its first capture group
+     * as a double.
+     *
+     * @param pattern
+     *            a pattern whose group 1 captures the value text
+     * @param s
+     *            the annotation text to search
+     * @param description
+     *            what is being parsed, used only in the error message
+     * @return the parsed value, or -1 when the pattern does not occur in s
+     * @throws NHXFormatException
+     *             if the captured text is not a parsable double
+     */
+    private static double parseMrBayesValue( final Pattern pattern, final String s, final String description )
+            throws NHXFormatException {
+        final Matcher matcher = pattern.matcher( s );
+        if ( !matcher.find() ) {
+            // Same "absent" sentinel the callers already treat as "ignore".
+            return -1;
+        }
+        String value = matcher.group( 1 );
+        // The patterns capture up to the next comma or the end of the input, so a value
+        // that is the last entry inside the brackets arrives with the closing "]" (and
+        // anything after it) attached; cut it off before parsing.
+        final int closing_bracket = value.indexOf( ']' );
+        if ( closing_bracket > -1 ) {
+            value = value.substring( 0, closing_bracket );
+        }
+        try {
+            return Double.parseDouble( value );
+        }
+        catch ( final NumberFormatException e ) {
+            throw new NHXFormatException( "failed to parse " + description + " (Mr Bayes output) from \"" + s
+                    + "\"" );
+        }
+    }
+
/**
* Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green,
* and blue and returns the corresponding Color.