import java.util.regex.Pattern;
import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
import org.forester.io.parsers.util.ParserUtils;
import org.forester.io.parsers.util.PhylogenyParserException;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Accession;
import org.forester.phylogeny.data.Annotation;
+import org.forester.phylogeny.data.Confidence;
import org.forester.phylogeny.data.DomainArchitecture;
import org.forester.phylogeny.data.Event;
import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.PhylogenyDataUtil;
import org.forester.phylogeny.data.PropertiesMap;
import org.forester.phylogeny.data.Property;
import org.forester.phylogeny.data.Sequence;
public final class NHXParser implements PhylogenyParser {
- public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true;
- public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
- final static private boolean GUESS_ROOTEDNESS_DEFAULT = true;
- final static private boolean GUESS_IF_SUPPORT_VALUES = true;
- final static private boolean IGNORE_QUOTES_DEFAULT = false;
- final static public boolean REPLACE_UNDERSCORES_DEFAULT = false;
+ public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
+ final static private boolean GUESS_ROOTEDNESS_DEFAULT = true;
+ final static private boolean GUESS_IF_SUPPORT_VALUES = true;
+ final static private boolean IGNORE_QUOTES_DEFAULT = false;
+ final static public boolean REPLACE_UNDERSCORES_DEFAULT = false;
private boolean _saw_closing_paren;
- final static private byte STRING = 0;
- final static private byte STRING_BUFFER = 1;
- final static private byte CHAR_ARRAY = 2;
- final static private byte BUFFERED_READER = 3;
+ final static private byte STRING = 0;
+ final static private byte STRING_BUFFER = 1;
+ final static private byte CHAR_ARRAY = 2;
+ final static private byte BUFFERED_READER = 3;
private boolean _guess_rootedness;
private boolean _has_next;
private boolean _ignore_quotes;
private Phylogeny _current_phylogeny;
private PhylogenyMethods.TAXONOMY_EXTRACTION _taxonomy_extraction;
private boolean _replace_underscores;
- public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern
- .compile( "^[A-Z0-9]+$" );
- public final static Pattern NUMBERS_ONLY_PATTERN = Pattern
- .compile( "^[0-9]+$" );
+ public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern
+ .compile( "^[A-Z0-9]+$" );
+ public final static Pattern NUMBERS_ONLY_PATTERN = Pattern
+ .compile( "^[0-9\\.]+$" );
+ public final static Pattern MB_PROB_PATTERN = Pattern
+ .compile( "prob=([^,]+)" );
+ public final static Pattern MB_PROB_SD_PATTERN = Pattern
+ .compile( "prob_stddev=([^,]+)" );
+ public final static Pattern MB_BL_PATTERN = Pattern
+ .compile( "length_median=([^,]+)" );
public NHXParser() {
init();
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException {
+ private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
setCladeLevel( 0 );
if ( getCurrentPhylogeny() != null ) {
parseNHX( getCurrentAnotation().toString(),
isReplaceUnderscores() );
if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) {
if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) {
- NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() );
+ NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() );
}
}
if ( isGuessRootedness() ) {
}
}
- private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException {
+ private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException,
+ PhyloXmlDataFormatException {
setCladeLevel( 0 );
final PhylogenyNode new_node = new PhylogenyNode();
parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
&& ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) {
saw_colon = false;
}
- }
- if ( in_open_bracket && c == ']' ) {
- in_open_bracket = false;
+ if ( in_open_bracket && ( c == ']' ) ) {
+ in_open_bracket = false;
+ }
}
// \n\t is always ignored,
// as is " (34) and ' (39) (space is 32):
// comment consisting just of "[]":
saw_open_bracket = false;
}
- else if ( c == '(' && !in_open_bracket ) {
+ else if ( ( c == '(' ) && !in_open_bracket ) {
processOpenParen();
}
- else if ( c == ')' && !in_open_bracket ) {
+ else if ( ( c == ')' ) && !in_open_bracket ) {
processCloseParen();
}
- else if ( c == ',' && !in_open_bracket ) {
+ else if ( ( c == ',' ) && !in_open_bracket ) {
processComma();
}
else {
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void processCloseParen() throws PhylogenyParserException, NHXFormatException {
+ private void processCloseParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
decreaseCladeLevel();
if ( !isSawClosingParen() ) {
final PhylogenyNode new_node = new PhylogenyNode();
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void processComma() throws PhylogenyParserException, NHXFormatException {
+ private void processComma() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
if ( !isSawClosingParen() ) {
final PhylogenyNode new_node = new PhylogenyNode();
parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
*
* @throws PhylogenyParserException
* @throws NHXFormatException
+ * @throws PhyloXmlDataFormatException
*/
- private void processOpenParen() throws PhylogenyParserException, NHXFormatException {
+ private void processOpenParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
final PhylogenyNode new_node = new PhylogenyNode();
if ( getCladeLevel() == 0 ) {
if ( getCurrentPhylogeny() != null ) {
return true;
}
- private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) {
+ /**
+ * Reinterprets the branch lengths of a phylogeny as confidence values.
+ * <p>
+ * For every node supplied by p.iteratorPostorder(), the node's current
+ * distance to parent is handed to
+ * PhylogenyMethods.setBootstrapConfidence(), and the distance to parent
+ * is then reset to PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT.
+ *
+ * @param p
+ * the phylogeny whose nodes are modified in place
+ */
+ private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) {
final PhylogenyNodeIterator it = p.iteratorPostorder();
while ( it.hasNext() ) {
final PhylogenyNode n = it.next();
PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() );
- n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT );
+ n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT );
}
}
public static void parseNHX( String s,
final PhylogenyNode node_to_annotate,
final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction,
- final boolean replace_underscores ) throws NHXFormatException {
- System.out.println( s );
- System.out.println();
+ final boolean replace_underscores ) throws NHXFormatException,
+ PhyloXmlDataFormatException {
if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" );
}
if ( replace_underscores ) {
s = s.replaceAll( "_+", " " );
}
- int ob = 0;
- int cb = 0;
- String a = "";
- String b = "";
- StringTokenizer t = null;
boolean is_nhx = false;
- ob = s.indexOf( "[" );
- cb = s.indexOf( "]" );
+ final int ob = s.indexOf( "[" );
if ( ob > -1 ) {
- a = "";
- b = "";
+ String b = "";
is_nhx = true;
+ final int cb = s.indexOf( "]" );
if ( cb < 0 ) {
throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" );
}
if ( numbers_only.matches() ) {
b = ":" + NHXtags.SUPPORT + bracketed;
}
+ else if ( s.indexOf( "prob=" ) > -1 ) {
+ processMrBayes3Data( s, node_to_annotate );
+ }
}
- a = s.substring( 0, ob );
- s = a + b;
+ s = s.substring( 0, ob ) + b;
if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) {
throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" );
}
}
- t = new StringTokenizer( s, ":" );
+ final StringTokenizer t = new StringTokenizer( s, ":" );
if ( t.countTokens() > 0 ) {
if ( !s.startsWith( ":" ) ) {
node_to_annotate.setName( t.nextToken() );
if ( !replace_underscores
&& ( !is_nhx && ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) ) ) {
- final String tax = ParserUtils
- .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
- LIMIT_SPECIES_NAMES_TO_FIVE_CHARS,
- taxonomy_extraction );
+ final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
+ taxonomy_extraction );
if ( !ForesterUtil.isEmpty( tax ) ) {
if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
}
while ( t.hasMoreTokens() ) {
s = t.nextToken();
- System.out.println( "=>" + s );
- System.out.println();
if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.SPECIES_NAME ) ) {
if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) );
}
else if ( s.indexOf( '=' ) < 0 ) {
- if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) {
+ if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) {
throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
+ "\"" + s + "\"" );
}
}
}
+ /**
+ * Extracts MrBayes-style annotations from a node string and applies them
+ * to the given node.
+ * <p>
+ * Three values are searched for, each with its own pattern:
+ * <ul>
+ * <li>MB_PROB_SD_PATTERN ("prob_stddev=..."): parsed into a local standard
+ * deviation value;</li>
+ * <li>MB_PROB_PATTERN ("prob=..."): if found and non-negative, added to the
+ * node's branch data as a Confidence of type "posterior probability",
+ * with the standard deviation as third constructor argument when one
+ * was found and is non-negative;</li>
+ * <li>MB_BL_PATTERN ("length_median=..."): if found and non-negative, set
+ * as the node's distance to parent.</li>
+ * </ul>
+ * Values that are absent from s leave the node untouched.
+ * <p>
+ * NOTE(review): each pattern captures "[^,]+", i.e. everything up to the
+ * next comma or the end of s. The visible caller passes a string that
+ * still contains the closing "]", so a value that is the last entry inside
+ * the brackets would presumably be captured together with "]" and fail to
+ * parse -- confirm against real MrBayes output.
+ *
+ * @param s
+ * the string to search for MrBayes annotations
+ * @param node_to_annotate
+ * the node that receives the confidence and/or branch length
+ * @throws NHXFormatException
+ * if a matched value cannot be parsed as a double; the
+ * underlying NumberFormatException is not chained
+ */
+ private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate )
+ throws NHXFormatException {
+ // -1 acts as the "not present" sentinel; only values >= 0.0 are used below.
+ double sd = -1;
+ final Matcher mb_prob_sd_matcher = MB_PROB_SD_PATTERN.matcher( s );
+ if ( mb_prob_sd_matcher.find() ) {
+ try {
+ sd = Double.parseDouble( mb_prob_sd_matcher.group( 1 ) );
+ }
+ catch ( final NumberFormatException e ) {
+ throw new NHXFormatException( "failed to parse probability standard deviation (Mr Bayes output) from \""
+ + s + "\"" );
+ }
+ }
+ // The standard deviation is read first so that it is available when the
+ // Confidence for the probability is constructed.
+ final Matcher mb_prob_matcher = MB_PROB_PATTERN.matcher( s );
+ if ( mb_prob_matcher.find() ) {
+ double prob = -1;
+ try {
+ prob = Double.parseDouble( mb_prob_matcher.group( 1 ) );
+ }
+ catch ( final NumberFormatException e ) {
+ throw new NHXFormatException( "failed to parse probability (Mr Bayes output) from \"" + s + "\"" );
+ }
+ // A negative parsed probability is silently ignored (no confidence is added).
+ if ( prob >= 0.0 ) {
+ if ( sd >= 0.0 ) {
+ node_to_annotate.getBranchData()
+ .addConfidence( new Confidence( prob, "posterior probability", sd ) );
+ }
+ else {
+ node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) );
+ }
+ }
+ }
+ final Matcher mb_bl_matcher = MB_BL_PATTERN.matcher( s );
+ if ( mb_bl_matcher.find() ) {
+ double bl = -1;
+ try {
+ bl = Double.parseDouble( mb_bl_matcher.group( 1 ) );
+ }
+ catch ( final NumberFormatException e ) {
+ throw new NHXFormatException( "failed to parse median branch length (Mr Bayes output) from \"" + s
+ + "\"" );
+ }
+ // A negative parsed branch length is silently ignored (distance to parent is left as is).
+ if ( bl >= 0.0 ) {
+ node_to_annotate.setDistanceToParent( bl );
+ }
+ }
+ }
+
/**
* Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green,
* and blue and returns the corresponding Color.