X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Fnhx%2FNHXParser.java;h=99a78bf7567aee88e28686c3656ea6939390c192;hb=1a6e8795d5cd4c9b137069cc47121689b83377ed;hp=47771ce9c1824da1249e1d071fe4e67c4b2c5dec;hpb=038c34792757a86f24296de5683e722fab3f9307;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index 47771ce..99a78bf 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -39,6 +39,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.forester.io.parsers.PhylogenyParser; +import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException; import org.forester.io.parsers.util.ParserUtils; import org.forester.io.parsers.util.PhylogenyParserException; import org.forester.phylogeny.Phylogeny; @@ -46,9 +47,11 @@ import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.data.DomainArchitecture; import org.forester.phylogeny.data.Event; import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PhylogenyDataUtil; import org.forester.phylogeny.data.PropertiesMap; import org.forester.phylogeny.data.Property; import org.forester.phylogeny.data.Sequence; @@ -58,34 +61,34 @@ import org.forester.util.ForesterUtil; public final class NHXParser implements PhylogenyParser { - public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true; - public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO; - final static private boolean GUESS_ROOTEDNESS_DEFAULT = true; - final static private boolean GUESS_IF_SUPPORT_VALUES = true; - final static private boolean IGNORE_QUOTES_DEFAULT = false; - final static public boolean REPLACE_UNDERSCORES_DEFAULT = false; - private boolean _saw_closing_paren; - final static private byte STRING = 0; - final static private byte STRING_BUFFER = 1; - final static private byte CHAR_ARRAY = 2; - final static private byte BUFFERED_READER = 3; - private boolean _guess_rootedness; - private boolean _has_next; - private boolean _ignore_quotes; - private byte _input_type; - private int _source_length; - private PhylogenyNode _current_node; - private StringBuilder _current_anotation; - private Object _nhx_source; - private int _clade_level; - private List _phylogenies; - private Phylogeny _current_phylogeny; - private PhylogenyMethods.TAXONOMY_EXTRACTION _taxonomy_extraction; - private boolean _replace_underscores; - public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern - .compile( "^[A-Z0-9]+$" ); - public final static Pattern NUMBERS_ONLY_PATTERN = Pattern - .compile( "^[0-9]+$" ); + public static final TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = NHXParser.TAXONOMY_EXTRACTION.NO; + final static private boolean GUESS_ROOTEDNESS_DEFAULT = true; + final static private boolean GUESS_IF_SUPPORT_VALUES = true; + final static private boolean IGNORE_QUOTES_DEFAULT = false; + final static public boolean REPLACE_UNDERSCORES_DEFAULT = false; + private boolean _saw_closing_paren; + final static private byte STRING = 0; + final static private byte STRING_BUFFER = 1; + final static private byte CHAR_ARRAY = 2; + final static private byte BUFFERED_READER = 3; + private boolean _guess_rootedness; + private boolean _has_next; + private boolean _ignore_quotes; + private byte _input_type; + private int _source_length; + private PhylogenyNode _current_node; + private StringBuilder _current_anotation; + private Object _nhx_source; + private int _clade_level; + private List _phylogenies; + private Phylogeny _current_phylogeny; + private NHXParser.TAXONOMY_EXTRACTION _taxonomy_extraction; + private boolean _replace_underscores; + public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern.compile( "^[A-Z0-9]+$" ); + public final static Pattern NUMBERS_ONLY_PATTERN = Pattern.compile( "^[0-9\\.]+$" ); + public final static Pattern MB_PROB_PATTERN = Pattern.compile( "prob=([^,]+)" ); + public final static Pattern MB_PROB_SD_PATTERN = Pattern.compile( "prob_stddev=([^,]+)" ); + public final static Pattern MB_BL_PATTERN = Pattern.compile( "length_median=([^,]+)" ); public NHXParser() { init(); @@ -110,8 +113,9 @@ public final class NHXParser implements PhylogenyParser { * * @throws PhylogenyParserException * @throws NHXFormatException + * @throws PhyloXmlDataFormatException */ - private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException { + private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException { setCladeLevel( 0 ); if ( getCurrentPhylogeny() != null ) { parseNHX( getCurrentAnotation().toString(), @@ -120,7 +124,7 @@ public final class NHXParser implements PhylogenyParser { isReplaceUnderscores() ); if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) { if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) { - NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() ); + NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() ); } } if ( isGuessRootedness() ) { @@ -134,7 +138,8 @@ public final class NHXParser implements PhylogenyParser { } } - private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException { + private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException, + PhyloXmlDataFormatException { setCladeLevel( 0 ); final PhylogenyNode new_node = new PhylogenyNode(); parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); @@ -188,7 +193,7 @@ public final class NHXParser implements PhylogenyParser { return _source_length; } - public PhylogenyMethods.TAXONOMY_EXTRACTION getTaxonomyExtraction() { + public NHXParser.TAXONOMY_EXTRACTION getTaxonomyExtraction() { return _taxonomy_extraction; } @@ -256,11 +261,14 @@ public final class NHXParser implements PhylogenyParser { boolean in_comment = false; boolean saw_colon = false; boolean saw_open_bracket = false; + boolean in_open_bracket = false; boolean in_double_quote = false; boolean in_single_quote = false; setPhylogenies( new ArrayList() ); setCladeLevel( 0 ); newCurrentAnotation(); + setCurrentPhylogeny( null ); + setCurrentNode( null ); int i = 0; while ( true ) { char c = '\b'; @@ -299,6 +307,9 @@ public final class NHXParser implements PhylogenyParser { && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) { saw_colon = false; } + if ( in_open_bracket && ( c == ']' ) ) { + in_open_bracket = false; + } } // \n\t is always ignored, // as is " (34) and ' (39) (space is 32): @@ -338,6 +349,7 @@ public final class NHXParser implements PhylogenyParser { } else if ( c == '[' ) { saw_open_bracket = true; + in_open_bracket = true; } else if ( saw_open_bracket ) { if ( c != ']' ) { @@ -356,13 +368,13 @@ public final class NHXParser implements PhylogenyParser { // comment consisting just of "[]": saw_open_bracket = false; } - else if ( c == '(' ) { + else if ( ( c == '(' ) && !in_open_bracket ) { processOpenParen(); } - else if ( c == ')' ) { + else if ( ( c == ')' ) && !in_open_bracket ) { processCloseParen(); } - else if ( c == ',' ) { + else if ( ( c == ',' ) && !in_open_bracket ) { processComma(); } else { @@ -386,17 +398,14 @@ public final class NHXParser implements PhylogenyParser { return getPhylogeniesAsArray(); } // parse() - public Phylogeny parseNext() throws IOException, NHXFormatException { - return null; - } - /** * Called if a closing paren is encountered. * * @throws PhylogenyParserException * @throws NHXFormatException + * @throws PhyloXmlDataFormatException */ - private void processCloseParen() throws PhylogenyParserException, NHXFormatException { + private void processCloseParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException { decreaseCladeLevel(); if ( !isSawClosingParen() ) { final PhylogenyNode new_node = new PhylogenyNode(); @@ -422,8 +431,9 @@ public final class NHXParser implements PhylogenyParser { * * @throws PhylogenyParserException * @throws NHXFormatException + * @throws PhyloXmlDataFormatException */ - private void processComma() throws PhylogenyParserException, NHXFormatException { + private void processComma() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException { if ( !isSawClosingParen() ) { final PhylogenyNode new_node = new PhylogenyNode(); parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() ); @@ -447,8 +457,9 @@ public final class NHXParser implements PhylogenyParser { * * @throws PhylogenyParserException * @throws NHXFormatException + * @throws PhyloXmlDataFormatException */ - private void processOpenParen() throws PhylogenyParserException, NHXFormatException { + private void processOpenParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException { final PhylogenyNode new_node = new PhylogenyNode(); if ( getCladeLevel() == 0 ) { if ( getCurrentPhylogeny() != null ) { @@ -589,7 +600,7 @@ public final class NHXParser implements PhylogenyParser { _source_length = source_length; } - public void setTaxonomyExtraction( final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction ) { + public void setTaxonomyExtraction( final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction ) { _taxonomy_extraction = taxonomy_extraction; } @@ -598,7 +609,7 @@ public final class NHXParser implements PhylogenyParser { return Double.valueOf( str ).doubleValue(); } catch ( final NumberFormatException ex ) { - throw new NHXFormatException( "error in NH/NHX formatted data: failed to parse number from :" + "\"" + str + throw new NHXFormatException( "error in NH/NHX formatted data: failed to parse number from " + "\"" + str + "\"" ); } } @@ -618,40 +629,35 @@ public final class NHXParser implements PhylogenyParser { return true; } - private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) { + private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { final PhylogenyNodeIterator it = p.iteratorPostorder(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); - n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); } } public static void parseNHX( String s, final PhylogenyNode node_to_annotate, - final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction, - final boolean replace_underscores ) throws NHXFormatException { - if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) { + final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction, + final boolean replace_underscores ) throws NHXFormatException, + PhyloXmlDataFormatException { + if ( ( taxonomy_extraction != NHXParser.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) { throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" ); } if ( ( s != null ) && ( s.length() > 0 ) ) { if ( replace_underscores ) { s = s.replaceAll( "_+", " " ); } - int ob = 0; - int cb = 0; - String a = ""; - String b = ""; - StringTokenizer t = null; boolean is_nhx = false; - ob = s.indexOf( "[" ); - cb = s.indexOf( "]" ); + final int ob = s.indexOf( "[" ); if ( ob > -1 ) { - a = ""; - b = ""; + String b = ""; is_nhx = true; + final int cb = s.indexOf( "]" ); if ( cb < 0 ) { - throw new NHXFormatException( "error in NHX formatted data: no closing \"]\"" ); + throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" ); } if ( s.indexOf( "&&NHX" ) == ( ob + 1 ) ) { b = s.substring( ob + 6, cb ); @@ -663,23 +669,23 @@ public final class NHXParser implements PhylogenyParser { if ( numbers_only.matches() ) { b = ":" + NHXtags.SUPPORT + bracketed; } + else if ( s.indexOf( "prob=" ) > -1 ) { + processMrBayes3Data( s, node_to_annotate ); + } } - a = s.substring( 0, ob ); - s = a + b; + s = s.substring( 0, ob ) + b; if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) { throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" ); } } - t = new StringTokenizer( s, ":" ); - if ( t.countTokens() >= 1 ) { + final StringTokenizer t = new StringTokenizer( s, ":" ); + if ( t.countTokens() > 0 ) { if ( !s.startsWith( ":" ) ) { node_to_annotate.setName( t.nextToken() ); if ( !replace_underscores - && ( !is_nhx && ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) ) ) { - final String tax = ParserUtils - .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(), - LIMIT_SPECIES_NAMES_TO_FIVE_CHARS, - taxonomy_extraction ); + && ( !is_nhx && ( taxonomy_extraction != NHXParser.TAXONOMY_EXTRACTION.NO ) ) ) { + final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(), + taxonomy_extraction ); if ( !ForesterUtil.isEmpty( tax ) ) { if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) { node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() ); @@ -772,7 +778,7 @@ public final class NHXParser implements PhylogenyParser { node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) ); } else if ( s.indexOf( '=' ) < 0 ) { - if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) { throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:" + "\"" + s + "\"" ); } @@ -783,6 +789,54 @@ public final class NHXParser implements PhylogenyParser { } } + private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate ) + throws NHXFormatException { + double sd = -1; + final Matcher mb_prob_sd_matcher = MB_PROB_SD_PATTERN.matcher( s ); + if ( mb_prob_sd_matcher.find() ) { + try { + sd = Double.parseDouble( mb_prob_sd_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse probability standard deviation (Mr Bayes output) from \"" + + s + "\"" ); + } + } + final Matcher mb_prob_matcher = MB_PROB_PATTERN.matcher( s ); + if ( mb_prob_matcher.find() ) { + double prob = -1; + try { + prob = Double.parseDouble( mb_prob_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse probability (Mr Bayes output) from \"" + s + "\"" ); + } + if ( prob >= 0.0 ) { + if ( sd >= 0.0 ) { + node_to_annotate.getBranchData() + .addConfidence( new Confidence( prob, "posterior probability", sd ) ); + } + else { + node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) ); + } + } + } + final Matcher mb_bl_matcher = MB_BL_PATTERN.matcher( s ); + if ( mb_bl_matcher.find() ) { + double bl = -1; + try { + bl = Double.parseDouble( mb_bl_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse median branch length (Mr Bayes output) from \"" + s + + "\"" ); + } + if ( bl >= 0.0 ) { + node_to_annotate.setDistanceToParent( bl ); + } + } + } + /** * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green, * and blue and returns the corresponding Color. @@ -797,4 +851,8 @@ public final class NHXParser implements PhylogenyParser { final int blu = ForesterUtil.limitRangeForColor( Integer.parseInt( st.nextToken() ) ); return new Color( red, green, blu ); } + + public static enum TAXONOMY_EXTRACTION { + NO, YES, PFAM_STYLE_ONLY; + } }