X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Fnhx%2FNHXParser.java;h=c37310487ec240ce82a1d982b491929c1b362558;hb=44fddb76faa8975295b8b0ad38609256b5011ced;hp=47771ce9c1824da1249e1d071fe4e67c4b2c5dec;hpb=038c34792757a86f24296de5683e722fab3f9307;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index 47771ce..c373104 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -46,9 +46,11 @@ import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.data.DomainArchitecture; import org.forester.phylogeny.data.Event; import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PhylogenyDataUtil; import org.forester.phylogeny.data.PropertiesMap; import org.forester.phylogeny.data.Property; import org.forester.phylogeny.data.Sequence; @@ -58,7 +60,7 @@ import org.forester.util.ForesterUtil; public final class NHXParser implements PhylogenyParser { - public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true; + public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = false; public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO; final static private boolean GUESS_ROOTEDNESS_DEFAULT = true; final static private boolean GUESS_IF_SUPPORT_VALUES = true; @@ -85,7 +87,13 @@ public final class NHXParser implements PhylogenyParser { public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern .compile( "^[A-Z0-9]+$" ); public final static Pattern NUMBERS_ONLY_PATTERN = Pattern - .compile( "^[0-9]+$" ); + .compile( "^[0-9\\.]+$" ); + public final static Pattern MB_PROB_PATTERN = Pattern + .compile( "prob=([^,]+)" ); + public final static Pattern MB_PROB_SD_PATTERN = Pattern + .compile( "prob_stddev=([^,]+)" ); + public final static Pattern MB_BL_PATTERN = Pattern + .compile( "length_median=([^,]+)" ); public NHXParser() { init(); @@ -120,7 +128,7 @@ public final class NHXParser implements PhylogenyParser { isReplaceUnderscores() ); if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) { if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) { - NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() ); + NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() ); } } if ( isGuessRootedness() ) { @@ -256,6 +264,7 @@ public final class NHXParser implements PhylogenyParser { boolean in_comment = false; boolean saw_colon = false; boolean saw_open_bracket = false; + boolean in_open_bracket = false; boolean in_double_quote = false; boolean in_single_quote = false; setPhylogenies( new ArrayList() ); @@ -299,6 +308,9 @@ public final class NHXParser implements PhylogenyParser { && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) { saw_colon = false; } + if ( in_open_bracket && ( c == ']' ) ) { + in_open_bracket = false; + } } // \n\t is always ignored, // as is " (34) and ' (39) (space is 32): @@ -338,6 +350,7 @@ public final class NHXParser implements PhylogenyParser { } else if ( c == '[' ) { saw_open_bracket = true; + in_open_bracket = true; } else if ( saw_open_bracket ) { if ( c != ']' ) { @@ -356,13 +369,13 @@ public final class NHXParser implements PhylogenyParser { // comment consisting just of "[]": saw_open_bracket = false; } - else if ( c == '(' ) { + else if ( ( c == '(' ) && !in_open_bracket ) { processOpenParen(); } - else if ( c == ')' ) { + else if ( ( c == ')' ) && !in_open_bracket ) { processCloseParen(); } - else if ( c == ',' ) { + else if ( ( c == ',' ) && !in_open_bracket ) { processComma(); } else { @@ -618,12 +631,12 @@ public final class NHXParser implements PhylogenyParser { return true; } - private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) { + private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { final PhylogenyNodeIterator it = p.iteratorPostorder(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); - n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); } } @@ -638,20 +651,14 @@ public final class NHXParser implements PhylogenyParser { if ( replace_underscores ) { s = s.replaceAll( "_+", " " ); } - int ob = 0; - int cb = 0; - String a = ""; - String b = ""; - StringTokenizer t = null; boolean is_nhx = false; - ob = s.indexOf( "[" ); - cb = s.indexOf( "]" ); + final int ob = s.indexOf( "[" ); if ( ob > -1 ) { - a = ""; - b = ""; + String b = ""; is_nhx = true; + final int cb = s.indexOf( "]" ); if ( cb < 0 ) { - throw new NHXFormatException( "error in NHX formatted data: no closing \"]\"" ); + throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" ); } if ( s.indexOf( "&&NHX" ) == ( ob + 1 ) ) { b = s.substring( ob + 6, cb ); @@ -663,15 +670,17 @@ public final class NHXParser implements PhylogenyParser { if ( numbers_only.matches() ) { b = ":" + NHXtags.SUPPORT + bracketed; } + else if ( s.indexOf( "prob=" ) > -1 ) { + processMrBayes3Data( s, node_to_annotate ); + } } - a = s.substring( 0, ob ); - s = a + b; + s = s.substring( 0, ob ) + b; if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) { throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" ); } } - t = new StringTokenizer( s, ":" ); - if ( t.countTokens() >= 1 ) { + final StringTokenizer t = new StringTokenizer( s, ":" ); + if ( t.countTokens() > 0 ) { if ( !s.startsWith( ":" ) ) { node_to_annotate.setName( t.nextToken() ); if ( !replace_underscores @@ -772,7 +781,7 @@ public final class NHXParser implements PhylogenyParser { node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) ); } else if ( s.indexOf( '=' ) < 0 ) { - if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) { throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:" + "\"" + s + "\"" ); } @@ -783,6 +792,54 @@ public final class NHXParser implements PhylogenyParser { } } + private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate ) + throws NHXFormatException { + double sd = -1; + final Matcher mb_prob_sd_matcher = MB_PROB_SD_PATTERN.matcher( s ); + if ( mb_prob_sd_matcher.find() ) { + try { + sd = Double.parseDouble( mb_prob_sd_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse probability standard deviation (Mr Bayes output) from \"" + + s + "\"" ); + } + } + final Matcher mb_prob_matcher = MB_PROB_PATTERN.matcher( s ); + if ( mb_prob_matcher.find() ) { + double prob = -1; + try { + prob = Double.parseDouble( mb_prob_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse probability (Mr Bayes output) from \"" + s + "\"" ); + } + if ( prob >= 0.0 ) { + if ( sd >= 0.0 ) { + node_to_annotate.getBranchData() + .addConfidence( new Confidence( prob, "posterior probability", sd ) ); + } + else { + node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) ); + } + } + } + final Matcher mb_bl_matcher = MB_BL_PATTERN.matcher( s ); + if ( mb_bl_matcher.find() ) { + double bl = -1; + try { + bl = Double.parseDouble( mb_bl_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse median branch length (Mr Bayes output) from \"" + s + + "\"" ); + } + if ( bl >= 0.0 ) { + node_to_annotate.setDistanceToParent( bl ); + } + } + } + /** * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green, * and blue and returns the corresponding Color.