X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fio%2Fparsers%2Fnhx%2FNHXParser.java;h=c37310487ec240ce82a1d982b491929c1b362558;hb=44fddb76faa8975295b8b0ad38609256b5011ced;hp=e227fee070534f22d6a348bf699d7df1f8d46be8;hpb=674f7858341235991a8d0eda5f55a20243944832;p=jalview.git diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java index e227fee..c373104 100644 --- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java +++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java @@ -46,9 +46,11 @@ import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Annotation; +import org.forester.phylogeny.data.Confidence; import org.forester.phylogeny.data.DomainArchitecture; import org.forester.phylogeny.data.Event; import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.PhylogenyDataUtil; import org.forester.phylogeny.data.PropertiesMap; import org.forester.phylogeny.data.Property; import org.forester.phylogeny.data.Sequence; @@ -58,7 +60,7 @@ import org.forester.util.ForesterUtil; public final class NHXParser implements PhylogenyParser { - public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true; + public static final boolean LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = false; public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO; final static private boolean GUESS_ROOTEDNESS_DEFAULT = true; final static private boolean GUESS_IF_SUPPORT_VALUES = true; @@ -85,7 +87,13 @@ public final class NHXParser implements PhylogenyParser { public final static Pattern UC_LETTERS_NUMBERS_PATTERN = Pattern .compile( "^[A-Z0-9]+$" ); public final static Pattern NUMBERS_ONLY_PATTERN = Pattern - .compile( "^[0-9]+$" ); + .compile( "^[0-9\\.]+$" ); + public final static Pattern MB_PROB_PATTERN = Pattern + .compile( "prob=([^,]+)" ); + public final static Pattern MB_PROB_SD_PATTERN = Pattern + .compile( "prob_stddev=([^,]+)" ); + public final static Pattern MB_BL_PATTERN = Pattern + .compile( "length_median=([^,]+)" ); public NHXParser() { init(); @@ -120,7 +128,7 @@ public final class NHXParser implements PhylogenyParser { isReplaceUnderscores() ); if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) { if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) { - NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() ); + NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() ); } } if ( isGuessRootedness() ) { @@ -300,9 +308,9 @@ public final class NHXParser implements PhylogenyParser { && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) { saw_colon = false; } - } - if ( in_open_bracket && c == ']' ) { - in_open_bracket = false; + if ( in_open_bracket && ( c == ']' ) ) { + in_open_bracket = false; + } } // \n\t is always ignored, // as is " (34) and ' (39) (space is 32): @@ -361,13 +369,13 @@ public final class NHXParser implements PhylogenyParser { // comment consisting just of "[]": saw_open_bracket = false; } - else if ( c == '(' && !in_open_bracket ) { + else if ( ( c == '(' ) && !in_open_bracket ) { processOpenParen(); } - else if ( c == ')' && !in_open_bracket ) { + else if ( ( c == ')' ) && !in_open_bracket ) { processCloseParen(); } - else if ( c == ',' && !in_open_bracket ) { + else if ( ( c == ',' ) && !in_open_bracket ) { processComma(); } else { @@ -623,12 +631,12 @@ public final class NHXParser implements PhylogenyParser { return true; } - private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) { + private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { final PhylogenyNodeIterator it = p.iteratorPostorder(); while ( it.hasNext() ) { final PhylogenyNode n = it.next(); PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); - n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT ); + n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); } } @@ -636,8 +644,6 @@ public final class NHXParser implements PhylogenyParser { final PhylogenyNode node_to_annotate, final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction, final boolean replace_underscores ) throws NHXFormatException { - System.out.println( s ); - System.out.println(); if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) { throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" ); } @@ -645,18 +651,12 @@ public final class NHXParser implements PhylogenyParser { if ( replace_underscores ) { s = s.replaceAll( "_+", " " ); } - int ob = 0; - int cb = 0; - String a = ""; - String b = ""; - StringTokenizer t = null; boolean is_nhx = false; - ob = s.indexOf( "[" ); - cb = s.indexOf( "]" ); + final int ob = s.indexOf( "[" ); if ( ob > -1 ) { - a = ""; - b = ""; + String b = ""; is_nhx = true; + final int cb = s.indexOf( "]" ); if ( cb < 0 ) { throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" ); } @@ -670,14 +670,16 @@ public final class NHXParser implements PhylogenyParser { if ( numbers_only.matches() ) { b = ":" + NHXtags.SUPPORT + bracketed; } + else if ( s.indexOf( "prob=" ) > -1 ) { + processMrBayes3Data( s, node_to_annotate ); + } } - a = s.substring( 0, ob ); - s = a + b; + s = s.substring( 0, ob ) + b; if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) { throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" ); } } - t = new StringTokenizer( s, ":" ); + final StringTokenizer t = new StringTokenizer( s, ":" ); if ( t.countTokens() > 0 ) { if ( !s.startsWith( ":" ) ) { node_to_annotate.setName( t.nextToken() ); @@ -697,8 +699,6 @@ public final class NHXParser implements PhylogenyParser { } while ( t.hasMoreTokens() ) { s = t.nextToken(); - System.out.println( "=>" + s ); - System.out.println(); if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.SPECIES_NAME ) ) { if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) { node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() ); @@ -781,7 +781,7 @@ public final class NHXParser implements PhylogenyParser { node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) ); } else if ( s.indexOf( '=' ) < 0 ) { - if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) { + if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) { throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:" + "\"" + s + "\"" ); } @@ -792,6 +792,54 @@ public final class NHXParser implements PhylogenyParser { } } + private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate ) + throws NHXFormatException { + double sd = -1; + final Matcher mb_prob_sd_matcher = MB_PROB_SD_PATTERN.matcher( s ); + if ( mb_prob_sd_matcher.find() ) { + try { + sd = Double.parseDouble( mb_prob_sd_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse probability standard deviation (Mr Bayes output) from \"" + + s + "\"" ); + } + } + final Matcher mb_prob_matcher = MB_PROB_PATTERN.matcher( s ); + if ( mb_prob_matcher.find() ) { + double prob = -1; + try { + prob = Double.parseDouble( mb_prob_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse probability (Mr Bayes output) from \"" + s + "\"" ); + } + if ( prob >= 0.0 ) { + if ( sd >= 0.0 ) { + node_to_annotate.getBranchData() + .addConfidence( new Confidence( prob, "posterior probability", sd ) ); + } + else { + node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) ); + } + } + } + final Matcher mb_bl_matcher = MB_BL_PATTERN.matcher( s ); + if ( mb_bl_matcher.find() ) { + double bl = -1; + try { + bl = Double.parseDouble( mb_bl_matcher.group( 1 ) ); + } + catch ( final NumberFormatException e ) { + throw new NHXFormatException( "failed to parse median branch length (Mr Bayes output) from \"" + s + + "\"" ); + } + if ( bl >= 0.0 ) { + node_to_annotate.setDistanceToParent( bl ); + } + } + } + /** * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green, * and blue and returns the corresponding Color.