in progress
[jalview.git] / forester / java / src / org / forester / io / parsers / nhx / NHXParser.java
index cdde33d..c373104 100644 (file)
@@ -39,15 +39,18 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.parsers.util.PhylogenyParserException;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.data.Accession;
 import org.forester.phylogeny.data.Annotation;
+import org.forester.phylogeny.data.Confidence;
 import org.forester.phylogeny.data.DomainArchitecture;
 import org.forester.phylogeny.data.Event;
 import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.PhylogenyDataUtil;
 import org.forester.phylogeny.data.PropertiesMap;
 import org.forester.phylogeny.data.Property;
 import org.forester.phylogeny.data.Sequence;
@@ -57,34 +60,40 @@ import org.forester.util.ForesterUtil;
 
 public final class NHXParser implements PhylogenyParser {
 
-    public static final boolean                          LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true;
-    public static final ForesterUtil.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT       = ForesterUtil.TAXONOMY_EXTRACTION.NO;
-    final static private boolean                         GUESS_ROOTEDNESS_DEFAULT          = true;
-    final static private boolean                         GUESS_IF_SUPPORT_VALUES           = true;
-    final static private boolean                         IGNORE_QUOTES_DEFAULT             = false;
-    final static public boolean                          REPLACE_UNDERSCORES_DEFAULT       = false;
-    private boolean                                      _saw_closing_paren;
-    final static private byte                            STRING                            = 0;
-    final static private byte                            STRING_BUFFER                     = 1;
-    final static private byte                            CHAR_ARRAY                        = 2;
-    final static private byte                            BUFFERED_READER                   = 3;
-    private boolean                                      _guess_rootedness;
-    private boolean                                      _has_next;
-    private boolean                                      _ignore_quotes;
-    private byte                                         _input_type;
-    private int                                          _source_length;
-    private PhylogenyNode                                _current_node;
-    private StringBuilder                                _current_anotation;
-    private Object                                       _nhx_source;
-    private int                                          _clade_level;
-    private List<Phylogeny>                              _phylogenies;
-    private Phylogeny                                    _current_phylogeny;
-    private ForesterUtil.TAXONOMY_EXTRACTION             _taxonomy_extraction;
-    private boolean                                      _replace_underscores;
-    public final static Pattern                          UC_LETTERS_NUMBERS_PATTERN        = Pattern
-                                                                                                   .compile( "^[A-Z0-9]+$" );
-    public final static Pattern                          NUMBERS_ONLY_PATTERN              = Pattern
-                                                                                                   .compile( "^[0-9]+$" );
+    public static final boolean                              LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = false;
+    public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT       = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
+    final static private boolean                             GUESS_ROOTEDNESS_DEFAULT          = true;
+    final static private boolean                             GUESS_IF_SUPPORT_VALUES           = true;
+    final static private boolean                             IGNORE_QUOTES_DEFAULT             = false;
+    final static public boolean                              REPLACE_UNDERSCORES_DEFAULT       = false;
+    private boolean                                          _saw_closing_paren;
+    final static private byte                                STRING                            = 0;
+    final static private byte                                STRING_BUFFER                     = 1;
+    final static private byte                                CHAR_ARRAY                        = 2;
+    final static private byte                                BUFFERED_READER                   = 3;
+    private boolean                                          _guess_rootedness;
+    private boolean                                          _has_next;
+    private boolean                                          _ignore_quotes;
+    private byte                                             _input_type;
+    private int                                              _source_length;
+    private PhylogenyNode                                    _current_node;
+    private StringBuilder                                    _current_anotation;
+    private Object                                           _nhx_source;
+    private int                                              _clade_level;
+    private List<Phylogeny>                                  _phylogenies;
+    private Phylogeny                                        _current_phylogeny;
+    private PhylogenyMethods.TAXONOMY_EXTRACTION             _taxonomy_extraction;
+    private boolean                                          _replace_underscores;
+    public final static Pattern                              UC_LETTERS_NUMBERS_PATTERN        = Pattern
+                                                                                                       .compile( "^[A-Z0-9]+$" );
+    public final static Pattern                              NUMBERS_ONLY_PATTERN              = Pattern
+                                                                                                       .compile( "^[0-9\\.]+$" );
+    public final static Pattern                              MB_PROB_PATTERN                   = Pattern
+                                                                                                       .compile( "prob=([^,]+)" );
+    public final static Pattern                              MB_PROB_SD_PATTERN                = Pattern
+                                                                                                       .compile( "prob_stddev=([^,]+)" );
+    public final static Pattern                              MB_BL_PATTERN                     = Pattern
+                                                                                                       .compile( "length_median=([^,]+)" );
 
     public NHXParser() {
         init();
@@ -119,7 +128,7 @@ public final class NHXParser implements PhylogenyParser {
                       isReplaceUnderscores() );
             if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) {
                 if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) {
-                    NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() );
+                    NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() );
                 }
             }
             if ( isGuessRootedness() ) {
@@ -187,7 +196,7 @@ public final class NHXParser implements PhylogenyParser {
         return _source_length;
     }
 
-    public ForesterUtil.TAXONOMY_EXTRACTION getTaxonomyExtraction() {
+    public PhylogenyMethods.TAXONOMY_EXTRACTION getTaxonomyExtraction() { // returns the taxonomy-extraction mode currently stored in _taxonomy_extraction
         return _taxonomy_extraction;
     }
 
@@ -255,6 +264,7 @@ public final class NHXParser implements PhylogenyParser {
         boolean in_comment = false;
         boolean saw_colon = false;
         boolean saw_open_bracket = false;
+        boolean in_open_bracket = false;
         boolean in_double_quote = false;
         boolean in_single_quote = false;
         setPhylogenies( new ArrayList<Phylogeny>() );
@@ -298,6 +308,9 @@ public final class NHXParser implements PhylogenyParser {
                         && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) {
                     saw_colon = false;
                 }
+                if ( in_open_bracket && ( c == ']' ) ) {
+                    in_open_bracket = false;
+                }
             }
             // \n\t is always ignored,
             // as is " (34) and ' (39) (space is 32):
@@ -337,6 +350,7 @@ public final class NHXParser implements PhylogenyParser {
             }
             else if ( c == '[' ) {
                 saw_open_bracket = true;
+                in_open_bracket = true;
             }
             else if ( saw_open_bracket ) {
                 if ( c != ']' ) {
@@ -355,13 +369,13 @@ public final class NHXParser implements PhylogenyParser {
                 // comment consisting just of "[]":
                 saw_open_bracket = false;
             }
-            else if ( c == '(' ) {
+            else if ( ( c == '(' ) && !in_open_bracket ) {
                 processOpenParen();
             }
-            else if ( c == ')' ) {
+            else if ( ( c == ')' ) && !in_open_bracket ) {
                 processCloseParen();
             }
-            else if ( c == ',' ) {
+            else if ( ( c == ',' ) && !in_open_bracket ) {
                 processComma();
             }
             else {
@@ -588,7 +602,7 @@ public final class NHXParser implements PhylogenyParser {
         _source_length = source_length;
     }
 
-    public void setTaxonomyExtraction( final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction ) {
+    public void setTaxonomyExtraction( final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction ) { // stores the given mode in _taxonomy_extraction; no validation is done here
         _taxonomy_extraction = taxonomy_extraction;
     }
 
@@ -617,40 +631,34 @@ public final class NHXParser implements PhylogenyParser {
         return true;
     }
 
-    private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) {
+    private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { // for every node of p (postorder): copy the distance to parent into a bootstrap confidence, then reset the distance
         final PhylogenyNodeIterator it = p.iteratorPostorder();
         while ( it.hasNext() ) {
             final PhylogenyNode n = it.next();
             PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() );
-            n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT );
+            n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); // the value was just stored as confidence, so the branch length goes back to the default constant
         }
     }
 
     public static void parseNHX( String s,
                                  final PhylogenyNode node_to_annotate,
-                                 final ForesterUtil.TAXONOMY_EXTRACTION taxonomy_extraction,
+                                 final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction,
                                  final boolean replace_underscores ) throws NHXFormatException {
-        if ( ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
+        if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
             throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" );
         }
         if ( ( s != null ) && ( s.length() > 0 ) ) {
             if ( replace_underscores ) {
                 s = s.replaceAll( "_+", " " );
             }
-            int ob = 0;
-            int cb = 0;
-            String a = "";
-            String b = "";
-            StringTokenizer t = null;
             boolean is_nhx = false;
-            ob = s.indexOf( "[" );
-            cb = s.indexOf( "]" );
+            final int ob = s.indexOf( "[" );
             if ( ob > -1 ) {
-                a = "";
-                b = "";
+                String b = "";
                 is_nhx = true;
+                final int cb = s.indexOf( "]" );
                 if ( cb < 0 ) {
-                    throw new NHXFormatException( "error in NHX formatted data: no closing \"]\"" );
+                    throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" );
                 }
                 if ( s.indexOf( "&&NHX" ) == ( ob + 1 ) ) {
                     b = s.substring( ob + 6, cb );
@@ -662,20 +670,22 @@ public final class NHXParser implements PhylogenyParser {
                     if ( numbers_only.matches() ) {
                         b = ":" + NHXtags.SUPPORT + bracketed;
                     }
+                    else if ( s.indexOf( "prob=" ) > -1 ) {
+                        processMrBayes3Data( s, node_to_annotate );
+                    }
                 }
-                a = s.substring( 0, ob );
-                s = a + b;
+                s = s.substring( 0, ob ) + b;
                 if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) {
                     throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" );
                 }
             }
-            t = new StringTokenizer( s, ":" );
-            if ( t.countTokens() >= 1 ) {
+            final StringTokenizer t = new StringTokenizer( s, ":" );
+            if ( t.countTokens() > 0 ) {
                 if ( !s.startsWith( ":" ) ) {
                     node_to_annotate.setName( t.nextToken() );
                     if ( !replace_underscores
-                            && ( !is_nhx && ( taxonomy_extraction != ForesterUtil.TAXONOMY_EXTRACTION.NO ) ) ) {
-                        final String tax = ForesterUtil
+                            && ( !is_nhx && ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) ) ) {
+                        final String tax = ParserUtils
                                 .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
                                                                   LIMIT_SPECIES_NAMES_TO_FIVE_CHARS,
                                                                   taxonomy_extraction );
@@ -771,7 +781,7 @@ public final class NHXParser implements PhylogenyParser {
                         node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) );
                     }
                     else if ( s.indexOf( '=' ) < 0 ) {
-                        if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) {
+                        if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) {
                             throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
                                     + "\"" + s + "\"" );
                         }
@@ -782,6 +792,54 @@ public final class NHXParser implements PhylogenyParser {
         }
     }
 
+    private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate ) // reads "prob=", "prob_stddev=" and "length_median=" entries from s and applies them to node_to_annotate
+            throws NHXFormatException { // thrown when a matched value cannot be parsed as a double
+        double sd = -1; // negative means "no standard deviation found"; checked with sd >= 0.0 below
+        final Matcher mb_prob_sd_matcher = MB_PROB_SD_PATTERN.matcher( s );
+        if ( mb_prob_sd_matcher.find() ) { // find(): only the first "prob_stddev=" occurrence in s is used
+            try {
+                sd = Double.parseDouble( mb_prob_sd_matcher.group( 1 ) );
+            }
+            catch ( final NumberFormatException e ) {
+                throw new NHXFormatException( "failed to parse probability standard deviation (Mr Bayes output) from \""
+                        + s + "\"" );
+            }
+        }
+        final Matcher mb_prob_matcher = MB_PROB_PATTERN.matcher( s );
+        if ( mb_prob_matcher.find() ) { // only the first "prob=" occurrence in s is used
+            double prob = -1;
+            try {
+                prob = Double.parseDouble( mb_prob_matcher.group( 1 ) );
+            }
+            catch ( final NumberFormatException e ) {
+                throw new NHXFormatException( "failed to parse probability (Mr Bayes output) from \"" + s + "\"" );
+            }
+            if ( prob >= 0.0 ) { // a negative parsed probability is skipped without an error
+                if ( sd >= 0.0 ) { // pass the standard deviation along only when a non-negative one was parsed above
+                    node_to_annotate.getBranchData()
+                            .addConfidence( new Confidence( prob, "posterior probability", sd ) ); // presumably (value, type, standard deviation) - confirm against Confidence
+                }
+                else {
+                    node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) );
+                }
+            }
+        }
+        final Matcher mb_bl_matcher = MB_BL_PATTERN.matcher( s );
+        if ( mb_bl_matcher.find() ) { // only the first "length_median=" occurrence in s is used
+            double bl = -1;
+            try {
+                bl = Double.parseDouble( mb_bl_matcher.group( 1 ) );
+            }
+            catch ( final NumberFormatException e ) {
+                throw new NHXFormatException( "failed to parse median branch length (Mr Bayes output) from \"" + s
+                        + "\"" );
+            }
+            if ( bl >= 0.0 ) { // a negative parsed length is skipped without an error
+                node_to_annotate.setDistanceToParent( bl ); // the median branch length becomes the node's distance to parent
+            }
+        }
+    }
+
     /**
      * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green,
      * and blue and returns the corresponding Color.