in progress
[jalview.git] / forester / java / src / org / forester / io / parsers / nhx / NHXParser.java
index e227fee..c1cfa40 100644 (file)
@@ -39,6 +39,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
 import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.parsers.util.PhylogenyParserException;
 import org.forester.phylogeny.Phylogeny;
@@ -46,9 +47,11 @@ import org.forester.phylogeny.PhylogenyMethods;
 import org.forester.phylogeny.PhylogenyNode;
 import org.forester.phylogeny.data.Accession;
 import org.forester.phylogeny.data.Annotation;
+import org.forester.phylogeny.data.Confidence;
 import org.forester.phylogeny.data.DomainArchitecture;
 import org.forester.phylogeny.data.Event;
 import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.PhylogenyDataUtil;
 import org.forester.phylogeny.data.PropertiesMap;
 import org.forester.phylogeny.data.Property;
 import org.forester.phylogeny.data.Sequence;
@@ -58,17 +61,16 @@ import org.forester.util.ForesterUtil;
 
 public final class NHXParser implements PhylogenyParser {
 
-    public static final boolean                              LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true;
-    public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT       = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
-    final static private boolean                             GUESS_ROOTEDNESS_DEFAULT          = true;
-    final static private boolean                             GUESS_IF_SUPPORT_VALUES           = true;
-    final static private boolean                             IGNORE_QUOTES_DEFAULT             = false;
-    final static public boolean                              REPLACE_UNDERSCORES_DEFAULT       = false;
+    public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
+    final static private boolean                             GUESS_ROOTEDNESS_DEFAULT    = true;
+    final static private boolean                             GUESS_IF_SUPPORT_VALUES     = true;
+    final static private boolean                             IGNORE_QUOTES_DEFAULT       = false;
+    final static public boolean                              REPLACE_UNDERSCORES_DEFAULT = false;
     private boolean                                          _saw_closing_paren;
-    final static private byte                                STRING                            = 0;
-    final static private byte                                STRING_BUFFER                     = 1;
-    final static private byte                                CHAR_ARRAY                        = 2;
-    final static private byte                                BUFFERED_READER                   = 3;
+    final static private byte                                STRING                      = 0;
+    final static private byte                                STRING_BUFFER               = 1;
+    final static private byte                                CHAR_ARRAY                  = 2;
+    final static private byte                                BUFFERED_READER             = 3;
     private boolean                                          _guess_rootedness;
     private boolean                                          _has_next;
     private boolean                                          _ignore_quotes;
@@ -82,10 +84,16 @@ public final class NHXParser implements PhylogenyParser {
     private Phylogeny                                        _current_phylogeny;
     private PhylogenyMethods.TAXONOMY_EXTRACTION             _taxonomy_extraction;
     private boolean                                          _replace_underscores;
-    public final static Pattern                              UC_LETTERS_NUMBERS_PATTERN        = Pattern
-                                                                                                       .compile( "^[A-Z0-9]+$" );
-    public final static Pattern                              NUMBERS_ONLY_PATTERN              = Pattern
-                                                                                                       .compile( "^[0-9]+$" );
+    public final static Pattern                              UC_LETTERS_NUMBERS_PATTERN  = Pattern
+                                                                                                 .compile( "^[A-Z0-9]+$" );
+    public final static Pattern                              NUMBERS_ONLY_PATTERN        = Pattern
+                                                                                                 .compile( "^[0-9\\.]+$" );
+    public final static Pattern                              MB_PROB_PATTERN             = Pattern
+                                                                                                 .compile( "prob=([^,]+)" );
+    public final static Pattern                              MB_PROB_SD_PATTERN          = Pattern
+                                                                                                 .compile( "prob_stddev=([^,]+)" );
+    public final static Pattern                              MB_BL_PATTERN               = Pattern
+                                                                                                 .compile( "length_median=([^,]+)" );
 
     public NHXParser() {
         init();
@@ -110,8 +118,9 @@ public final class NHXParser implements PhylogenyParser {
      * 
      * @throws PhylogenyParserException
      * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
      */
-    private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException {
+    private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
         setCladeLevel( 0 );
         if ( getCurrentPhylogeny() != null ) {
             parseNHX( getCurrentAnotation().toString(),
@@ -120,7 +129,7 @@ public final class NHXParser implements PhylogenyParser {
                       isReplaceUnderscores() );
             if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) {
                 if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) {
-                    NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() );
+                    NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() );
                 }
             }
             if ( isGuessRootedness() ) {
@@ -134,7 +143,8 @@ public final class NHXParser implements PhylogenyParser {
         }
     }
 
-    private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException {
+    private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException,
+            PhyloXmlDataFormatException {
         setCladeLevel( 0 );
         final PhylogenyNode new_node = new PhylogenyNode();
         parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
@@ -300,9 +310,9 @@ public final class NHXParser implements PhylogenyParser {
                         && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) {
                     saw_colon = false;
                 }
-            }
-            if ( in_open_bracket && c == ']' ) {
-                in_open_bracket = false;
+                if ( in_open_bracket && ( c == ']' ) ) {
+                    in_open_bracket = false;
+                }
             }
             // \n\t is always ignored,
             // as is " (34) and ' (39) (space is 32):
@@ -361,13 +371,13 @@ public final class NHXParser implements PhylogenyParser {
                 // comment consisting just of "[]":
                 saw_open_bracket = false;
             }
-            else if ( c == '(' && !in_open_bracket ) {
+            else if ( ( c == '(' ) && !in_open_bracket ) {
                 processOpenParen();
             }
-            else if ( c == ')' && !in_open_bracket ) {
+            else if ( ( c == ')' ) && !in_open_bracket ) {
                 processCloseParen();
             }
-            else if ( c == ',' && !in_open_bracket ) {
+            else if ( ( c == ',' ) && !in_open_bracket ) {
                 processComma();
             }
             else {
@@ -400,8 +410,9 @@ public final class NHXParser implements PhylogenyParser {
      * 
      * @throws PhylogenyParserException
      * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
      */
-    private void processCloseParen() throws PhylogenyParserException, NHXFormatException {
+    private void processCloseParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
         decreaseCladeLevel();
         if ( !isSawClosingParen() ) {
             final PhylogenyNode new_node = new PhylogenyNode();
@@ -427,8 +438,9 @@ public final class NHXParser implements PhylogenyParser {
      * 
      * @throws PhylogenyParserException
      * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
      */
-    private void processComma() throws PhylogenyParserException, NHXFormatException {
+    private void processComma() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
         if ( !isSawClosingParen() ) {
             final PhylogenyNode new_node = new PhylogenyNode();
             parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
@@ -452,8 +464,9 @@ public final class NHXParser implements PhylogenyParser {
      * 
      * @throws PhylogenyParserException
      * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
      */
-    private void processOpenParen() throws PhylogenyParserException, NHXFormatException {
+    private void processOpenParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
         final PhylogenyNode new_node = new PhylogenyNode();
         if ( getCladeLevel() == 0 ) {
             if ( getCurrentPhylogeny() != null ) {
@@ -623,21 +636,20 @@ public final class NHXParser implements PhylogenyParser {
         return true;
     }
 
-    private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) {
+    private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { // for every node of p: store its distance to parent as a bootstrap confidence, then reset the distance
         final PhylogenyNodeIterator it = p.iteratorPostorder();
         while ( it.hasNext() ) {
             final PhylogenyNode n = it.next();
             PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() ); // the value read as a branch length is recorded as the node's bootstrap confidence
-            n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT );
+            n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); // the distance has been moved, so put the default back
         }
     }
 
     public static void parseNHX( String s,
                                  final PhylogenyNode node_to_annotate,
                                  final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction,
-                                 final boolean replace_underscores ) throws NHXFormatException {
-        System.out.println( s );
-        System.out.println();
+                                 final boolean replace_underscores ) throws NHXFormatException,
+            PhyloXmlDataFormatException {
         if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
             throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" );
         }
@@ -645,18 +657,12 @@ public final class NHXParser implements PhylogenyParser {
             if ( replace_underscores ) {
                 s = s.replaceAll( "_+", " " );
             }
-            int ob = 0;
-            int cb = 0;
-            String a = "";
-            String b = "";
-            StringTokenizer t = null;
             boolean is_nhx = false;
-            ob = s.indexOf( "[" );
-            cb = s.indexOf( "]" );
+            final int ob = s.indexOf( "[" );
             if ( ob > -1 ) {
-                a = "";
-                b = "";
+                String b = "";
                 is_nhx = true;
+                final int cb = s.indexOf( "]" );
                 if ( cb < 0 ) {
                     throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" );
                 }
@@ -670,23 +676,23 @@ public final class NHXParser implements PhylogenyParser {
                     if ( numbers_only.matches() ) {
                         b = ":" + NHXtags.SUPPORT + bracketed;
                     }
+                    else if ( s.indexOf( "prob=" ) > -1 ) {
+                        processMrBayes3Data( s, node_to_annotate );
+                    }
                 }
-                a = s.substring( 0, ob );
-                s = a + b;
+                s = s.substring( 0, ob ) + b;
                 if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) {
                     throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" );
                 }
             }
-            t = new StringTokenizer( s, ":" );
+            final StringTokenizer t = new StringTokenizer( s, ":" );
             if ( t.countTokens() > 0 ) {
                 if ( !s.startsWith( ":" ) ) {
                     node_to_annotate.setName( t.nextToken() );
                     if ( !replace_underscores
                             && ( !is_nhx && ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) ) ) {
-                        final String tax = ParserUtils
-                                .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
-                                                                  LIMIT_SPECIES_NAMES_TO_FIVE_CHARS,
-                                                                  taxonomy_extraction );
+                        final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
+                                                                                        taxonomy_extraction );
                         if ( !ForesterUtil.isEmpty( tax ) ) {
                             if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
                                 node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
@@ -697,8 +703,6 @@ public final class NHXParser implements PhylogenyParser {
                 }
                 while ( t.hasMoreTokens() ) {
                     s = t.nextToken();
-                    System.out.println( "=>" + s );
-                    System.out.println();
                     if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.SPECIES_NAME ) ) {
                         if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
                             node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
@@ -781,7 +785,7 @@ public final class NHXParser implements PhylogenyParser {
                         node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) );
                     }
                     else if ( s.indexOf( '=' ) < 0 ) {
-                        if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) {
+                        if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) {
                             throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
                                     + "\"" + s + "\"" );
                         }
@@ -792,6 +796,54 @@ public final class NHXParser implements PhylogenyParser {
         }
     }
 
+    /**
+     * Reads MrBayes 3 style key/value annotations from s and applies them to the given node.
+     * <p>
+     * Three keys are looked up via MB_PROB_SD_PATTERN, MB_PROB_PATTERN and MB_BL_PATTERN:
+     * <ul>
+     * <li>"prob_stddev=" and "prob=": a non-negative probability is added to the node's branch data as a
+     * Confidence of type "posterior probability"; a non-negative standard deviation, if present, is passed
+     * along with it.</li>
+     * <li>"length_median=": a non-negative value is set as the node's distance to parent.</li>
+     * </ul>
+     * Keys which are absent, and values which are negative, are ignored.
+     * 
+     * @param s the string to search; the caller passes the annotation including its square brackets
+     * @param node_to_annotate the node receiving the confidence and/or distance to parent
+     * @throws NHXFormatException if a key is present but its value cannot be parsed as a number
+     */
+    private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate )
+            throws NHXFormatException {
+        // The standard deviation is parsed first so that a malformed value is reported before anything is
+        // stored on the node.
+        final double sd = parseMrBayesValue( MB_PROB_SD_PATTERN, s, "probability standard deviation" );
+        final double prob = parseMrBayesValue( MB_PROB_PATTERN, s, "probability" );
+        if ( prob >= 0.0 ) {
+            if ( sd >= 0.0 ) {
+                node_to_annotate.getBranchData()
+                        .addConfidence( new Confidence( prob, "posterior probability", sd ) );
+            }
+            else {
+                node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) );
+            }
+        }
+        final double bl = parseMrBayesValue( MB_BL_PATTERN, s, "median branch length" );
+        if ( bl >= 0.0 ) {
+            node_to_annotate.setDistanceToParent( bl );
+        }
+    }
+
+    /**
+     * Finds the first match of pattern in s and parses its first capturing group as a double.
+     * 
+     * @param pattern a pattern whose group 1 captures the numeric value
+     * @param s the string to search
+     * @param description name of the value, used in the error message only
+     * @return the parsed value, or -1 if pattern does not occur in s
+     * @throws NHXFormatException if the captured text is not a parsable number
+     */
+    private static double parseMrBayesValue( final Pattern pattern, final String s, final String description )
+            throws NHXFormatException {
+        final Matcher matcher = pattern.matcher( s );
+        if ( !matcher.find() ) {
+            return -1;
+        }
+        String value = matcher.group( 1 );
+        // The patterns capture everything up to the next comma. When the key is the last one inside the
+        // brackets there is no comma, and the capture runs on through the closing "]": cut it off there.
+        final int cb = value.indexOf( ']' );
+        if ( cb > -1 ) {
+            value = value.substring( 0, cb );
+        }
+        try {
+            return Double.parseDouble( value );
+        }
+        catch ( final NumberFormatException e ) {
+            throw new NHXFormatException( "failed to parse " + description + " (Mr Bayes output) from \"" + s
+                    + "\"" );
+        }
+    }
+
     /**
      * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green,
      * and blue and returns the corresponding Color.