in progress

[jalview.git] / forester / java / src / org / forester / io / parsers / nhx / NHXParser.java
diff --git a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java

index e227fee..c1cfa40 100644 (file)
--- a/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java
+++ b/forester/java/src/org/forester/io/parsers/nhx/NHXParser.java
@@ -39,6 +39,7 @@ import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  
  import org.forester.io.parsers.PhylogenyParser;
+import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
  import org.forester.io.parsers.util.ParserUtils;
  import org.forester.io.parsers.util.PhylogenyParserException;
  import org.forester.phylogeny.Phylogeny;
@@ -46,9 +47,11 @@ import org.forester.phylogeny.PhylogenyMethods;
  import org.forester.phylogeny.PhylogenyNode;
  import org.forester.phylogeny.data.Accession;
  import org.forester.phylogeny.data.Annotation;
+import org.forester.phylogeny.data.Confidence;
  import org.forester.phylogeny.data.DomainArchitecture;
  import org.forester.phylogeny.data.Event;
  import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.PhylogenyDataUtil;
  import org.forester.phylogeny.data.PropertiesMap;
  import org.forester.phylogeny.data.Property;
  import org.forester.phylogeny.data.Sequence;
@@ -58,17 +61,16 @@ import org.forester.util.ForesterUtil;
  
  public final class NHXParser implements PhylogenyParser {
  
-    public static final boolean                              LIMIT_SPECIES_NAMES_TO_FIVE_CHARS = true;
-    public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT       = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
-    final static private boolean                             GUESS_ROOTEDNESS_DEFAULT          = true;
-    final static private boolean                             GUESS_IF_SUPPORT_VALUES           = true;
-    final static private boolean                             IGNORE_QUOTES_DEFAULT             = false;
-    final static public boolean                              REPLACE_UNDERSCORES_DEFAULT       = false;
+    public static final PhylogenyMethods.TAXONOMY_EXTRACTION TAXONOMY_EXTRACTION_DEFAULT = PhylogenyMethods.TAXONOMY_EXTRACTION.NO;
+    final static private boolean                             GUESS_ROOTEDNESS_DEFAULT    = true;
+    final static private boolean                             GUESS_IF_SUPPORT_VALUES     = true;
+    final static private boolean                             IGNORE_QUOTES_DEFAULT       = false;
+    final static public boolean                              REPLACE_UNDERSCORES_DEFAULT = false;
      private boolean                                          _saw_closing_paren;
-    final static private byte                                STRING                            = 0;
-    final static private byte                                STRING_BUFFER                     = 1;
-    final static private byte                                CHAR_ARRAY                        = 2;
-    final static private byte                                BUFFERED_READER                   = 3;
+    final static private byte                                STRING                      = 0;
+    final static private byte                                STRING_BUFFER               = 1;
+    final static private byte                                CHAR_ARRAY                  = 2;
+    final static private byte                                BUFFERED_READER             = 3;
      private boolean                                          _guess_rootedness;
      private boolean                                          _has_next;
      private boolean                                          _ignore_quotes;
@@ -82,10 +84,16 @@ public final class NHXParser implements PhylogenyParser {
      private Phylogeny                                        _current_phylogeny;
      private PhylogenyMethods.TAXONOMY_EXTRACTION             _taxonomy_extraction;
      private boolean                                          _replace_underscores;
-    public final static Pattern                              UC_LETTERS_NUMBERS_PATTERN        = Pattern
-                                                                                                       .compile( "^[A-Z0-9]+$" );
-    public final static Pattern                              NUMBERS_ONLY_PATTERN              = Pattern
-                                                                                                       .compile( "^[0-9]+$" );
+    public final static Pattern                              UC_LETTERS_NUMBERS_PATTERN  = Pattern
+                                                                                                 .compile( "^[A-Z0-9]+$" );
+    public final static Pattern                              NUMBERS_ONLY_PATTERN        = Pattern
+                                                                                                 .compile( "^[0-9\\.]+$" );
+    public final static Pattern                              MB_PROB_PATTERN             = Pattern
+                                                                                                 .compile( "prob=([^,]+)" );
+    public final static Pattern                              MB_PROB_SD_PATTERN          = Pattern
+                                                                                                 .compile( "prob_stddev=([^,]+)" );
+    public final static Pattern                              MB_BL_PATTERN               = Pattern
+                                                                                                 .compile( "length_median=([^,]+)" );
  
      public NHXParser() {
          init();
@@ -110,8 +118,9 @@ public final class NHXParser implements PhylogenyParser {
       * 
       * @throws PhylogenyParserException
       * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
       */
-    private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException {
+    private void finishPhylogeny() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
          setCladeLevel( 0 );
          if ( getCurrentPhylogeny() != null ) {
              parseNHX( getCurrentAnotation().toString(),
@@ -120,7 +129,7 @@ public final class NHXParser implements PhylogenyParser {
                        isReplaceUnderscores() );
              if ( NHXParser.GUESS_IF_SUPPORT_VALUES ) {
                  if ( NHXParser.isBranchLengthsLikeBootstrapValues( getCurrentPhylogeny() ) ) {
-                    NHXParser.moveBranchLengthsToBootstrapValues( getCurrentPhylogeny() );
+                    NHXParser.moveBranchLengthsToConfidenceValues( getCurrentPhylogeny() );
                  }
              }
              if ( isGuessRootedness() ) {
@@ -134,7 +143,8 @@ public final class NHXParser implements PhylogenyParser {
          }
      }
  
-    private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException {
+    private void finishSingleNodePhylogeny() throws PhylogenyParserException, NHXFormatException,
+            PhyloXmlDataFormatException {
          setCladeLevel( 0 );
          final PhylogenyNode new_node = new PhylogenyNode();
          parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
@@ -300,9 +310,9 @@ public final class NHXParser implements PhylogenyParser {
                          && ( ( c != '[' ) && ( c != '.' ) && ( ( c < 48 ) || ( c > 57 ) ) ) ) {
                      saw_colon = false;
                  }
-            }
-            if ( in_open_bracket && c == ']' ) {
-                in_open_bracket = false;
+                if ( in_open_bracket && ( c == ']' ) ) {
+                    in_open_bracket = false;
+                }
              }
              // \n\t is always ignored,
              // as is " (34) and ' (39) (space is 32):
@@ -361,13 +371,13 @@ public final class NHXParser implements PhylogenyParser {
                  // comment consisting just of "[]":
                  saw_open_bracket = false;
              }
-            else if ( c == '(' && !in_open_bracket ) {
+            else if ( ( c == '(' ) && !in_open_bracket ) {
                  processOpenParen();
              }
-            else if ( c == ')' && !in_open_bracket ) {
+            else if ( ( c == ')' ) && !in_open_bracket ) {
                  processCloseParen();
              }
-            else if ( c == ',' && !in_open_bracket ) {
+            else if ( ( c == ',' ) && !in_open_bracket ) {
                  processComma();
              }
              else {
@@ -400,8 +410,9 @@ public final class NHXParser implements PhylogenyParser {
       * 
       * @throws PhylogenyParserException
       * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
       */
-    private void processCloseParen() throws PhylogenyParserException, NHXFormatException {
+    private void processCloseParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
          decreaseCladeLevel();
          if ( !isSawClosingParen() ) {
              final PhylogenyNode new_node = new PhylogenyNode();
@@ -427,8 +438,9 @@ public final class NHXParser implements PhylogenyParser {
       * 
       * @throws PhylogenyParserException
       * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
       */
-    private void processComma() throws PhylogenyParserException, NHXFormatException {
+    private void processComma() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
          if ( !isSawClosingParen() ) {
              final PhylogenyNode new_node = new PhylogenyNode();
              parseNHX( getCurrentAnotation().toString(), new_node, getTaxonomyExtraction(), isReplaceUnderscores() );
@@ -452,8 +464,9 @@ public final class NHXParser implements PhylogenyParser {
       * 
       * @throws PhylogenyParserException
       * @throws NHXFormatException
+     * @throws PhyloXmlDataFormatException 
       */
-    private void processOpenParen() throws PhylogenyParserException, NHXFormatException {
+    private void processOpenParen() throws PhylogenyParserException, NHXFormatException, PhyloXmlDataFormatException {
          final PhylogenyNode new_node = new PhylogenyNode();
          if ( getCladeLevel() == 0 ) {
              if ( getCurrentPhylogeny() != null ) {
@@ -623,21 +636,20 @@ public final class NHXParser implements PhylogenyParser {
          return true;
      }
  
-    private static void moveBranchLengthsToBootstrapValues( final Phylogeny p ) {
+    private static void moveBranchLengthsToConfidenceValues( final Phylogeny p ) { // For every node of p (postorder), hands the node's distance-to-parent to PhylogenyMethods.setBootstrapConfidence.
          final PhylogenyNodeIterator it = p.iteratorPostorder();
          while ( it.hasNext() ) {
              final PhylogenyNode n = it.next();
              PhylogenyMethods.setBootstrapConfidence( n, n.getDistanceToParent() );
-            n.setDistanceToParent( PhylogenyNode.DISTANCE_DEFAULT );
+            n.setDistanceToParent( PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ); // the value now lives in the confidence, so the branch length is reset to the default
          }
      }
  
      public static void parseNHX( String s,
                                   final PhylogenyNode node_to_annotate,
                                   final PhylogenyMethods.TAXONOMY_EXTRACTION taxonomy_extraction,
-                                 final boolean replace_underscores ) throws NHXFormatException {
-        System.out.println( s );
-        System.out.println();
+                                 final boolean replace_underscores ) throws NHXFormatException,
+            PhyloXmlDataFormatException {
          if ( ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) && replace_underscores ) {
              throw new IllegalArgumentException( "cannot extract taxonomies and replace under scores at the same time" );
          }
@@ -645,18 +657,12 @@ public final class NHXParser implements PhylogenyParser {
              if ( replace_underscores ) {
                  s = s.replaceAll( "_+", " " );
              }
-            int ob = 0;
-            int cb = 0;
-            String a = "";
-            String b = "";
-            StringTokenizer t = null;
              boolean is_nhx = false;
-            ob = s.indexOf( "[" );
-            cb = s.indexOf( "]" );
+            final int ob = s.indexOf( "[" );
              if ( ob > -1 ) {
-                a = "";
-                b = "";
+                String b = "";
                  is_nhx = true;
+                final int cb = s.indexOf( "]" );
                  if ( cb < 0 ) {
                      throw new NHXFormatException( "error in NHX formatted data: no closing \"]\" in \"" + s + "\"" );
                  }
@@ -670,23 +676,23 @@ public final class NHXParser implements PhylogenyParser {
                      if ( numbers_only.matches() ) {
                          b = ":" + NHXtags.SUPPORT + bracketed;
                      }
+                    else if ( s.indexOf( "prob=" ) > -1 ) {
+                        processMrBayes3Data( s, node_to_annotate );
+                    }
                  }
-                a = s.substring( 0, ob );
-                s = a + b;
+                s = s.substring( 0, ob ) + b;
                  if ( ( s.indexOf( "[" ) > -1 ) || ( s.indexOf( "]" ) > -1 ) ) {
                      throw new NHXFormatException( "error in NHX formatted data: more than one \"]\" or \"[\"" );
                  }
              }
-            t = new StringTokenizer( s, ":" );
+            final StringTokenizer t = new StringTokenizer( s, ":" );
              if ( t.countTokens() > 0 ) {
                  if ( !s.startsWith( ":" ) ) {
                      node_to_annotate.setName( t.nextToken() );
                      if ( !replace_underscores
                              && ( !is_nhx && ( taxonomy_extraction != PhylogenyMethods.TAXONOMY_EXTRACTION.NO ) ) ) {
-                        final String tax = ParserUtils
-                                .extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
-                                                                  LIMIT_SPECIES_NAMES_TO_FIVE_CHARS,
-                                                                  taxonomy_extraction );
+                        final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node_to_annotate.getName(),
+                                                                                        taxonomy_extraction );
                          if ( !ForesterUtil.isEmpty( tax ) ) {
                              if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
                                  node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
@@ -697,8 +703,6 @@ public final class NHXParser implements PhylogenyParser {
                  }
                  while ( t.hasMoreTokens() ) {
                      s = t.nextToken();
-                    System.out.println( "=>" + s );
-                    System.out.println();
                      if ( s.startsWith( org.forester.io.parsers.nhx.NHXtags.SPECIES_NAME ) ) {
                          if ( !node_to_annotate.getNodeData().isHasTaxonomy() ) {
                              node_to_annotate.getNodeData().setTaxonomy( new Taxonomy() );
@@ -781,7 +785,7 @@ public final class NHXParser implements PhylogenyParser {
                          node_to_annotate.getNodeData().getSequence().setName( s.substring( 2 ) );
                      }
                      else if ( s.indexOf( '=' ) < 0 ) {
-                        if ( node_to_annotate.getDistanceToParent() != PhylogenyNode.DISTANCE_DEFAULT ) {
+                        if ( node_to_annotate.getDistanceToParent() != PhylogenyDataUtil.BRANCH_LENGTH_DEFAULT ) {
                              throw new NHXFormatException( "error in NHX formatted data: more than one distance to parent:"
                                      + "\"" + s + "\"" );
                          }
@@ -792,6 +796,54 @@ public final class NHXParser implements PhylogenyParser {
          }
      }
  
+    private static void processMrBayes3Data( final String s, final PhylogenyNode node_to_annotate )
+            throws NHXFormatException { // Extracts "prob=", "prob_stddev=" and "length_median=" values from s and stores them on node_to_annotate; throws NHXFormatException if a matched value is not a parsable double.
+        double sd = -1; // -1 means "no standard deviation found"; only sd >= 0.0 is handed to Confidence below
+        final Matcher mb_prob_sd_matcher = MB_PROB_SD_PATTERN.matcher( s );
+        if ( mb_prob_sd_matcher.find() ) { // first "prob_stddev=<value>" in s; <value> is everything up to the next comma
+            try {
+                sd = Double.parseDouble( mb_prob_sd_matcher.group( 1 ) );
+            }
+            catch ( final NumberFormatException e ) { // reported as NHXFormatException; e itself is not chained as the cause
+                throw new NHXFormatException( "failed to parse probability standard deviation (Mr Bayes output) from \""
+                        + s + "\"" );
+            }
+        }
+        final Matcher mb_prob_matcher = MB_PROB_PATTERN.matcher( s );
+        if ( mb_prob_matcher.find() ) { // NOTE(review): the "[^,]+" capture would include a trailing "]" if this entry is the last one inside the brackets - confirm against real Mr Bayes output
+            double prob = -1;
+            try {
+                prob = Double.parseDouble( mb_prob_matcher.group( 1 ) );
+            }
+            catch ( final NumberFormatException e ) {
+                throw new NHXFormatException( "failed to parse probability (Mr Bayes output) from \"" + s + "\"" );
+            }
+            if ( prob >= 0.0 ) { // a negative probability is silently ignored, no confidence is added
+                if ( sd >= 0.0 ) {
+                    node_to_annotate.getBranchData()
+                            .addConfidence( new Confidence( prob, "posterior probability", sd ) ); // sd was found: use the three-argument Confidence constructor
+                }
+                else {
+                    node_to_annotate.getBranchData().addConfidence( new Confidence( prob, "posterior probability" ) ); // no usable sd: value and type only
+                }
+            }
+        }
+        final Matcher mb_bl_matcher = MB_BL_PATTERN.matcher( s );
+        if ( mb_bl_matcher.find() ) { // first "length_median=<value>" in s
+            double bl = -1;
+            try {
+                bl = Double.parseDouble( mb_bl_matcher.group( 1 ) );
+            }
+            catch ( final NumberFormatException e ) {
+                throw new NHXFormatException( "failed to parse median branch length (Mr Bayes output) from \"" + s
+                        + "\"" );
+            }
+            if ( bl >= 0.0 ) { // a negative median leaves the node's distance to parent untouched
+                node_to_annotate.setDistanceToParent( bl );
+            }
+        }
+    }
+
      /**
       * Parses String s in the format r.g.b (e.g. "12.34.234" ) into red, green,
       * and blue and returns the corresponding Color.