in progress

[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
diff --git a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java

index 4ba9ca9..6ff0439 100644 (file)
--- a/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java
+++ b/forester/java/src/org/forester/io/parsers/nexus/NexusPhylogeniesParser.java
@@ -35,7 +35,7 @@ import java.util.Map;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  
-import org.forester.archaeopteryx.Constants;
+import org.forester.archaeopteryx.AptxConstants;
  import org.forester.io.parsers.IteratingPhylogenyParser;
  import org.forester.io.parsers.PhylogenyParser;
  import org.forester.io.parsers.nhx.NHXFormatException;
@@ -45,38 +45,66 @@ import org.forester.io.parsers.util.ParserUtils;
  import org.forester.io.parsers.util.PhylogenyParserException;
  import org.forester.phylogeny.Phylogeny;
  import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Sequence;
  import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.MolecularSequence;
+import org.forester.util.ForesterConstants;
  import org.forester.util.ForesterUtil;
  
  public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
  
-    final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
-    final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
-    final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
-    final private static String  tree                      = NexusConstants.TREE.toLowerCase();
-    final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
-    final private static String  end                       = NexusConstants.END.toLowerCase();
-    final private static String  endblock                  = "endblock";
-    final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
-                                                                              Pattern.CASE_INSENSITIVE );
-    final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
-    private Object               _nexus_source;
-    private List<String>         _taxlabels;
-    private Map<String, String>  _translate_map;
-    private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
-    private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
-    private TAXONOMY_EXTRACTION  _taxonomy_extraction      = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
-    private Phylogeny            _next;
-    private BufferedReader       _br;
-    private boolean              _in_trees_block;
-    private StringBuilder        _nh;
-    private String               _name;
-    private StringBuilder        _translate_sb;
-    private boolean              _in_taxalabels;
-    private boolean              _in_translate;
-    private boolean              _is_rooted;
-    private boolean              _rooted_info_present;
-    private boolean              _in_tree;
+   
+    final private static boolean DEBUG                               = false;
+    
+    final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
+    final private static String            end                       = NexusConstants.END.toLowerCase();
+    final private static String            endblock                  = "endblock";
+    final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
+    final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
+    final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
+    final private static String            data                      = NexusConstants.BEGIN_DATA.toLowerCase(); // lower-cased NexusConstants.BEGIN_DATA; was crossed with 'characters'
+    final private static String            characters                = NexusConstants.BEGIN_CHARACTERS.toLowerCase(); // lower-cased NexusConstants.BEGIN_CHARACTERS; both prefixes are OR-ed when the data block start is detected
+    final private static String            tree                      = NexusConstants.TREE.toLowerCase();
+    final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" );
+    final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" );
+    final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" );
+    //final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
+    //                                                                                    Pattern.CASE_INSENSITIVE );
+    final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
+    private BufferedReader                 _br;
+    private boolean                        _ignore_quotes_in_nh_data = AptxConstants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
+    private boolean                        _in_taxalabels;
+    private boolean                        _in_translate;
+    private boolean                        _in_tree;
+    private boolean                        _in_trees_block;
+    private boolean                        _in_data_block;
+    private boolean                        _is_rooted;
+    private String                         _datatype;
+    private String                         _name;
+    private Phylogeny                      _next;
+    private Object                         _nexus_source;
+    private StringBuilder                  _nh;
+    private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
+    private boolean                        _rooted_info_present;
+    private List<String>                   _taxlabels;
+    private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
+    private String                         _title;
+    private Map<String, String>            _translate_map;
+    private StringBuilder                  _translate_sb;
+    private Map<String, MolecularSequence> _seqs;
+    private final boolean                  _add_sequences            = true;
+    private boolean                       _parse_beast_style_extended_tags           = false;
+           
+    /** Returns the constant, human-readable name of this parser. */
+    @Override
+    public String getName() {
+        return "Nexus Phylogenies Parser";
+    }
  
      @Override
      public final boolean hasNext() {
@@ -92,7 +120,6 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
  
      @Override
      public final Phylogeny[] parse() throws IOException {
-        reset();
          final List<Phylogeny> l = new ArrayList<Phylogeny>();
          while ( hasNext() ) {
              l.add( next() );
@@ -101,6 +128,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
          for( int i = 0; i < l.size(); ++i ) {
              p[ i ] = l.get( i );
          }
+        reset();
          return p;
      }
  
@@ -110,7 +138,8 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
          _translate_map = new HashMap<String, String>();
          _nh = new StringBuilder();
          _name = "";
-        _translate_sb = new StringBuilder();
+        _title = "";
+        _translate_sb = null;
          _next = null;
          _in_trees_block = false;
          _in_taxalabels = false;
@@ -118,7 +147,8 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
          _in_tree = false;
          _rooted_info_present = false;
          _is_rooted = false;
-        _br = ParserUtils.createReader( _nexus_source );
+        _seqs = new HashMap<String, MolecularSequence>();
+        _br = ParserUtils.createReader( _nexus_source, ForesterConstants.UTF_8 );
          getNext();
      }
  
@@ -143,31 +173,38 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
          _taxonomy_extraction = taxonomy_extraction;
      }
  
-    private final void createPhylogeny( final String name,
+    private final void createPhylogeny( final String title,
+                                        final String name,
                                          final StringBuilder nhx,
                                          final boolean rooted_info_present,
                                          final boolean is_rooted ) throws IOException {
          _next = null;
          final NHXParser pars = new NHXParser();
-        if ( ( _taxlabels.size() < 1 ) && ( _translate_map.size() < 1 ) ) {
-            pars.setTaxonomyExtraction( _taxonomy_extraction );
-            pars.setReplaceUnderscores( _replace_underscores );
-            pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
-        }
-        else {
-            pars.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO );
-            pars.setReplaceUnderscores( false );
-            pars.setIgnoreQuotes( false );
-        }
+        pars.setTaxonomyExtraction( _taxonomy_extraction );
+        pars.setReplaceUnderscores( _replace_underscores );
+        pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
+        pars.setParseBeastStyleExtendedTags( _parse_beast_style_extended_tags );
          if ( rooted_info_present ) {
              pars.setGuessRootedness( false );
          }
-        pars.setSource( nhx );
+        pars.setSource( nhx.toString() );
          final Phylogeny p = pars.next();
          if ( p == null ) {
              throw new PhylogenyParserException( "failed to create phylogeny" );
          }
-        p.setName( name );
+        String myname = null;
+        if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
+            myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
+        }
+        else if ( !ForesterUtil.isEmpty( title ) ) {
+            myname = title.replace( '_', ' ' ).trim();
+        }
+        else if ( !ForesterUtil.isEmpty( name ) ) {
+            myname = name.trim();
+        }
+        if ( !ForesterUtil.isEmpty( myname ) ) {
+            p.setName( myname );
+        }
          if ( rooted_info_present ) {
              p.setRooted( is_rooted );
          }
@@ -192,14 +229,20 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                  }
                  if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
                      ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
-                    //                    final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
-                    //                                                                                    getTaxonomyExtraction() );
-                    //                    if ( !ForesterUtil.isEmpty( tax ) ) {
-                    //                        if ( !node.getNodeData().isHasTaxonomy() ) {
-                    //                            node.getNodeData().setTaxonomy( new Taxonomy() );
-                    //                        }
-                    //                        node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
-                    //                    }
+                }
+                else if ( _replace_underscores ) {
+                    if ( !ForesterUtil.isEmpty( node.getName() ) ) {
+                        node.setName( node.getName().replace( '_', ' ' ).trim() );
+                    }
+                }
+                if ( _add_sequences ) {
+                    if ( _seqs.containsKey( node.getName() ) ) {
+                        final MolecularSequence s = _seqs.get( node.getName() );
+                        //TODO need to check for uniqueness when adding seqs....
+                        final Sequence ns = new Sequence( s );
+                        ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
+                        node.getNodeData().addSequence( ns );
+                    }
                  }
              }
          }
@@ -210,6 +253,9 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
          _next = null;
          String line;
          while ( ( line = _br.readLine() ) != null ) {
+            if ( DEBUG ) {
+                System.out.println( line );
+            }
              line = line.trim();
              if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
                  line = ForesterUtil.collapseWhiteSpace( line );
@@ -219,27 +265,51 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                      _in_trees_block = true;
                      _in_taxalabels = false;
                      _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
+                    _title = "";
                  }
                  else if ( line_lc.startsWith( taxlabels ) ) {
+                    //TODO need to be taxa block instead
                      _in_trees_block = false;
                      _in_taxalabels = true;
                      _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
                  }
                  else if ( line_lc.startsWith( translate ) ) {
+                    _translate_sb = new StringBuilder();
                      _in_taxalabels = false;
                      _in_translate = true;
+                    _in_data_block = false;
+                    _datatype = null;
+                }
+                else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
+                    _in_taxalabels = false;
+                    _in_trees_block = false;
+                    _in_translate = false;
+                    _in_data_block = true;
+                    _datatype = null;
                  }
                  else if ( _in_trees_block ) {
-                    //FIXME TODO need to work on this "title" and "link"
-                    if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
-                        // Do nothing.
+                    if ( line_lc.startsWith( "title" ) ) {
+                        final Matcher title_m = TITLE_PATTERN.matcher( line );
+                        if ( title_m.lookingAt() ) {
+                            _title = title_m.group( 1 );
+                        }
+                    }
+                    else if ( line_lc.startsWith( "link" ) ) {
+                        //final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                        //if ( link_m.lookingAt() ) {
+                            //final String link = link_m.group( 1 );  //TODO why?
+                       // }
                      }
                      else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
                          _in_trees_block = false;
                          _in_tree = false;
                          _in_translate = false;
                          if ( _nh.length() > 0 ) {
-                            createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+                            createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
                              _nh = new StringBuilder();
                              _name = "";
                              _rooted_info_present = false;
@@ -253,7 +323,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                          boolean might = false;
                          if ( _nh.length() > 0 ) {
                              might = true;
-                            createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+                            createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
                              _nh = new StringBuilder();
                              _name = "";
                              _rooted_info_present = false;
@@ -286,7 +356,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                              && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
                          _in_tree = false;
                          _in_translate = false;
-                        createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+                        createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
                          _nh = new StringBuilder();
                          _name = "";
                          _rooted_info_present = false;
@@ -328,10 +398,54 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                          }
                      }
                  }
+                if ( _in_data_block ) {
+                    if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
+                        _in_data_block = false;
+                        _datatype = null;
+                    }
+                    else if ( line_lc.startsWith( "link" ) ) {
+                     //   final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                     //   if ( link_m.lookingAt() ) {
+                     //       final String link = link_m.group( 1 );
+                     //   }
+                    }
+                    else {
+                        final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
+                        if ( datatype_matcher.find() ) {
+                            _datatype = datatype_matcher.group( 1 );
+                        }
+                        else {
+                            if ( ( _datatype != null )
+                                    && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
+                                            .equals( "rna" ) ) ) {
+                                if ( line.endsWith( ";" ) ) {
+                                    _in_data_block = false;
+                                    line = line.substring( 0, line.length() - 1 );
+                                }
+                                final Matcher aln_matcher = ALN_PATTERN.matcher( line );
+                                if ( aln_matcher.matches() ) {
+                                    final String id = aln_matcher.group( 1 );
+                                    final String seq = aln_matcher.group( 2 );
+                                    MolecularSequence s = null;
+                                    if ( _datatype.equals( "protein" ) ) {
+                                        s = BasicSequence.createAaSequence( id, seq );
+                                    }
+                                    else if ( _datatype.equals( "dna" ) ) {
+                                        s = BasicSequence.createDnaSequence( id, seq );
+                                    }
+                                    else {
+                                        s = BasicSequence.createRnaSequence( id, seq );
+                                    }
+                                    _seqs.put( id, s );
+                                }
+                            }
+                        }
+                    }
+                }
              }
          }
          if ( _nh.length() > 0 ) {
-            createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+            createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
              if ( _next != null ) {
                  return;
              }
@@ -343,23 +457,20 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
          if ( s.endsWith( ";" ) ) {
              s = s.substring( 0, s.length() - 1 ).trim();
          }
-        for( final String pair : s.split( "," ) ) {
-            final String[] kv = pair.trim().split( "\\s+" );
-            if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
-                throw new IOException( "ill-formatted translate values: " + translate_sb );
-            }
-            if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
-                throw new IOException( "ill-formatted translate values: " + translate_sb );
-            }
+        for( String pair : s.split( "," ) ) {
              String key = "";
              String value = "";
-            if ( kv.length == 3 ) {
-                key = kv[ 1 ];
-                value = kv[ 2 ];
+            final int ti = pair.toLowerCase().indexOf( "translate" );
+            if ( ti > -1 ) {
+                pair = pair.substring( ti + 9 );
+            }
+            final Matcher m = TRANSLATE_PATTERN.matcher( pair );
+            if ( m.find() ) {
+                key = m.group( 1 );
+                value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
              }
              else {
-                key = kv[ 0 ];
-                value = kv[ 1 ];
+                throw new IOException( "ill-formatted translate values: " + pair );
              }
              if ( value.endsWith( ";" ) ) {
                  value = value.substring( 0, value.length() - 1 );
@@ -367,7 +478,11 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
              _translate_map.put( key, value );
          }
      }
-
+    /** Stores the BEAST-style-extended-tags flag; it is passed on to the NHXParser created for each tree. */
+    public final void setParseBeastStyleExtendedTags( final boolean parse_beast_style_extended_tags ) {
+        this._parse_beast_style_extended_tags = parse_beast_style_extended_tags;
+    }
+    
      private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
          return s.replaceAll( "\\s+;", ";" );
      }