in progress
[jalview.git] / forester / java / src / org / forester / io / parsers / nexus / NexusPhylogeniesParser.java
index 4ba9ca9..6ff0439 100644 (file)
@@ -35,7 +35,7 @@ import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.forester.archaeopteryx.Constants;
+import org.forester.archaeopteryx.AptxConstants;
 import org.forester.io.parsers.IteratingPhylogenyParser;
 import org.forester.io.parsers.PhylogenyParser;
 import org.forester.io.parsers.nhx.NHXFormatException;
@@ -45,38 +45,66 @@ import org.forester.io.parsers.util.ParserUtils;
 import org.forester.io.parsers.util.PhylogenyParserException;
 import org.forester.phylogeny.Phylogeny;
 import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Sequence;
 import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
+import org.forester.sequence.BasicSequence;
+import org.forester.sequence.MolecularSequence;
+import org.forester.util.ForesterConstants;
 import org.forester.util.ForesterUtil;
 
 public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, PhylogenyParser {
 
-    final private static String  begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
-    final private static String  taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
-    final private static String  translate                 = NexusConstants.TRANSLATE.toLowerCase();
-    final private static String  tree                      = NexusConstants.TREE.toLowerCase();
-    final private static String  utree                     = NexusConstants.UTREE.toLowerCase();
-    final private static String  end                       = NexusConstants.END.toLowerCase();
-    final private static String  endblock                  = "endblock";
-    final private static Pattern TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
-                                                                              Pattern.CASE_INSENSITIVE );
-    final private static Pattern ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" );
-    private Object               _nexus_source;
-    private List<String>         _taxlabels;
-    private Map<String, String>  _translate_map;
-    private boolean              _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
-    private boolean              _ignore_quotes_in_nh_data = Constants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
-    private TAXONOMY_EXTRACTION  _taxonomy_extraction      = NHXParser.TAXONOMY_EXTRACTION_DEFAULT;
-    private Phylogeny            _next;
-    private BufferedReader       _br;
-    private boolean              _in_trees_block;
-    private StringBuilder        _nh;
-    private String               _name;
-    private StringBuilder        _translate_sb;
-    private boolean              _in_taxalabels;
-    private boolean              _in_translate;
-    private boolean              _is_rooted;
-    private boolean              _rooted_info_present;
-    private boolean              _in_tree;
+   
+    final private static boolean DEBUG                               = false; // when true, the line-reading loop echoes every raw input line to System.out
+    
+    final private static String            begin_trees               = NexusConstants.BEGIN_TREES.toLowerCase();
+    final private static String            end                       = NexusConstants.END.toLowerCase();
+    final private static String            endblock                  = "endblock"; // accepted alongside "end" as a block terminator
+    final private static Pattern           ROOTEDNESS_PATTERN        = Pattern.compile( ".+=\\s*\\[&([R|U])\\].*" ); // NOTE(review): inside [...] the '|' is a literal, so group 1 can be "R", "U" or "|" -- presumably [RU] was intended; confirm
+    final private static String            taxlabels                 = NexusConstants.TAXLABELS.toLowerCase();
+    final private static Pattern           TITLE_PATTERN             = Pattern.compile( "TITLE.?\\s+([^;]+)",
+                                                                                        Pattern.CASE_INSENSITIVE ); // group 1: text following "TITLE", up to but excluding the next ';'
+    final private static String            translate                 = NexusConstants.TRANSLATE.toLowerCase();
+    final private static String            data                      = NexusConstants.BEGIN_CHARACTERS.toLowerCase(); // NOTE(review): named "data" but holds BEGIN_CHARACTERS (and "characters" holds BEGIN_DATA); both are tested together when opening the data block, so the swap has no visible effect
+    final private static String            characters                = NexusConstants.BEGIN_DATA.toLowerCase();
+    final private static String            tree                      = NexusConstants.TREE.toLowerCase();
+    final private static Pattern           TREE_NAME_PATTERN         = Pattern.compile( "\\s*.?Tree\\s+(.+?)\\s*=.+",
+                                                                                        Pattern.CASE_INSENSITIVE );
+    final private static Pattern           TRANSLATE_PATTERN         = Pattern.compile( "([0-9A-Za-z]+)\\s+(.+)" ); // group 1: translate key, group 2: the label the key maps to
+    final private static Pattern           ALN_PATTERN               = Pattern.compile( "(.+)\\s+([A-Za-z-_\\*\\?]+)" ); // group 1: sequence id, group 2: sequence string of one data-block row
+    final private static Pattern           DATATYPE_PATTERN          = Pattern.compile( "datatype\\s?.\\s?([a-z]+)" ); // matched against line_lc; group 1: the datatype name
+    //final private static Pattern           LINK_TAXA_PATTERN         = Pattern.compile( "link\\s+taxa\\s?.\\s?([^;]+)",
+    //                                                                                    Pattern.CASE_INSENSITIVE );
+    final private static String            utree                     = NexusConstants.UTREE.toLowerCase();
+    private BufferedReader                 _br;
+    private boolean                        _ignore_quotes_in_nh_data = AptxConstants.NH_PARSING_IGNORE_QUOTES_DEFAULT;
+    private boolean                        _in_taxalabels;
+    private boolean                        _in_translate;
+    private boolean                        _in_tree;
+    private boolean                        _in_trees_block;
+    private boolean                        _in_data_block; // true while inside a block opened by the "data"/"characters" keywords
+    private boolean                        _is_rooted;
+    private String                         _datatype; // datatype matched in the current data block; only "protein", "dna" and "rna" lead to sequence parsing
+    private String                         _name;
+    private Phylogeny                      _next;
+    private Object                         _nexus_source;
+    private StringBuilder                  _nh;
+    private boolean                        _replace_underscores      = NHXParser.REPLACE_UNDERSCORES_DEFAULT;
+    private boolean                        _rooted_info_present;
+    private List<String>                   _taxlabels;
+    private TAXONOMY_EXTRACTION            _taxonomy_extraction      = TAXONOMY_EXTRACTION.NO;
+    private String                         _title; // TITLE text captured inside a trees block; createPhylogeny() combines it with the tree name
+    private Map<String, String>            _translate_map;
+    private StringBuilder                  _translate_sb; // null after reset(); allocated when a translate command is encountered
+    private Map<String, MolecularSequence> _seqs; // sequences read from the data block, keyed by sequence id and looked up by node name
+    private final boolean                  _add_sequences            = true; // constant switch: attach matching _seqs entries to tree nodes
+    private boolean                       _parse_beast_style_extended_tags           = false; // forwarded to NHXParser.setParseBeastStyleExtendedTags()
+           
+
+    @Override
+    public String getName() { // reports the fixed, human-readable name of this parser
+        return "Nexus Phylogenies Parser";
+    }
 
     @Override
     public final boolean hasNext() {
@@ -92,7 +120,6 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
 
     @Override
     public final Phylogeny[] parse() throws IOException {
-        reset();
         final List<Phylogeny> l = new ArrayList<Phylogeny>();
         while ( hasNext() ) {
             l.add( next() );
@@ -101,6 +128,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         for( int i = 0; i < l.size(); ++i ) {
             p[ i ] = l.get( i );
         }
+        reset();
         return p;
     }
 
@@ -110,7 +138,8 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _translate_map = new HashMap<String, String>();
         _nh = new StringBuilder();
         _name = "";
-        _translate_sb = new StringBuilder();
+        _title = "";
+        _translate_sb = null;
         _next = null;
         _in_trees_block = false;
         _in_taxalabels = false;
@@ -118,7 +147,8 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _in_tree = false;
         _rooted_info_present = false;
         _is_rooted = false;
-        _br = ParserUtils.createReader( _nexus_source );
+        _seqs = new HashMap<String, MolecularSequence>();
+        _br = ParserUtils.createReader( _nexus_source, ForesterConstants.UTF_8 );
         getNext();
     }
 
@@ -143,31 +173,38 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _taxonomy_extraction = taxonomy_extraction;
     }
 
-    private final void createPhylogeny( final String name,
+    private final void createPhylogeny( final String title,
+                                        final String name,
                                         final StringBuilder nhx,
                                         final boolean rooted_info_present,
                                         final boolean is_rooted ) throws IOException {
         _next = null;
         final NHXParser pars = new NHXParser();
-        if ( ( _taxlabels.size() < 1 ) && ( _translate_map.size() < 1 ) ) {
-            pars.setTaxonomyExtraction( _taxonomy_extraction );
-            pars.setReplaceUnderscores( _replace_underscores );
-            pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
-        }
-        else {
-            pars.setTaxonomyExtraction( TAXONOMY_EXTRACTION.NO );
-            pars.setReplaceUnderscores( false );
-            pars.setIgnoreQuotes( false );
-        }
+        pars.setTaxonomyExtraction( _taxonomy_extraction );
+        pars.setReplaceUnderscores( _replace_underscores );
+        pars.setIgnoreQuotes( _ignore_quotes_in_nh_data );
+        pars.setParseBeastStyleExtendedTags( _parse_beast_style_extended_tags );
         if ( rooted_info_present ) {
             pars.setGuessRootedness( false );
         }
-        pars.setSource( nhx );
+        pars.setSource( nhx.toString() );
         final Phylogeny p = pars.next();
         if ( p == null ) {
             throw new PhylogenyParserException( "failed to create phylogeny" );
         }
-        p.setName( name );
+        String myname = null;
+        if ( !ForesterUtil.isEmpty( title ) && !ForesterUtil.isEmpty( name ) ) {
+            myname = title.replace( '_', ' ' ).trim() + " (" + name.trim() + ")";
+        }
+        else if ( !ForesterUtil.isEmpty( title ) ) {
+            myname = title.replace( '_', ' ' ).trim();
+        }
+        else if ( !ForesterUtil.isEmpty( name ) ) {
+            myname = name.trim();
+        }
+        if ( !ForesterUtil.isEmpty( myname ) ) {
+            p.setName( myname );
+        }
         if ( rooted_info_present ) {
             p.setRooted( is_rooted );
         }
@@ -192,14 +229,20 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                 }
                 if ( !_replace_underscores && ( ( _taxonomy_extraction != TAXONOMY_EXTRACTION.NO ) ) ) {
                     ParserUtils.extractTaxonomyDataFromNodeName( node, _taxonomy_extraction );
-                    //                    final String tax = ParserUtils.extractTaxonomyCodeFromNodeName( node.getName(),
-                    //                                                                                    getTaxonomyExtraction() );
-                    //                    if ( !ForesterUtil.isEmpty( tax ) ) {
-                    //                        if ( !node.getNodeData().isHasTaxonomy() ) {
-                    //                            node.getNodeData().setTaxonomy( new Taxonomy() );
-                    //                        }
-                    //                        node.getNodeData().getTaxonomy().setTaxonomyCode( tax );
-                    //                    }
+                }
+                else if ( _replace_underscores ) {
+                    if ( !ForesterUtil.isEmpty( node.getName() ) ) {
+                        node.setName( node.getName().replace( '_', ' ' ).trim() );
+                    }
+                }
+                if ( _add_sequences ) {
+                    if ( _seqs.containsKey( node.getName() ) ) {
+                        final MolecularSequence s = _seqs.get( node.getName() );
+                        //TODO need to check for uniqueness when adding seqs....
+                        final Sequence ns = new Sequence( s );
+                        ns.setMolecularSequenceAligned( true ); //TODO need to check if all same length
+                        node.getNodeData().addSequence( ns );
+                    }
                 }
             }
         }
@@ -210,6 +253,9 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         _next = null;
         String line;
         while ( ( line = _br.readLine() ) != null ) {
+            if ( DEBUG ) {
+                System.out.println( line );
+            }
             line = line.trim();
             if ( ( line.length() > 0 ) && !line.startsWith( "#" ) && !line.startsWith( ">" ) ) {
                 line = ForesterUtil.collapseWhiteSpace( line );
@@ -219,27 +265,51 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                     _in_trees_block = true;
                     _in_taxalabels = false;
                     _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
+                    _title = "";
                 }
                 else if ( line_lc.startsWith( taxlabels ) ) {
+                    //TODO need to be taxa block instead
                     _in_trees_block = false;
                     _in_taxalabels = true;
                     _in_translate = false;
+                    _in_data_block = false;
+                    _datatype = null;
                 }
                 else if ( line_lc.startsWith( translate ) ) {
+                    _translate_sb = new StringBuilder();
                     _in_taxalabels = false;
                     _in_translate = true;
+                    _in_data_block = false;
+                    _datatype = null;
+                }
+                else if ( line_lc.startsWith( characters ) || line_lc.startsWith( data ) ) {
+                    _in_taxalabels = false;
+                    _in_trees_block = false;
+                    _in_translate = false;
+                    _in_data_block = true;
+                    _datatype = null;
                 }
                 else if ( _in_trees_block ) {
-                    //FIXME TODO need to work on this "title" and "link"
-                    if ( line_lc.startsWith( "title" ) || line_lc.startsWith( "link" ) ) {
-                        // Do nothing.
+                    if ( line_lc.startsWith( "title" ) ) {
+                        final Matcher title_m = TITLE_PATTERN.matcher( line );
+                        if ( title_m.lookingAt() ) {
+                            _title = title_m.group( 1 );
+                        }
+                    }
+                    else if ( line_lc.startsWith( "link" ) ) {
+                        //final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                        //if ( link_m.lookingAt() ) {
+                            //final String link = link_m.group( 1 );  //TODO why?
+                       // }
                     }
                     else if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
                         _in_trees_block = false;
                         _in_tree = false;
                         _in_translate = false;
                         if ( _nh.length() > 0 ) {
-                            createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+                            createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
                             _nh = new StringBuilder();
                             _name = "";
                             _rooted_info_present = false;
@@ -253,7 +323,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         boolean might = false;
                         if ( _nh.length() > 0 ) {
                             might = true;
-                            createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+                            createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
                             _nh = new StringBuilder();
                             _name = "";
                             _rooted_info_present = false;
@@ -286,7 +356,7 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                             && !line_lc.startsWith( end ) && !line_lc.startsWith( endblock ) && line_lc.endsWith( ";" ) ) {
                         _in_tree = false;
                         _in_translate = false;
-                        createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+                        createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
                         _nh = new StringBuilder();
                         _name = "";
                         _rooted_info_present = false;
@@ -328,10 +398,54 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
                         }
                     }
                 }
+                if ( _in_data_block ) {
+                    if ( line_lc.startsWith( end ) || line_lc.startsWith( endblock ) ) {
+                        _in_data_block = false;
+                        _datatype = null;
+                    }
+                    else if ( line_lc.startsWith( "link" ) ) {
+                     //   final Matcher link_m = LINK_TAXA_PATTERN.matcher( line );
+                     //   if ( link_m.lookingAt() ) {
+                     //       final String link = link_m.group( 1 );
+                     //   }
+                    }
+                    else {
+                        final Matcher datatype_matcher = DATATYPE_PATTERN.matcher( line_lc );
+                        if ( datatype_matcher.find() ) {
+                            _datatype = datatype_matcher.group( 1 );
+                        }
+                        else {
+                            if ( ( _datatype != null )
+                                    && ( _datatype.equals( "protein" ) || _datatype.equals( "dna" ) || _datatype
+                                            .equals( "rna" ) ) ) {
+                                if ( line.endsWith( ";" ) ) {
+                                    _in_data_block = false;
+                                    line = line.substring( 0, line.length() - 1 );
+                                }
+                                final Matcher aln_matcher = ALN_PATTERN.matcher( line );
+                                if ( aln_matcher.matches() ) {
+                                    final String id = aln_matcher.group( 1 );
+                                    final String seq = aln_matcher.group( 2 );
+                                    MolecularSequence s = null;
+                                    if ( _datatype.equals( "protein" ) ) {
+                                        s = BasicSequence.createAaSequence( id, seq );
+                                    }
+                                    else if ( _datatype.equals( "dna" ) ) {
+                                        s = BasicSequence.createDnaSequence( id, seq );
+                                    }
+                                    else {
+                                        s = BasicSequence.createRnaSequence( id, seq );
+                                    }
+                                    _seqs.put( id, s );
+                                }
+                            }
+                        }
+                    }
+                }
             }
         }
         if ( _nh.length() > 0 ) {
-            createPhylogeny( _name, _nh, _rooted_info_present, _is_rooted );
+            createPhylogeny( _title, _name, _nh, _rooted_info_present, _is_rooted );
             if ( _next != null ) {
                 return;
             }
@@ -343,23 +457,20 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
         if ( s.endsWith( ";" ) ) {
             s = s.substring( 0, s.length() - 1 ).trim();
         }
-        for( final String pair : s.split( "," ) ) {
-            final String[] kv = pair.trim().split( "\\s+" );
-            if ( ( kv.length < 2 ) || ( kv.length > 3 ) ) {
-                throw new IOException( "ill-formatted translate values: " + translate_sb );
-            }
-            if ( ( kv.length == 3 ) && !kv[ 0 ].toLowerCase().trim().equals( translate ) ) {
-                throw new IOException( "ill-formatted translate values: " + translate_sb );
-            }
+        for( String pair : s.split( "," ) ) {
             String key = "";
             String value = "";
-            if ( kv.length == 3 ) {
-                key = kv[ 1 ];
-                value = kv[ 2 ];
+            final int ti = pair.toLowerCase().indexOf( "translate" );
+            if ( ti > -1 ) {
+                pair = pair.substring( ti + 9 );
+            }
+            final Matcher m = TRANSLATE_PATTERN.matcher( pair );
+            if ( m.find() ) {
+                key = m.group( 1 );
+                value = m.group( 2 ).replaceAll( "\'", "" ).replaceAll( "\"", "" ).trim();
             }
             else {
-                key = kv[ 0 ];
-                value = kv[ 1 ];
+                throw new IOException( "ill-formatted translate values: " + pair );
             }
             if ( value.endsWith( ";" ) ) {
                 value = value.substring( 0, value.length() - 1 );
@@ -367,7 +478,11 @@ public final class NexusPhylogeniesParser implements IteratingPhylogenyParser, P
             _translate_map.put( key, value );
         }
     }
-
+    
+    public final void setParseBeastStyleExtendedTags( final boolean parse_beast_style_extended_tags ) { // stores the flag that createPhylogeny() hands to each NHXParser it builds
+        _parse_beast_style_extended_tags = parse_beast_style_extended_tags;
+    }
+    
     private final static String removeWhiteSpaceBeforeSemicolon( final String s ) {
         return s.replaceAll( "\\s+;", ";" );
     }